# Plan

* read data
* transform it to format suitable for [graph_bet](https://github.com/deepmind/graph_nets/blob/master/graph_nets/graphs.py)
* make regression MLP & launch training

In [13]:
import pandas as pd
import numpy as np

from utils import load_csv_dataframe

In [2]:
train = load_csv_dataframe('input/train.csv.zip')
test = load_csv_dataframe('input/test.csv.zip')
sub = load_csv_dataframe('input/sample_submission.csv.zip')
struct = load_csv_dataframe('input/structures.csv.zip')

Memory usage of dataframe is 213.23 MB
Memory usage after optimization is: 60.90 MB
Decreased by 71.4%
Memory usage of dataframe is 95.58 MB
Memory usage after optimization is: 27.88 MB
Decreased by 70.8%
Memory usage of dataframe is 38.23 MB
Memory usage after optimization is: 11.95 MB
Decreased by 68.7%
Memory usage of dataframe is 107.97 MB
Memory usage after optimization is: 32.99 MB
Decreased by 69.4%


In [3]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125


In [4]:
struct["node_index"] = struct.index
struct.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,node_index
0,dsgdb9nsd_000001,0,C,-0.012695,1.085938,0.008003,0
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,1
2,dsgdb9nsd_000001,2,H,1.011719,1.463867,0.000277,2
3,dsgdb9nsd_000001,3,H,-0.541016,1.447266,-0.876465,3
4,dsgdb9nsd_000001,4,H,-0.523926,1.4375,0.90625,4


In [5]:
intermediate = pd.merge(train, struct,  how='left', left_on=['molecule_name', 'atom_index_0'], 
                        right_on = ['molecule_name', 'atom_index'], suffixes=('', '_0'))
intermediate = intermediate.drop("atom_index", axis=1)
intermediate.columns = [c if c not in struct.columns or c == 'molecule_name'  else c + "_0" 
                        for c in intermediate.columns ]
intermediate = pd.merge(intermediate, struct,  how='left', left_on=['molecule_name', 'atom_index_1'], 
                        right_on = ['molecule_name', 'atom_index'])
intermediate = intermediate.drop("atom_index", axis=1)
intermediate.columns = [c if c not in struct.columns or c == 'molecule_name'  else c + "_1" 
                        for c in intermediate.columns ]

intermediate.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,node_index_0,atom_1,x_1,y_1,z_1,node_index_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00215,-0.006031,0.001976,1,C,-0.012695,1.085938,0.008003,0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,1,H,1.011719,1.463867,0.000277,2
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,1,H,-0.541016,1.447266,-0.876465,3
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,1,H,-0.523926,1.4375,0.90625,4
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.011719,1.463867,0.000277,2,C,-0.012695,1.085938,0.008003,0


In [14]:
def add_column(df, column_name, postrfix):
    df[column_name + "_" + postrfix] = np.zeros(intermediate.shape[0])

for atom in ["H", "C", "N"]:
    add_column(intermediate, "atom_0", atom)
    intermediate["atom_0" + "_" + atom][intermediate["atom_0"] == atom] = 1
    add_column(intermediate, "atom_1", atom)
    intermediate["atom_1" + "_" + atom][intermediate["atom_1"] == atom] = 1
intermediate.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,...,x_1,y_1,z_1,node_index_1,atom_0_H,atom_1_H,atom_0_C,atom_1_C,atom_0_N,atom_1_N
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00215,-0.006031,0.001976,...,-0.012695,1.085938,0.008003,0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,...,1.011719,1.463867,0.000277,2,1.0,1.0,0.0,0.0,0.0,0.0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,...,-0.541016,1.447266,-0.876465,3,1.0,1.0,0.0,0.0,0.0,0.0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,...,-0.523926,1.4375,0.90625,4,1.0,1.0,0.0,0.0,0.0,0.0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.011719,1.463867,0.000277,...,-0.012695,1.085938,0.008003,0,1.0,0.0,0.0,1.0,0.0,0.0


In [10]:
len(intermediate["atom_1"].unique())
pd.unique(intermediate[["atom_1", "atom_0"]].values.ravel('K'))

array(['C', 'H', 'N'], dtype=object)

In [16]:
for edge_type in intermediate["type"].unique().tolist():
    add_column(intermediate, "type", edge_type)
    intermediate["type" + "_" + edge_type][intermediate["type"] == edge_type] = 1
intermediate.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,...,atom_0_N,atom_1_N,type_1JHC,type_2JHH,type_1JHN,type_2JHN,type_2JHC,type_3JHH,type_3JHC,type_3JHN
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00215,-0.006031,0.001976,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.011719,1.463867,0.000277,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.preprocessing import OrdinalEncoder


node_index = {}
node_features = []
for i, row in struct.iterrows():
    node_index[(row.molecule_name, row.atom_index)] = i

# TODO: 
* finish extraction of basic features for GNN
* modify basic GNN for regression problem

In [None]:
"""
  - N_NODE: The number of nodes per graph. It is a vector of integers with shape
    `[n_graphs]`, such that `graph.N_NODE[i]` is the number of nodes in the i-th
    graph.
  - N_EDGE: The number of edges per graph. It is a vector of integers with shape
    `[n_graphs]`, such that `graph.N_NODE[i]` is the number of edges in the i-th
    graph.
  - NODES: The nodes features. It is either `None` (the graph has no node
    features), or a vector of shape `[n_nodes] + node_shape`, where
    `n_nodes = sum(graph.N_NODE)` is the total number of nodes in the batch of
    graphs, and `node_shape` represents the shape of the features of each node.
    The relative index of a node from the batched version can be recovered from
    the `graph.N_NODE` property. For instance, the second node of the third
    graph will have its features in the
    `1 + graph.N_NODE[0] + graph.N_NODE[1]`-th slot of graph.NODES.
    Observe that having a `None` value for this field does not mean that the
    graphs have no nodes, only that they do not have features.
  - EDGES: The edges features. It is either `None` (the graph has no edge
    features), or a vector of shape `[n_edges] + edge_shape`, where
    `n_edges = sum(graph.N_EDGE)` is the total number of edges in the batch of
    graphs, and `edge_shape` represents the shape of the features of each edge.
    The relative index of an edge from the batched version can be recovered from
    the `graph.N_EDGE` property. For instance, the third edge of the third
    graph will have its features in the `2 + graph.N_EDGE[0] + graph.N_EDGE[1]`-
    th slot of graph.EDGES.
    Observe that having a `None` value for this field does not necessarily mean
    that the graph has no edges, only that they do not have features.
  - RECEIVERS: The indices of the receiver nodes, for each edge. It is either
    `None` (if the graph has no edges), or a vector of integers of shape
    `[n_edges]`, such that `graph.RECEIVERS[i]` is the index of the node
    receiving from the i-th edge.
    Observe that the index is absolute (in other words, cumulative), i.e.
    `graphs.RECEIVERS` take value in `[0, n_nodes]`. For instance, an edge
    connecting the vertices with relative indices 2 and 3 in the second graph of
    the batch would have a `RECEIVERS` value of `3 + graph.N_NODE[0]`.
    If `graphs.RECEIVERS` is `None`, then `graphs.EDGES` and `graphs.SENDERS`
    should also be `None`.
  - SENDERS
"""
n_nodes = struct.shape[0]
n_edge = intermediate.shape[0]
