In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os

In [2]:
# Directory that holds the input CSVs and receives the CSVs written by this notebook.
path = "../data"

In [3]:
# Read the train, test and structures tables from CSV files under `path`
# (same read order as before: train, test, structures).
train, test, structures = (
    pd.read_csv(f'{path}/train.csv'),
    pd.read_csv(f'{path}/test.csv'),
    pd.read_csv(f'{path}/structures.csv'),
)

In [4]:
def map_atom_info(df, atom_idx, structures_df=None):
    """Attach the element and coordinates of one atom of each pair to ``df``.

    Left-joins ``df`` with a structures table on
    ``(molecule_name, atom_index_{atom_idx})`` and renames the joined
    ``atom``/``x``/``y``/``z`` columns to ``atom_{atom_idx}``,
    ``x_{atom_idx}``, ``y_{atom_idx}`` and ``z_{atom_idx}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``molecule_name`` and ``atom_index_{atom_idx}``.
    atom_idx : int
        Suffix of the atom-index column to look up (the notebook uses 0 and 1).
    structures_df : pandas.DataFrame, optional
        Table with ``molecule_name``, ``atom_index``, ``atom``, ``x``, ``y``
        and ``z`` columns. When omitted, the notebook-level ``structures``
        frame is used, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Because the join is a
        left join, rows without a matching atom keep NaN in the new columns.
    """
    if structures_df is None:
        # Backward-compatible fallback to the frame loaded earlier in the notebook.
        structures_df = structures

    merged = pd.merge(df, structures_df, how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])

    # The right-hand join key duplicates atom_index_{atom_idx}; drop it.
    merged = merged.drop(columns='atom_index')

    # Suffix the per-atom columns so the function can be applied once per atom.
    return merged.rename(columns={'atom': f'atom_{atom_idx}',
                                  'x': f'x_{atom_idx}',
                                  'y': f'y_{atom_idx}',
                                  'z': f'z_{atom_idx}'})

# Join atom 0 first, then atom 1, onto each frame (train before test).
train = map_atom_info(map_atom_info(train, 0), 1)
test = map_atom_info(map_atom_info(test, 0), 1)

In [5]:
# Train: coordinates of both atoms of each pair, then their Euclidean distance.
train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)

# Test: same computation on the test pairs.
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)

In [6]:
# Independent copies of the rows whose coupling type is '1JHC'.
JHC_train = train.loc[train['type'] == '1JHC'].copy()
JHC_test = test.loc[test['type'] == '1JHC'].copy()

In [7]:
# Summary statistics of the coupling constant for 1JHC pairs with dist above 1.065.
JHC_train.loc[JHC_train['dist'] > 1.065, 'scalar_coupling_constant'].describe()

count    700156.000000
mean         93.636621
std          14.175342
min          66.600800
25%          83.974400
50%          88.070900
75%          99.365150
max         171.220000
Name: scalar_coupling_constant, dtype: float64

In [8]:
# Summary statistics of the coupling constant for 1JHC pairs with dist at or below 1.065.
JHC_train.loc[JHC_train['dist'] <= 1.065, 'scalar_coupling_constant'].describe()

count    9260.000000
mean      196.259228
std         2.279946
min       188.507000
25%       194.837000
50%       196.358000
75%       197.735250
max       204.880000
Name: scalar_coupling_constant, dtype: float64

In [9]:
# Ids of 1JHC rows on each side of the 1.065 distance cut-off:
# group 0 has dist > 1.065, group 1 has dist <= 1.065.
# Both comparisons are written out explicitly (no mask negation) so rows with
# a NaN distance land in neither group, exactly as before.
JHC_0_train_id = JHC_train.loc[JHC_train['dist'] > 1.065, 'id']
JHC_1_train_id = JHC_train.loc[JHC_train['dist'] <= 1.065, 'id']

JHC_0_test_id = JHC_test.loc[JHC_test['dist'] > 1.065, 'id']
JHC_1_test_id = JHC_test.loc[JHC_test['dist'] <= 1.065, 'id']

In [10]:
# Sizes of the four id groups, in order: train 0, train 1, test 0, test 1.
tuple(
    len(id_series)
    for id_series in (JHC_0_train_id, JHC_1_train_id, JHC_0_test_id, JHC_1_test_id)
)

(700156, 9260, 375752, 4857)

In [11]:
# Re-read the unmodified train/test CSVs; `train` and `test` were reassigned
# above with the merged atom columns and `dist`.
train_orig, test_orig = (
    pd.read_csv(f'{path}/train.csv'),
    pd.read_csv(f'{path}/test.csv'),
)

In [12]:
# Work on copies so the freshly loaded originals stay untouched.
train_split, test_split = train_orig.copy(), test_orig.copy()

In [13]:
# Overwrite `type` for rows whose id falls in one of the two 1JHC groups.
# Train rows:
train_split.loc[train_split.id.isin(JHC_0_train_id), 'type'] = '1JHC_0'
train_split.loc[train_split.id.isin(JHC_1_train_id), 'type'] = '1JHC_1'

# Test rows:
test_split.loc[test_split.id.isin(JHC_0_test_id), 'type'] = '1JHC_0'
test_split.loc[test_split.id.isin(JHC_1_test_id), 'type'] = '1JHC_1'

In [14]:
# Row counts per new label, in order: train 1JHC_0, train 1JHC_1, test 1JHC_0, test 1JHC_1.
tuple(
    len(frame[frame.type == label])
    for frame in (train_split, test_split)
    for label in ('1JHC_0', '1JHC_1')
)

(700156, 9260, 375752, 4857)

In [15]:
# Persist the relabelled frames next to the inputs, without the row index.
train_split.to_csv(path_or_buf=f'{path}/train_1JHC_split.csv', index=False)
test_split.to_csv(path_or_buf=f'{path}/test_1JHC_split.csv', index=False)

In [16]:
# Read the written train file back and show its first five rows.
pd.read_csv(f'{path}/train_1JHC_split.csv').head(n=5)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC_0,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC_0,84.8074


In [17]:
# Read the written test file back and show its first five rows.
pd.read_csv(f'{path}/test_1JHC_split.csv').head(n=5)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC_1
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC_1
4,4658151,dsgdb9nsd_000004,3,1,2JHC
