In [2]:
import numpy as np
import pandas as pd
# Seed NumPy's legacy global random generator so random draws repeat across runs.
np.random.seed(55)
# Uncomment to show floats with 15 digits of precision in pandas output.
# pd.options.display.precision = 15

In [4]:
%%time

# Directory the CSV tables are read from; the commented line is an
# alternative location kept for reference.
input_dir = '../input'
# input_dir = '../work/subsample_10000'

# output_dir = '.'
output_dir = '../work'

# Read the five tables one after another, in the order the names are listed.
train, test, sub, structures, contributions = (
    pd.read_csv(input_dir + '/' + csv_name)
    for csv_name in ('train.csv',
                     'test.csv',
                     'sample_submission.csv',
                     'structures.csv',
                     'scalar_coupling_contributions.csv')
)

# Report row/column counts for every table, one line per table.
print('\n'.join(
    '{} dataset shape is now rows: {} cols:{}'.format(label, *frame.shape)
    for label, frame in (('Train', train),
                         ('Test', test),
                         ('Sub', sub),
                         ('Structures', structures),
                         ('Scalar_coupling_contributions', contributions))
))

Train dataset shape is now rows: 4658147 cols:6
Test dataset shape is now rows: 2505542 cols:5
Sub dataset shape is now rows: 2505542 cols:2
Structures dataset shape is now rows: 2358657 cols:6
Scalar_coupling_contributions dataset shape is now rows: 4658147 cols:8
CPU times: user 7.24 s, sys: 578 ms, total: 7.81 s
Wall time: 7.81 s


In [13]:
def map_atom_info(df, atom_idx, structures_df=None):
    """Attach the element and coordinates of one atom of each pair to ``df``.

    Left-joins ``df`` against the structures table, matching
    ``molecule_name`` and ``atom_index_{atom_idx}`` in ``df`` to
    ``molecule_name`` and ``atom_index`` in the structures table. The joined
    ``atom``, ``x``, ``y`` and ``z`` columns are renamed with an
    ``_{atom_idx}`` suffix and the redundant ``atom_index`` column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``molecule_name`` and ``atom_index_{atom_idx}``.
    atom_idx : int or str
        Suffix selecting which atom-index column of ``df`` to join on; it is
        also appended to the names of the joined columns.
    structures_df : pandas.DataFrame, optional
        Table to join against (``molecule_name``, ``atom_index``, ``atom``,
        ``x``, ``y``, ``z``). When omitted, the notebook-level ``structures``
        frame is used, so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Because the join is a
        left join, rows of ``df`` without a match keep NaN in the new columns.
    """
    if structures_df is None:
        # Fall back to the global table for backward compatibility.
        structures_df = structures

    merged = pd.merge(
        df,
        structures_df,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )

    # The right-hand key duplicates atom_index_{atom_idx}; drop it.
    merged = merged.drop('atom_index', axis=1)

    return merged.rename(columns={'atom': f'atom_{atom_idx}',
                                  'x': f'x_{atom_idx}',
                                  'y': f'y_{atom_idx}',
                                  'z': f'z_{atom_idx}'})

# ------------------------------------------------------------------
# Create features: join atom element and coordinates for both atoms
# ------------------------------------------------------------------

# The inner call joins atom 0 first, the outer call then joins atom 1.
train = map_atom_info(map_atom_info(train, 0), 1)
test = map_atom_info(map_atom_info(test, 0), 1)

In [14]:
# Display the first ten rows of train as the cell output.
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,H,1.011731,1.463751,0.000277,H,-0.540815,1.447527,-0.876644
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,H,1.011731,1.463751,0.000277,H,-0.523814,1.437933,0.906397
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,H,-0.540815,1.447527,-0.876644,C,-0.012698,1.085804,0.008001
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,H,-0.540815,1.447527,-0.876644,H,-0.523814,1.437933,0.906397
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,H,-0.523814,1.437933,0.906397,C,-0.012698,1.085804,0.008001


In [18]:
# Count non-null ids in train for each (type, atom_0, atom_1) combination.
train.groupby(['type', 'atom_0', 'atom_1'])['id'].count()

type  atom_0  atom_1
1JHC  H       C          709416
1JHN  H       N           43363
2JHC  H       C         1140674
2JHH  H       H          378036
2JHN  H       N          119253
3JHC  H       C         1510379
3JHH  H       H          590611
3JHN  H       N          166415
Name: id, dtype: int64

In [19]:
# Count non-null ids in test for each (type, atom_0, atom_1) combination.
test.groupby(['type', 'atom_0', 'atom_1'])['id'].count()

type  atom_0  atom_1
1JHC  H       C         380609
1JHN  H       N          24195
2JHC  H       C         613138
2JHH  H       H         203126
2JHN  H       N          64424
3JHC  H       C         811999
3JHH  H       H         317435
3JHN  H       N          90616
Name: id, dtype: int64

In [20]:
# Andrew Lukyanenko :)

# Split the coupling type string (e.g. '1JHC') into its first character
# ('1') and the remainder ('JHC'). The vectorized .str accessor replaces the
# per-row Python lambda; a missing type yields NaN instead of raising.
# Both new columns stay strings.
train['type_0'] = train['type'].str[0]
test['type_0'] = test['type'].str[0]
train['type_1'] = train['type'].str[1:]
test['type_1'] = test['type'].str[1:]

train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,type_0,type_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1,JHC
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,2,JHH
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,2,JHH
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,2,JHH
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1,JHC
