In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max.rows', 150)
pd.set_option('display.max.columns', 150)

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
import gc

In [5]:
# Load every competition table from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')
structures = pd.read_csv('structures.csv')
scalar_coupling_contributions = pd.read_csv('scalar_coupling_contributions.csv')

# Report (rows, cols) for each table; `shape` is a 2-tuple, so it can be
# unpacked straight into the two format slots.
print('Train dataset shape is -> rows: {} cols:{}'.format(*train.shape))
print('Test dataset shape is  -> rows: {} cols:{}'.format(*test.shape))
print('Sub dataset shape is  -> rows: {} cols:{}'.format(*sub.shape))
print('Structures dataset shape is  -> rows: {} cols:{}'.format(*structures.shape))
print('Scalar_coupling_contributions dataset shape is  -> rows: {} cols:{}'.format(
    *scalar_coupling_contributions.shape))

Train dataset shape is -> rows: 4658147 cols:6
Test dataset shape is  -> rows: 2505542 cols:5
Sub dataset shape is  -> rows: 2505542 cols:2
Structures dataset shape is  -> rows: 2358657 cols:6
Scalar_coupling_contributions dataset shape is  -> rows: 4658147 cols:8


In [6]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive, so a column
    spanning exactly -128..127 becomes int8). Float columns are cast to
    float16 or float32 when their range fits, otherwise float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        Print the resulting memory footprint and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so values are rounded noticeably;
        pass ``False`` to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    # Only columns whose dtype compares equal to one of these names are touched.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type whose inclusive range holds the data.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this also covers NaN/inf extrema,
                # for which every range comparison is False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

For a fast model/feature evaluation, use only 10% of the dataset. The final submission must remove or comment out this code.

In [7]:
# Default estimator count. Not referenced anywhere in the visible part of the
# notebook — presumably consumed by a later model-training cell (TODO confirm).
n_estimators_default = 4000

The important thing to know is that the scalar coupling constants in train.csv are a sum of four terms.
```
* fc is the Fermi Contact contribution
* sd is the Spin-dipolar contribution
* pso is the Paramagnetic spin-orbit contribution
* dso is the Diamagnetic spin-orbit contribution. 
```
Let's merge this into train

In [8]:
# Attach the fc / sd / pso / dso contribution columns to every training pair.
# Both tables use identical key column names, so a single `on=` list suffices.
train = train.merge(
    scalar_coupling_contributions,
    how='left',
    on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
)

In [9]:
# Preview the first ten rows of train with the contribution columns merged in.
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,83.0224,0.254579,1.25862,0.27201
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,-11.0347,0.352978,2.85839,-3.4336
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,-11.0325,0.352944,2.85852,-3.43387
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,-11.0319,0.352934,2.85855,-3.43393
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,83.0222,0.254585,1.25861,0.272013
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,-11.0317,0.352932,2.85856,-3.43395
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,-11.0324,0.352943,2.85853,-3.43387
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,83.0241,0.254634,1.25856,0.272012
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,-11.0319,0.352943,2.85856,-3.43393
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,83.0243,0.254628,1.25856,0.272012


In [10]:
# Preview the first ten rows of the test set.
test.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC
4,4658151,dsgdb9nsd_000004,3,1,2JHC
5,4658152,dsgdb9nsd_000015,3,0,1JHC
6,4658153,dsgdb9nsd_000015,3,2,3JHC
7,4658154,dsgdb9nsd_000015,3,4,2JHH
8,4658155,dsgdb9nsd_000015,3,5,2JHH
9,4658156,dsgdb9nsd_000015,4,0,1JHC


In [11]:
# Preview the first five rows of the per-pair contribution table.
scalar_coupling_contributions.head(5)

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,fc,sd,pso,dso
0,dsgdb9nsd_000001,1,0,1JHC,83.0224,0.254579,1.25862,0.27201
1,dsgdb9nsd_000001,1,2,2JHH,-11.0347,0.352978,2.85839,-3.4336
2,dsgdb9nsd_000001,1,3,2JHH,-11.0325,0.352944,2.85852,-3.43387
3,dsgdb9nsd_000001,1,4,2JHH,-11.0319,0.352934,2.85855,-3.43393
4,dsgdb9nsd_000001,2,0,1JHC,83.0222,0.254585,1.25861,0.272013


`train['scalar_coupling_constant']` and `scalar_coupling_contributions['fc']` are quite similar:

In [12]:
# Put the target next to the Fermi-contact term (aligned on the row index)
# and show the first ten rows.
pd.concat(
    objs=[train['scalar_coupling_constant'], scalar_coupling_contributions['fc']],
    axis=1,
).head(10)

Unnamed: 0,scalar_coupling_constant,fc
0,84.8076,83.0224
1,-11.257,-11.0347
2,-11.2548,-11.0325
3,-11.2543,-11.0319
4,84.8074,83.0222
5,-11.2541,-11.0317
6,-11.2548,-11.0324
7,84.8093,83.0241
8,-11.2543,-11.0319
9,84.8095,83.0243


Based on others' ideas, we can:<br>

- train a model to predict `fc` feature;
- add this feature to train and test and train the same model to compare performance;
- train a better model;

<a id="id4"></a> <br> 
# **4. Data Pre-processing** 

## Feature generation

I use this great kernel to get x,y,z position. https://www.kaggle.com/seriousran/just-speed-up-calculate-distance-from-benchmark

In [13]:
# `tqdm.tqdm_notebook` is a deprecated alias; `tqdm.auto` selects the notebook
# widget when one is available and falls back to the console bar otherwise.
from tqdm.auto import tqdm

# Per-element radii, stored without the fudge factor ...
atomic_radius = {'H':0.38, 'C':0.77, 'N':0.75, 'O':0.73, 'F':0.71} # Without fudge factor

# ... then widened by a constant so that slightly stretched bonds are still
# detected by the "distance < sum of radii" test used in the bond calculation.
fudge_factor = 0.05
atomic_radius = {k:v + fudge_factor for k,v in atomic_radius.items()}
print(atomic_radius)

electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}

# Look up both properties for every atom row. An element symbol missing from
# either table raises KeyError instead of silently producing NaN.
atoms = structures['atom'].values
atoms_en = [electronegativity[x] for x in tqdm(atoms, desc='electronegativity')]
atoms_rad = [atomic_radius[x] for x in tqdm(atoms, desc='atomic radius')]

structures['EN'] = atoms_en
structures['rad'] = atoms_rad

display(structures.head())

{'H': 0.43, 'C': 0.8200000000000001, 'N': 0.8, 'O': 0.78, 'F': 0.76}


HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43


### Chemical Bond Calculation

In [14]:
# Detect chemical bonds between atoms of the same molecule and summarise, per
# atom, how many bonds it has and the mean / std of their lengths.
#
# The comparison is done with rolled copies of the per-atom arrays: after k
# rolls, row j of the *_compare arrays holds the atom k rows below row j, so
# each loop iteration compares every atom with one fixed row offset at once.
# NOTE(review): the atom-index arithmetic below assumes the rows of
# `structures` are grouped by molecule and ordered by atom_index — confirm.
i_atom = structures['atom_index'].values
p = structures[['x', 'y', 'z']].values
p_compare = p
m = structures['molecule_name'].values
m_compare = m
r = structures['rad'].values
r_compare = r

source_row = np.arange(len(structures))
# NOTE(review): column index `max_atoms` is also the dummy column deleted
# below, so an atom whose atom_index equals max_atoms would have the bonds
# recorded under its own column discarded (and a larger atom_index would index
# out of bounds). Confirm the largest atom_index present in `structures`.
max_atoms = 28

# One extra row and one extra column act as a sink for invalid targets.
bonds = np.zeros((len(structures)+1, max_atoms+1), dtype=np.int8)
bond_dists = np.zeros((len(structures)+1, max_atoms+1), dtype=np.float32)

print('Calculating bonds')

# Iteration i compares each atom with the atom i+1 rows further down.
for i in tqdm(range(max_atoms-1)):
    p_compare = np.roll(p_compare, -1, axis=0)
    m_compare = np.roll(m_compare, -1, axis=0)
    r_compare = np.roll(r_compare, -1, axis=0)
    
    mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
    # Distances to atoms of a different molecule are zeroed out by the mask.
    dists = np.linalg.norm(p - p_compare, axis=1) * mask
    r_bond = r + r_compare
    
    # Bonded when the atoms are distinct (distance > 0.0001, which also rejects
    # the masked-out zeros) and closer than the sum of their padded radii.
    bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)
    
    source_row = source_row
    target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row) #If invalid target, write to dummy row
    
    source_atom = i_atom
    target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col
    
    # Record the bond symmetrically: in the source atom's row under the target
    # atom's column, and in the target atom's row under the source's column.
    bonds[(source_row, target_atom)] = bond
    bonds[(target_row, source_atom)] = bond
    bond_dists[(source_row, target_atom)] = dists
    bond_dists[(target_row, source_atom)] = dists

bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col

print('Counting and condensing bonds')

# Per atom: the column indices of its bonded partners, then the matching
# distances. An atom with no bonds yields empty lists, for which np.mean and
# np.std return NaN.
bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
bond_lengths_mean = [ np.mean(x) for x in bond_lengths]
bond_lengths_std = [ np.std(x) for x in bond_lengths]
n_bonds = [len(x) for x in bonds_numeric]

#bond_data = {'bond_' + str(i):col for i, col in enumerate(np.transpose(bonds))}
#bond_data.update({'bonds_numeric':bonds_numeric, 'n_bonds':n_bonds})

bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean,'bond_lengths_std':bond_lengths_std }
bond_df = pd.DataFrame(bond_data)
# join() aligns on the index; bond_df has a default 0..n-1 index.
# NOTE(review): assumes `structures` still has its default index — confirm.
structures = structures.join(bond_df)
display(structures.head(20))

Calculating bonds


HBox(children=(IntProgress(value=0, max=27), HTML(value='')))


Counting and condensing bonds


HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,n_bonds,bond_lengths_mean,bond_lengths_std
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,3e-06
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,0.0
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,0.0
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946,0.0
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948,0.0
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.8,3,1.017195,9e-06
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.43,1,1.01719,0.0
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.43,1,1.017187,0.0
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.43,1,1.017208,0.0
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.78,2,0.962107,0.0


In [15]:
def map_atom_info(df, atom_idx):
    """Attach the `structures` row of one atom of each pair to ``df``.

    Left-joins the module-level ``structures`` frame onto ``df`` by matching
    ``molecule_name`` and ``atom_index_{atom_idx}`` against the structure's
    ``atom_index``, then tags the element and coordinate columns with the
    atom position (``atom_0``, ``x_0``, ... or ``atom_1``, ``x_1``, ...).

    All other structure columns, including ``atom_index`` itself, keep their
    names; calling this a second time therefore makes pandas disambiguate
    them with its default ``_x`` / ``_y`` suffixes.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with ``molecule_name`` and ``atom_index_{atom_idx}``.
    atom_idx : int
        Which atom of the pair to describe (0 or 1 in this notebook).

    Returns
    -------
    pandas.DataFrame
        A new frame with the structure columns appended.
    """
    suffix = f'_{atom_idx}'
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', 'atom_index' + suffix],
        right_on=['molecule_name', 'atom_index'],
    )
    # Only the element symbol and the coordinates get the positional suffix.
    tagged = {name: name + suffix for name in ('atom', 'x', 'y', 'z')}
    return merged.rename(columns=tagged)

# Describe atom 0 first, then atom 1, for both the train and the test pairs.
# Re-running this cell merges the structure columns again, so run it once.
for atom_idx in (0, 1):
    train = map_atom_info(train, atom_idx)
    test = map_atom_info(test, atom_idx)

In [16]:
# Preview train now that the structure columns of both atoms are attached.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,atom_index_x,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,bond_lengths_std_x,atom_index_y,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,bond_lengths_std_y
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,83.0224,0.254579,1.25862,0.27201,1,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,0.0,0,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,3e-06
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,-11.0347,0.352978,2.85839,-3.4336,1,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,0.0,2,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,0.0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,-11.0325,0.352944,2.85852,-3.43387,1,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,0.0,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946,0.0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,-11.0319,0.352934,2.85855,-3.43393,1,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,0.0,4,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948,0.0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,83.0222,0.254585,1.25861,0.272013,2,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,0.0,0,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,3e-06


Let's get the distance between atoms first.

In [17]:
# Coordinates of the two atoms of every pair, as (n_pairs, 3) arrays.
train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

for frame, p_0, p_1 in ((train, train_p_0, train_p_1), (test, test_p_0, test_p_1)):
    # Euclidean distance between the two atoms.
    frame['dist'] = np.linalg.norm(p_0 - p_1, axis=1)
    # Squared coordinate difference along each axis (not a square root).
    for axis in ('x', 'y', 'z'):
        frame['dist_' + axis] = (frame[axis + '_0'] - frame[axis + '_1']) ** 2
    # First character of the coupling type string.
    frame['type_0'] = frame['type'].apply(lambda x: x[0])

In [18]:
def create_features(df):
    """Add per-molecule and per-atom aggregate features to ``df`` in place.

    Every feature is a group-wise ``transform`` broadcast back to the rows.
    For some of them the difference (aggregate - row value) and/or the ratio
    (aggregate / row value) against the aggregated column is added as well.
    Finally the frame is passed through ``reduce_mem_usage``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``id``, ``molecule_name``, ``atom_index_0``,
        ``atom_index_1``, ``atom_1``, ``type``, ``type_0``, ``x_1``, ``y_1``,
        ``z_1`` and ``dist``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``reduce_mem_usage(df)`` returns.
    """
    mol = 'molecule_name'
    by_atom_0 = [mol, 'atom_index_0']
    by_atom_1 = [mol, 'atom_index_1']
    by_element_1 = [mol, 'atom_1']
    by_type_0 = [mol, 'type_0']
    by_type = [mol, 'type']

    NONE, DIFF, BOTH = (), ('diff',), ('diff', 'div')

    def add_stat(prefix, keys, source, stat, derived):
        """Add ``{prefix}_{source}_{stat}`` plus the requested derived columns."""
        col = f'{prefix}_{source}_{stat}'
        df[col] = df.groupby(keys)[source].transform(stat)
        if 'diff' in derived:
            df[col + '_diff'] = df[col] - df[source]
        if 'div' in derived:
            df[col + '_div'] = df[col] / df[source]

    # Counts and whole-molecule distance statistics.
    df['molecule_couples'] = df.groupby(mol)['id'].transform('count')
    for stat in ('mean', 'min', 'max'):
        add_stat('molecule', mol, 'dist', stat, NONE)
    for idx, keys in ((0, by_atom_0), (1, by_atom_1)):
        df[f'atom_{idx}_couples_count'] = df.groupby(keys)['id'].transform('count')

    # (column prefix, grouping keys, aggregated column, statistic, derived columns)
    # The order of this list fixes the order of the resulting columns.
    plan = [
        ('molecule_atom_index_0', by_atom_0, 'x_1', 'std', NONE),
        ('molecule_atom_index_0', by_atom_0, 'y_1', 'mean', BOTH),
        ('molecule_atom_index_0', by_atom_0, 'y_1', 'max', DIFF),
        ('molecule_atom_index_0', by_atom_0, 'y_1', 'std', NONE),
        ('molecule_atom_index_0', by_atom_0, 'z_1', 'std', NONE),
        ('molecule_atom_index_0', by_atom_0, 'dist', 'mean', BOTH),
        ('molecule_atom_index_0', by_atom_0, 'dist', 'max', BOTH),
        ('molecule_atom_index_0', by_atom_0, 'dist', 'min', BOTH),
        ('molecule_atom_index_0', by_atom_0, 'dist', 'std', BOTH),
        ('molecule_atom_index_1', by_atom_1, 'dist', 'mean', BOTH),
        ('molecule_atom_index_1', by_atom_1, 'dist', 'max', BOTH),
        ('molecule_atom_index_1', by_atom_1, 'dist', 'min', BOTH),
        ('molecule_atom_index_1', by_atom_1, 'dist', 'std', BOTH),
        ('molecule_atom_1', by_element_1, 'dist', 'mean', NONE),
        ('molecule_atom_1', by_element_1, 'dist', 'min', BOTH),
        ('molecule_atom_1', by_element_1, 'dist', 'std', DIFF),
        ('molecule_type_0', by_type_0, 'dist', 'std', DIFF),
        ('molecule_type', by_type, 'dist', 'mean', BOTH),
        ('molecule_type', by_type, 'dist', 'max', NONE),
        ('molecule_type', by_type, 'dist', 'min', NONE),
        ('molecule_type', by_type, 'dist', 'std', DIFF),
    ]
    for prefix, keys, source, stat, derived in plan:
        add_stat(prefix, keys, source, stat, derived)

    df = reduce_mem_usage(df)
    return df

# Build the aggregate features for train first, then for test.
train, test = create_features(train), create_features(test)

Mem. usage decreased to 950.66 Mb (69.9% reduction)
Mem. usage decreased to 477.89 Mb (70.2% reduction)


In [19]:
# Number of columns in train after feature generation.
len(train.columns)

88

In [20]:
# Preview the first ten rows of train with the generated features.
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,atom_index_x,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,bond_lengths_std_x,atom_index_y,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,bond_lengths_std_y,dist,dist_x,dist_y,dist_z,type_0,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,0.000221,1.192383,3.6e-05,1,10,1.506836,1.091797,1.783203,4,4,0.728027,1.358398,0.272949,1.25138,1.463867,0.37793,0.182251,0.728027,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.345703,-0.746582,0.316406,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-6.67572e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,-11.03125,0.353027,2.857422,-3.433594,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,1.783203,1.019531,2.160156,3e-06,2,10,1.506836,1.091797,1.783203,4,1,0.728027,1.358398,-0.10498,0.928268,1.463867,0.0,0.182251,0.728027,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,2.7e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,3,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,0.0,1.783203,0.294922,2.113281,0.771973,2,10,1.506836,1.091797,1.783203,4,2,0.728027,1.358398,-0.088745,0.938673,1.463867,0.01622,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,9e-06,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,5e-06,1.0,1.783203,1e-05,1.0,1.783203,0.0,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,4,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,0.0,1.783203,0.276611,2.085938,0.817871,2,10,1.506836,1.091797,1.783203,4,3,0.728027,1.358398,-0.079163,0.944936,1.463867,0.025818,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,-6e-06,1.0,1.783203,0.0,1.0,1.783203,-8.821487e-06,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,1.049805,0.142822,6e-05,1,10,1.506836,1.091797,1.783203,3,4,0.300049,1.324219,0.237915,1.219147,1.447266,0.361816,0.206177,0.891602,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.39917,-0.692871,0.365479,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5.245209e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-2e-06,1.0,1.091797,1.091797,3e-06,-1.091797
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,3,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,0.0,1.783203,2.410156,0.000263,0.769043,2,10,1.506836,1.091797,1.783203,3,2,0.300049,1.324219,-0.123779,0.914494,1.447266,0.0,0.206177,0.891602,1.552734,-0.230347,0.870605,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.39917,-1.383789,0.223755,1.783203,-5e-06,1.0,1.783203,0.0,1.0,1.783203,-1.019239e-05,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-3.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1.1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,4,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,0.0,1.783203,2.357422,0.000667,0.821289,2,10,1.506836,1.091797,1.783203,3,3,0.300049,1.324219,-0.114197,0.920596,1.447266,0.009598,0.206177,0.891602,1.552734,-0.230347,0.870605,1.783203,9e-06,1.0,1.091797,-0.691406,0.612305,0.39917,-1.383789,0.223755,1.783203,3e-06,1.0,1.783203,8e-06,1.0,1.783203,-5.364418e-07,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-2.9e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-2e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,3,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,0.278809,0.130859,0.782715,1,10,1.506836,1.091797,1.783203,2,4,0.361328,1.261719,0.176025,1.162151,1.4375,0.352051,0.249023,0.635254,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.48877,-0.603027,0.44751,1.091797,3e-06,1.0,1.091797,7e-06,1.0,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,3,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,0.0,4,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,0.0,1.783203,0.000289,9.2e-05,3.179688,2,10,1.506836,1.091797,1.783203,2,3,0.361328,1.261719,-0.176025,0.877557,1.4375,0.0,0.249023,0.635254,1.4375,-0.345703,0.806152,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.48877,-1.293945,0.27417,1.783203,3e-06,1.0,1.783203,9e-06,1.0,1.783203,0.0,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-2e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,4,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,0.26123,0.124023,0.807129,1,10,1.506836,1.091797,1.783203,1,4,,1.085938,0.0,1.0,1.085938,0.0,,,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,0.0,1.0,,,,1.091797,2e-06,1.0,1.091797,6e-06,1.0,1.091797,-1.132488e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,2e-06,1.0,1.091797,1.091797,3e-06,-1.091797


In [21]:
def map_atom_info(df_1, df_2, atom_idx):
    """Left-join per-atom data from ``df_2`` onto the pair table ``df_1``.

    Rows are matched on ``molecule_name`` and on ``atom_index_{atom_idx}`` of
    ``df_1`` against ``atom_index`` of ``df_2``. The ``atom_index`` column is
    removed from the result.

    Note: this definition replaces the two-argument ``map_atom_info`` defined
    in an earlier cell of the notebook.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Pair table with ``molecule_name`` and ``atom_index_{atom_idx}``.
    df_2 : pandas.DataFrame
        Per-atom table with ``molecule_name`` and ``atom_index``.
    atom_idx : int
        Which atom of the pair to match on.

    Returns
    -------
    pandas.DataFrame
        The merged frame without the ``atom_index`` column.
    """
    merged = df_1.merge(
        df_2,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )
    return merged.drop(columns='atom_index')

def create_closest(df_train):
    """Attach, for both atoms of every pair, their closest coupled partner.

    "Closest" is judged among the pairs listed in ``df_train``: each pair is
    considered from both ends, and for every (molecule, atom) the partner
    with the smallest ``dist`` is kept.

    Fixes relative to the earlier version of this function:

    * The pair distance column is called ``dist`` here, so it is ``dist``
      (not ``distance``) that is renamed to ``distance_closest``. Previously
      the rename was a no-op and the leftover ``dist`` column collided with
      ``df_train['dist']`` during the merge.
    * When several partners tie for the minimum distance only the first is
      kept, so the left merges never duplicate rows of ``df_train``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Pair table with ``molecule_name``, ``atom_index_0``, ``atom_index_1``,
        ``dist`` and the coordinates ``x_0 .. z_1``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` plus, for ``k`` in (0, 1): ``atom_index_closest_k``,
        ``distance_closest_k``, ``x_closest_k``, ``y_closest_k`` and
        ``z_closest_k``.
    """
    pair_cols = ["molecule_name", "atom_index_0", "atom_index_1", "dist",
                 "x_0", "y_0", "z_0", "x_1", "y_1", "z_1"]
    forward = df_train.loc[:, pair_cols].copy()
    # The same pairs seen from the other atom: swap the roles of atom 0 and 1.
    backward = forward.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
    df_temp = pd.concat(objs=[forward, backward], axis=0, ignore_index=True)

    # Keep only the rows that realise the minimum distance of their atom.
    df_temp["min_distance"] = df_temp.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df_temp = df_temp[df_temp["min_distance"] == df_temp["dist"]]

    df_temp = df_temp.drop(['x_0', 'y_0', 'z_0', 'min_distance'], axis=1)
    df_temp = df_temp.rename(columns={'atom_index_0': 'atom_index',
                                      'atom_index_1': 'atom_index_closest',
                                      'dist': 'distance_closest',
                                      'x_1': 'x_closest',
                                      'y_1': 'y_closest',
                                      'z_1': 'z_closest'})
    # Exactly one closest partner per atom, even when distances tie.
    df_temp = df_temp.drop_duplicates(subset=['molecule_name', 'atom_index'])

    for atom_idx in [0, 1]:
        df_train = map_atom_info(df_train, df_temp, atom_idx)
        df_train = df_train.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                            'distance_closest': f'distance_closest_{atom_idx}',
                                            'x_closest': f'x_closest_{atom_idx}',
                                            'y_closest': f'y_closest_{atom_idx}',
                                            'z_closest': f'z_closest_{atom_idx}'})
    return df_train

#dtrain = create_closest(train)
#dtest = create_closest(test)
#print('dtrain size',dtrain.shape)
#print('dtest size',dtest.shape)

### cosine angles calculation

In [None]:
def add_cos_features(df):
    """Add closest-neighbour distances and three cosine features to ``df``.

    For each atom ``k`` of the pair, ``distance_k`` is the distance between the
    atom and its closest neighbour (``*_closest_k`` coordinates). Three unit
    vectors are then formed — neighbour 0 -> atom 0, neighbour 1 -> atom 1,
    and atom 0 -> atom 1 — and their pairwise dot products are stored as
    ``cos_0_1``, ``cos_0`` and ``cos_1``.

    The helper ``vec_*`` columns are written into the input frame and removed
    only from the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with ``x_k, y_k, z_k``, ``x_closest_k, y_closest_k,
        z_closest_k`` for ``k`` in (0, 1), and ``dist``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``distance_0``, ``distance_1``, ``cos_0_1``,
        ``cos_0`` and ``cos_1`` appended.
    """
    axes = ('x', 'y', 'z')

    # Distance from each atom to its own closest neighbour.
    for k in (0, 1):
        dx, dy, dz = (df[f'{axis}_{k}'] - df[f'{axis}_closest_{k}'] for axis in axes)
        df[f'distance_{k}'] = (dx**2 + dy**2 + dz**2)**(1/2)

    # Unit vectors pointing from the closest neighbour to the atom ...
    for k in (0, 1):
        for axis in axes:
            df[f'vec_{k}_{axis}'] = (df[f'{axis}_{k}'] - df[f'{axis}_closest_{k}']) / df[f'distance_{k}']
    # ... and from atom 0 to atom 1.
    for axis in axes:
        df[f'vec_{axis}'] = (df[f'{axis}_1'] - df[f'{axis}_0']) / df['dist']

    def dot(lhs, rhs):
        """Row-wise dot product of two vector column families."""
        return (df[f'{lhs}_x'] * df[f'{rhs}_x']
                + df[f'{lhs}_y'] * df[f'{rhs}_y']
                + df[f'{lhs}_z'] * df[f'{rhs}_z'])

    df['cos_0_1'] = dot('vec_0', 'vec_1')
    df['cos_0'] = dot('vec_0', 'vec')
    df['cos_1'] = dot('vec_1', 'vec')

    helper_cols = [f'vec_{k}_{axis}' for k in (0, 1) for axis in axes]
    helper_cols += [f'vec_{axis}' for axis in axes]
    df = df.drop(columns=helper_cols)
    return df
    
# train = add_cos_features(train)
# test = add_cos_features(test)

#print('train size',train.shape)
#print('test size',test.shape)

In [22]:
# Preview the first rows of `train` to eyeball the columns built so far.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,atom_index_x,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,bond_lengths_std_x,atom_index_y,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,bond_lengths_std_y,dist,dist_x,dist_y,dist_z,type_0,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,0.000221,1.192383,3.6e-05,1,10,1.506836,1.091797,1.783203,4,4,0.728027,1.358398,0.272949,1.25138,1.463867,0.37793,0.182251,0.728027,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.345703,-0.746582,0.316406,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,-11.03125,0.353027,2.857422,-3.433594,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,1.783203,1.019531,2.160156,3e-06,2,10,1.506836,1.091797,1.783203,4,1,0.728027,1.358398,-0.10498,0.928268,1.463867,0.0,0.182251,0.728027,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,2.7e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,3,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,0.0,1.783203,0.294922,2.113281,0.771973,2,10,1.506836,1.091797,1.783203,4,2,0.728027,1.358398,-0.088745,0.938673,1.463867,0.01622,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,9e-06,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,5e-06,1.0,1.783203,1e-05,1.0,1.783203,0.0,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,4,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,0.0,1.783203,0.276611,2.085938,0.817871,2,10,1.506836,1.091797,1.783203,4,3,0.728027,1.358398,-0.079163,0.944936,1.463867,0.025818,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,-6e-06,1.0,1.783203,0.0,1.0,1.783203,-9e-06,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,1.049805,0.142822,6e-05,1,10,1.506836,1.091797,1.783203,3,4,0.300049,1.324219,0.237915,1.219147,1.447266,0.361816,0.206177,0.891602,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.39917,-0.692871,0.365479,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-2e-06,1.0,1.091797,1.091797,3e-06,-1.091797


Drop `molecule_name` and encode `atom_0`, `atom_1` and `type_0`.<br>
**@TODO:** Try other encoders.

In [23]:
# Columns removed from both train and test before modelling.
del_cols_list = ['id','molecule_name','sd','pso','dso']
def del_cols(df, cols):
    """Return a copy of ``df`` without the columns named in ``cols``.

    Names in ``cols`` that are not present in ``df`` are skipped, so the
    same list can be applied to frames with different column sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to strip columns from; it is not modified.
    cols : iterable of str
        Candidate column names to drop.  (The previous version ignored this
        argument and always used the global ``del_cols_list``.)

    Returns
    -------
    pandas.DataFrame
        New frame with the matching columns removed.
    """
    present = [col for col in cols if col in df.columns]
    return df.drop(present, axis=1)

# Strip the columns named in del_cols_list from both frames.
train = del_cols(df=train, cols=del_cols_list)
test = del_cols(df=test, cols=del_cols_list)

In [24]:
# Preview `train` again to confirm the dropped columns are gone.
train.head()

Unnamed: 0,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,atom_index_x,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,bond_lengths_std_x,atom_index_y,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,bond_lengths_std_y,dist,dist_x,dist_y,dist_z,type_0,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,1,0,1JHC,84.8125,83.0,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,0.000221,1.192383,3.6e-05,1,10,1.506836,1.091797,1.783203,4,4,0.728027,1.358398,0.272949,1.25138,1.463867,0.37793,0.182251,0.728027,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.345703,-0.746582,0.316406,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
1,1,2,2JHH,-11.257812,-11.03125,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,1.783203,1.019531,2.160156,3e-06,2,10,1.506836,1.091797,1.783203,4,1,0.728027,1.358398,-0.10498,0.928268,1.463867,0.0,0.182251,0.728027,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,2.7e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
2,1,3,2JHH,-11.257812,-11.03125,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,3,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,0.0,1.783203,0.294922,2.113281,0.771973,2,10,1.506836,1.091797,1.783203,4,2,0.728027,1.358398,-0.088745,0.938673,1.463867,0.01622,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,9e-06,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,5e-06,1.0,1.783203,1e-05,1.0,1.783203,0.0,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203
3,1,4,2JHH,-11.257812,-11.03125,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,4,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,0.0,1.783203,0.276611,2.085938,0.817871,2,10,1.506836,1.091797,1.783203,4,3,0.728027,1.358398,-0.079163,0.944936,1.463867,0.025818,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,-6e-06,1.0,1.783203,0.0,1.0,1.783203,-9e-06,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
4,2,0,1JHC,84.8125,83.0,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,1.049805,0.142822,6e-05,1,10,1.506836,1.091797,1.783203,3,4,0.300049,1.324219,0.237915,1.219147,1.447266,0.361816,0.206177,0.891602,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.39917,-0.692871,0.365479,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-2e-06,1.0,1.091797,1.091797,3e-06,-1.091797


In [29]:
# List the distinct values of the `type` column; the per-type subsets are keyed on these.
train['type'].unique()

array(['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN'],
      dtype=object)

In [32]:
# Rows of `train` whose coupling type is 1JHC.
train_1JHC = train.loc[train['type'].eq('1JHC')]

In [33]:
# Preview the 1JHC subset.
train_1JHC.head()

Unnamed: 0,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,atom_index_x,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,bond_lengths_std_x,atom_index_y,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,bond_lengths_std_y,dist,dist_x,dist_y,dist_z,type_0,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,1,0,1JHC,84.8125,83.0,1,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,0.000221,1.192383,3.6e-05,1,10,1.506836,1.091797,1.783203,4,4,0.728027,1.358398,0.272949,1.25138,1.463867,0.37793,0.182251,0.728027,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.345703,-0.746582,0.316406,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
4,2,0,1JHC,84.8125,83.0,2,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,1.049805,0.142822,6e-05,1,10,1.506836,1.091797,1.783203,3,4,0.300049,1.324219,0.237915,1.219147,1.447266,0.361816,0.206177,0.891602,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.39917,-0.692871,0.365479,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-2e-06,1.0,1.091797,1.091797,3e-06,-1.091797
7,3,0,1JHC,84.8125,83.0,3,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,0.278809,0.130859,0.782715,1,10,1.506836,1.091797,1.783203,2,4,0.361328,1.261719,0.176025,1.162151,1.4375,0.352051,0.249023,0.635254,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.48877,-0.603027,0.44751,1.091797,3e-06,1.0,1.091797,7e-06,1.0,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
9,4,0,1JHC,84.8125,83.0,4,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,0.0,0,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,3e-06,1.091797,0.26123,0.124023,0.807129,1,10,1.506836,1.091797,1.783203,1,4,,1.085938,0.0,1.0,1.085938,0.0,,,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,0.0,1.0,,,,1.091797,2e-06,1.0,1.091797,6e-06,1.0,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,2e-06,1.0,1.091797,1.091797,3e-06,-1.091797
17,2,0,1JHC,171.25,170.5,2,H,-0.027802,2.199219,0.014153,2.199219,0.429932,1,1.066406,0.0,0,C,-0.013321,1.132812,0.008278,2.550781,0.819824,2,1.109375,0.042572,1.066406,0.00021,1.137695,3.5e-05,1,2,1.642578,1.066406,2.21875,2,1,0.011055,0.556641,-0.575684,0.491541,1.132812,0.0,0.814453,0.00449,1.642578,0.575684,1.540039,2.21875,1.151367,2.080078,1.066406,0.0,1.0,0.814453,-0.252197,0.763672,1.066406,0.0,1.0,1.066406,0.0,1.0,1.066406,0.0,1.0,,,,1.066406,1.066406,0.0,1.0,,,,,1.066406,0.0,1.0,1.066406,1.066406,,


In [34]:
# Rows of `train` whose coupling type is 2JHH.
train_2JHH = train.loc[train['type'].eq('2JHH')]

In [35]:
# Rows of `train` whose coupling type is 1JHN.
train_1JHN = train.loc[train['type'].eq('1JHN')]

In [36]:
# Rows of `train` whose coupling type is 2JHN.
train_2JHN = train.loc[train['type'].eq('2JHN')]

In [37]:
# Remaining per-type subsets of `train`, one frame per coupling type.
train_2JHC = train.loc[train['type'].eq('2JHC')]
train_3JHH = train.loc[train['type'].eq('3JHH')]
train_3JHC = train.loc[train['type'].eq('3JHC')]
train_3JHN = train.loc[train['type'].eq('3JHN')]

In [38]:
# Preview the 3JHC subset.
train_3JHC.head()

Unnamed: 0,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,atom_index_x,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,bond_lengths_std_x,atom_index_y,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,bond_lengths_std_y,dist,dist_x,dist_y,dist_z,type_0,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
58,3,2,3JHC,4.550781,4.503906,3,H,0.998047,1.874023,0.002605,2.199219,0.429932,1,1.095703,0.0,2,C,0.018341,-1.191406,-0.004505,2.550781,0.819824,2,1.131836,0.070007,3.21875,0.959473,9.398438,5.1e-05,3,15,2.109375,1.061523,3.71875,5,4,0.293213,0.797852,1.989258,-0.669307,1.858398,3.050781,1.34668,0.625488,1.993164,-1.225586,0.619141,3.21875,0.0,1.0,1.095703,-2.123047,0.340332,0.77832,-2.439453,0.241821,2.679688,-0.539551,0.83252,3.21875,0.0,1.0,1.061523,-2.158203,0.329834,1.078125,-2.140625,0.335205,2.193359,1.061523,-2.158203,0.329834,0.972168,-2.246094,0.249634,-2.96875,3.34375,0.124756,1.039062,3.71875,3.21875,0.249634,-2.96875
63,4,2,3JHC,4.554688,4.507812,4,H,-0.541992,1.858398,-0.867188,2.199219,0.429932,1,1.095703,0.0,2,C,0.018341,-1.191406,-0.004505,2.550781,0.819824,2,1.131836,0.070007,3.21875,0.314209,9.304688,0.744141,3,15,2.109375,1.061523,3.71875,4,4,0.263428,0.532715,1.724609,-0.446886,1.848633,3.041016,1.396484,0.449463,2.048828,-1.169922,0.636719,3.21875,0.0,1.0,1.095703,-2.123047,0.340332,0.887207,-2.332031,0.275635,2.679688,-0.539551,0.83252,3.21875,4.5e-05,1.0,1.061523,-2.15625,0.329834,1.078125,-2.140625,0.335205,2.193359,1.061523,-2.15625,0.329834,0.972168,-2.246094,0.249634,-2.96875,3.34375,0.124756,1.039062,3.71875,3.21875,0.249634,-2.96875
67,5,2,3JHC,4.554688,4.503906,5,H,-0.525391,1.848633,0.901367,2.199219,0.429932,1,1.095703,0.0,2,C,0.018341,-1.191406,-0.004505,2.550781,0.819824,2,1.131836,0.070007,3.21875,0.295654,9.242188,0.820801,3,15,2.109375,1.061523,3.71875,3,4,0.018112,0.093994,1.286133,-0.078889,1.464844,2.65625,1.330078,0.007313,2.142578,-1.076172,0.665527,3.21875,0.0,1.0,1.095703,-2.123047,0.340332,1.061523,-2.15625,0.329834,2.679688,-0.539062,0.83252,3.21875,0.000195,1.0,1.061523,-2.15625,0.329834,1.078125,-2.140625,0.335205,2.193359,1.061523,-2.15625,0.329834,0.972168,-2.246094,0.249634,-2.96875,3.34375,0.124939,1.039062,3.71875,3.21875,0.249634,-2.96875
68,6,0,3JHC,2.519531,2.527344,6,H,0.032318,-2.253906,-0.010262,2.199219,0.429932,1,1.061523,0.0,0,C,-0.017822,1.464844,0.010094,2.550781,0.819824,4,1.185547,0.155762,3.71875,0.002514,13.820312,0.000414,3,15,2.109375,1.061523,3.71875,3,4,0.018112,0.093994,-1.370117,0.064206,1.464844,0.0,1.330078,0.007313,2.347656,-1.370117,0.631348,3.71875,0.0,1.0,1.061523,-2.65625,0.2854,1.330078,-2.386719,0.35791,1.750977,-1.966797,0.470947,3.71875,0.0,1.0,1.095703,-2.623047,0.294678,1.311523,-2.40625,0.352783,2.193359,1.061523,-2.65625,0.2854,0.972168,-2.746094,0.249634,-3.46875,3.34375,-0.374512,0.899414,3.71875,3.21875,0.249634,-3.46875
108,3,2,3JHC,2.513672,2.548828,3,H,0.979492,1.964844,0.030991,2.199219,0.429932,1,1.095703,0.0,2,C,0.72168,-0.525879,-1.262695,2.550781,0.819824,4,1.204102,0.188721,2.818359,0.066467,6.203125,1.672852,3,43,2.175781,1.094727,3.505859,7,8,0.619141,0.588867,1.115234,-1.120066,1.948242,2.474609,1.148438,0.80957,2.175781,-0.641602,0.772461,3.078125,0.260986,1.092773,1.095703,-1.72168,0.388916,0.689453,-2.128906,0.244629,2.09375,-0.724121,0.743164,3.505859,0.688477,1.244141,1.094727,-1.723633,0.388428,0.929199,-1.888672,0.329834,2.033203,1.094727,-1.723633,0.388428,0.78125,-2.037109,0.338867,-2.478516,3.046875,0.229492,1.081055,3.505859,2.818359,0.355713,-2.462891


In [None]:
def encode_categoric_single(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each object column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.  It is modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same frame, with object columns replaced by integer codes.
        A frame without object columns is returned unchanged.

    Notes
    -----
    The encoder only sees the values of the frame it is given.  Calling this
    separately on two frames gives matching codes only when both frames
    contain exactly the same set of categories; use ``encode_categoric`` to
    encode a train/test pair consistently.

    Encoding errors are no longer caught and printed: the old handler could
    return a partially encoded frame without the caller noticing.
    """
    # select_dtypes returns an empty selection when there is nothing to
    # encode, whereas describe(include=['O']) raises in that case.
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    for cat in cat_cols:
        df[cat] = LabelEncoder().fit_transform(list(df[cat].values))
    return df

In [None]:
def encode_categoric(dtrain,dtest):
    """Label-encode the object columns of a train/test pair consistently.

    For every column that has object dtype in either frame, one
    ``LabelEncoder`` is fitted on the combined values of the frames that
    contain the column, then applied to each of them.  A given category
    therefore maps to the same integer code in train and test.

    Parameters
    ----------
    dtrain, dtest : pandas.DataFrame
        Frames to encode.  Neither input is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of ``dtrain`` and ``dtest``.  Each keeps its own
        index and its own columns.  The previous implementation concatenated
        the frames row-wise, so columns present in only one frame (e.g. the
        targets) came back NaN-filled in the other.

    Notes
    -----
    Encoding errors propagate instead of being printed and ignored, so a
    partially encoded result can no longer slip through.
    """
    dtrain = dtrain.copy()
    dtest = dtest.copy()
    frames = (dtrain, dtest)

    # Union of the object columns of both frames, in first-seen order.
    cat_cols = []
    for frame in frames:
        for col in frame.select_dtypes(include=['object']).columns:
            if col not in cat_cols:
                cat_cols.append(col)

    for col in cat_cols:
        owners = [frame for frame in frames if col in frame.columns]
        encoder = LabelEncoder()
        # Fit on all values at once so the code <-> category mapping is shared.
        encoder.fit(pd.concat([frame[col] for frame in owners], ignore_index=True))
        for frame in owners:
            frame[col] = encoder.transform(frame[col])

    return dtrain,dtest


In [None]:
# Encode train and test together: each categorical column is fitted on the
# combined values of both frames, so a category gets the same integer code in
# train and in test.  Encoding the frames separately with
# encode_categoric_single only lines up when both contain exactly the same
# set of categories.
train, test = encode_categoric(train, test)

In [None]:
# Pull out both target series, then build the feature matrix without them.
y = train['scalar_coupling_constant']
y_fc = train['fc']
X = train.drop(['scalar_coupling_constant','fc'], axis='columns')

# Independent copy of the test frame to use as the prediction input.
X_test = test.copy()

In [None]:
# Print the shapes of the modelling inputs before the source frames are removed.
print('X size',X.shape)
print('X_test size',X_test.shape)
# X_test was created as a copy of `test`, so this line should match the one above.
print('dtest size',test.shape)
print('y_fc size',y_fc.shape)

# Remove the `train` and `test` names; X, y, y_fc and X_test were derived from
# them in the previous cell (presumably done to free memory -- see the gc call below).
del train, test


In [None]:
# Run a garbage-collection pass now that `train` and `test` have been deleted.
gc.collect()

In [None]:
# Feature subset: X and X_test are restricted to exactly these columns in a later cell.
# NOTE(review): the origin of this selection is not shown here (presumably a
# feature-importance ranking from an earlier run) -- TODO confirm.
good_columns = ['type',
 'bond_lengths_mean_y',
 'bond_lengths_std_y',
 'bond_lengths_mean_x',
 'molecule_atom_index_0_dist_min_div',
 'molecule_atom_index_0_dist_std_div',
 'molecule_atom_index_0_dist_mean',
 'molecule_atom_index_0_dist_max',
 'dist_y',
 'molecule_atom_index_1_dist_std_diff',
 'z_0',
 'molecule_type_dist_min',
 'molecule_atom_index_0_y_1_mean_div',
 'dist_x',
 'x_0',
 'y_0',
 'molecule_type_dist_std',
 'molecule_atom_index_0_y_1_std',
 'molecule_dist_mean',
 'molecule_atom_index_0_dist_std_diff',
 'dist_z',
 'molecule_atom_index_0_dist_std',
 'molecule_atom_index_0_x_1_std',
 'molecule_type_dist_std_diff',
 'molecule_type_0_dist_std',
 'dist',
 'molecule_atom_index_0_dist_mean_diff',
 'molecule_atom_index_1_dist_min_div',
 'molecule_atom_index_1_dist_mean_diff',
 'y_1',
 'molecule_type_dist_mean_div',
 'molecule_dist_max',
 'molecule_atom_index_0_dist_mean_div',
 'z_1',
 'molecule_atom_index_0_z_1_std',
 'molecule_atom_index_1_dist_mean_div',
 'molecule_atom_index_1_dist_min_diff',
 'molecule_atom_index_1_dist_mean',
 'molecule_atom_index_1_dist_min',
 'molecule_atom_index_1_dist_max',
 'molecule_type_0_dist_std_diff',
 'molecule_atom_index_0_dist_min_diff',
 'molecule_type_dist_mean_diff',
 'x_1',
 'molecule_atom_index_0_y_1_max',
 'molecule_atom_index_0_y_1_mean_diff',
 'molecule_atom_1_dist_std_diff',
 'molecule_atom_index_0_y_1_mean',
 'molecule_atom_1_dist_std',
 'molecule_type_dist_max']

In [None]:
# Number of selected feature columns.
len(good_columns)

In [None]:
# Restrict both matrices to the selected features, in good_columns order;
# the explicit deep copy detaches each result from the wider frame it came from.
X = X[good_columns].copy(deep=True)
X_test = X_test[good_columns].copy(deep=True)

In [None]:
# Preview the first 20 rows of the reduced feature matrix.
X.head(20)