In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max.rows', 150)
pd.set_option('display.max.columns', 150)

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error

import xgboost as xgb
from xgboost import XGBRegressor

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial
from tqdm import tqdm_notebook as tqdm

from category_encoders import OrdinalEncoder, OneHotEncoder
# import eli5
# from eli5.sklearn import PermutationImportance

from scipy.stats import randint, uniform

In [2]:
from functools import reduce

In [3]:
plt.rcParams['figure.figsize'] = [30, 50]

In [4]:
train = pd.read_csv(f'./data/train.csv')
test = pd.read_csv(f'./data/test.csv')
trn_ob = pd.read_csv(f'./data/train_ob_charges.csv')  # open bab  https://www.kaggle.com/asauve/v7-estimation-of-mulliken-charges-with-open-babel/output
tst_ob = pd.read_csv(f'./data/test_ob_charges.csv')   # open bab
trn_geom = pd.read_csv(f'./data/train_geom.csv')
tst_geom = pd.read_csv(f'./data/test_geom.csv')
mulliken_charges = pd.read_csv(f'./data/mulliken_charges.csv')
mulliken_charges_test = pd.read_csv(f'./data/mulliken_charges_test_set.csv')  # https://www.kaggle.com/borisdee/predicting-mulliken-charges-with-acsf-descriptors
structures = pd.read_csv(f'./data/structures.csv')

# dipole_moments = pd.read_csv(f'./data/dipole_moments.csv')
# potential_energy = pd.read_csv(f'./data/potential_energy.csv')

## Reduce Memory Function

In [5]:
def reduce_mem_usage(df, verbose=True):
    """
    This function reduces the numeric to the least possible numeric type that fits the data so 
    memory usage during transforming and training will be reduced.
    Taken from: https://www.kaggle.com/todnewman/keras-neural-net-for-champs
    
    Han
    Parameters:
    ===========
    dataframe: input dataframe 
    verbose: verbose mode, default True.
    Output:
    ===========
    dataframe: dataframe with numeric columns types changed to the least possible size
    """

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Create features on structures

In [6]:
# %%time
atomic_radius = {'H':0.38, 'C':0.77, 'N':0.75, 'O':0.73, 'F':0.71} # Without fudge factor

fudge_factor = 0.05
atomic_radius = {k:v + fudge_factor for k,v in atomic_radius.items()}
print(atomic_radius)

electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}

atoms = structures['atom'].values
atoms_en = [electronegativity[x] for x in tqdm(atoms)]
atoms_rad = [atomic_radius[x] for x in tqdm(atoms)]

structures['EN'] = atoms_en
structures['rad'] = atoms_rad

i_atom = structures['atom_index'].values
p = structures[['x', 'y', 'z']].values
p_compare = p
m = structures['molecule_name'].values
m_compare = m
r = structures['rad'].values
r_compare = r

source_row = np.arange(len(structures))
max_atoms = 28

bonds = np.zeros((len(structures)+1, max_atoms+1), dtype=np.int8)
bond_dists = np.zeros((len(structures)+1, max_atoms+1), dtype=np.float32)

print('Calculating bonds')

for i in tqdm(range(max_atoms-1)):
    p_compare = np.roll(p_compare, -1, axis=0)
    m_compare = np.roll(m_compare, -1, axis=0)
    r_compare = np.roll(r_compare, -1, axis=0)
    
    mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
    dists = np.linalg.norm(p - p_compare, axis=1) * mask
    r_bond = r + r_compare
    
    bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)
    
    source_row = source_row
    target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row) #If invalid target, write to dummy row
    
    source_atom = i_atom
    target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col
    
    bonds[(source_row, target_atom)] = bond
    bonds[(target_row, source_atom)] = bond
    bond_dists[(source_row, target_atom)] = dists
    bond_dists[(target_row, source_atom)] = dists

bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col

print('Counting and condensing bonds')

bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
bond_lengths_mean = [ np.mean(x) for x in bond_lengths]
n_bonds = [len(x) for x in bonds_numeric]

bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean }
bond_df = pd.DataFrame(bond_data)
structures = structures.join(bond_df)
display(structures.head(10))

{'H': 0.43, 'C': 0.8200000000000001, 'N': 0.8, 'O': 0.78, 'F': 0.76}


HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))


Calculating bonds


HBox(children=(IntProgress(value=0, max=27), HTML(value='')))


Counting and condensing bonds


HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,n_bonds,bond_lengths_mean
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.8,3,1.017195
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.43,1,1.01719
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.43,1,1.017187
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.43,1,1.017208
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.78,2,0.962107


In [7]:
trn_geom.drop(['id', 'scalar_coupling_constant', 'num_atoms', 'type'], axis=1, inplace=True)

In [8]:
trn_geom.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,flatness_metric,bond_angle_plane,bond_angle_axis
0,dsgdb9nsd_000001,1,0,0.443763,52.084336,34.460723
1,dsgdb9nsd_000001,1,2,0.443763,73.351369,0.804151
2,dsgdb9nsd_000001,1,3,0.443763,39.073646,44.696643
3,dsgdb9nsd_000001,1,4,0.443763,20.124275,44.160491
4,dsgdb9nsd_000001,2,0,0.443763,50.858953,36.069061


In [9]:
tst_geom.drop(['id', 'num_atoms', 'type'], axis=1, inplace=True)

In [10]:
tst_geom.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,flatness_metric,bond_angle_plane,bond_angle_axis
0,dsgdb9nsd_000004,2,0,0.0,0.0,90.0
1,dsgdb9nsd_000004,2,1,0.0,0.0,90.0
2,dsgdb9nsd_000004,2,3,0.0,0.0,90.0
3,dsgdb9nsd_000004,3,0,0.0,0.0,90.0
4,dsgdb9nsd_000004,3,1,0.0,0.0,90.0


In [11]:
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,n_bonds,bond_lengths_mean
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948


In [12]:
trn_ob.drop(['Unnamed: 0', 'error'], axis=1, inplace=True)

In [13]:
tst_ob.drop(['Unnamed: 0', 'error'], axis=1, inplace=True)

In [14]:
tst_ob.head(10)

Unnamed: 0,molecule_name,atom_index,eem,mmff94,gasteiger,qeq,qtpie,eem2015ha,eem2015hm,eem2015hn,eem2015ba,eem2015bm,eem2015bn
0,dsgdb9nsd_000004,0,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054
1,dsgdb9nsd_000004,1,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054
2,dsgdb9nsd_000004,2,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054
3,dsgdb9nsd_000004,3,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054
4,dsgdb9nsd_000015,0,-0.200372,0.28,-0.018924,-0.438607,0.348798,0.462935,-0.319256,-0.257585,0.347919,-0.405719,-0.346147
5,dsgdb9nsd_000015,1,-0.459876,-0.56,-0.486559,0.831692,-0.488505,-1.034775,-0.605457,-0.579823,-0.878321,-0.45034,-0.512405
6,dsgdb9nsd_000015,2,-0.200372,0.28,-0.018924,-0.438608,0.348799,0.462936,-0.319256,-0.257585,0.34792,-0.405719,-0.346147
7,dsgdb9nsd_000015,3,0.139347,0.0,0.087401,0.035403,-0.051906,0.010205,0.203472,0.17764,0.022974,0.20721,0.196381
8,dsgdb9nsd_000015,4,0.139342,0.0,0.087401,0.035459,-0.051938,0.010189,0.203467,0.177634,0.02296,0.207206,0.196376
9,dsgdb9nsd_000015,5,0.151621,0.0,0.087401,-0.0481,-0.000702,0.034058,0.215045,0.192222,0.045308,0.216473,0.209592


In [15]:
print('====', len(structures), len(trn_ob)+len(tst_ob), len(trn_ob), len(tst_ob))

==== 2358657 2358657 1533537 825120


In [16]:
structures_train = pd.merge(structures, trn_ob, on=['molecule_name', 'atom_index'])
structures_test = pd.merge(structures, tst_ob, on=['molecule_name', 'atom_index'])

In [17]:
len(structures_test)

825120

In [18]:
structures_test.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,n_bonds,bond_lengths_mean,eem,mmff94,gasteiger,qeq,qtpie,eem2015ha,eem2015hm,eem2015hn,eem2015ba,eem2015bm,eem2015bn
0,dsgdb9nsd_000004,0,C,0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054
1,dsgdb9nsd_000004,1,C,-0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054
2,dsgdb9nsd_000004,2,H,-1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054
3,dsgdb9nsd_000004,3,H,1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054
4,dsgdb9nsd_000015,0,C,-0.014821,1.392412,0.005671,2.55,0.82,4,1.175898,-0.200372,0.28,-0.018924,-0.438607,0.348798,0.462935,-0.319256,-0.257585,0.347919,-0.405719,-0.346147


In [19]:
trn_ob.columns

Index(['molecule_name', 'atom_index', 'eem', 'mmff94', 'gasteiger', 'qeq',
       'qtpie', 'eem2015ha', 'eem2015hm', 'eem2015hn', 'eem2015ba',
       'eem2015bm', 'eem2015bn'],
      dtype='object')

## Merge structures function

In [20]:
# def merge_train_structures(train, structures, dip_mom, pot_energ, train_geom):
def merge_train_structures(train, structures, train_geom):
  
    """This function is used to merge the structures dataset to the 
     original train dataset

     Parameters:
     ===========
       train: train dataframe
       structures: structures dataframe.

     Output:
       ===========
       dataframe: merged dataframe
     """

    structures = structures.rename({'atom_index': 'atom_index_0',
                                    'x':'x_0', 'y':'y_0', 'z':'z_0', 'atom':'atom_0',
                                    'eem':'eem_0', 'mmff94':'mmff94_0', 'gasteiger':'gasteiger_0', 
                                    'qeq':'qeq_0', 'qtpie':'qtpie_0', 'eem2015ha':'eem2015ha_0', 
                                    'eem2015hm':'eem2015hm_0', 'eem2015hn':'eem2015hn_0', 'eem2015ba':'eem2015ba_0',
                                    'eem2015bm':'eem2015bm_0', 'eem2015bn':'eem2015bn_0'}, axis=1)

    merged = pd.merge(train, structures, on=['molecule_name', 'atom_index_0'])

    structures = structures.rename({'atom_index_0': 'atom_index_1',
                                    'x_0':'x_1', 'y_0':'y_1', 'z_0':'z_1', 'atom_0':'atom_1',
                                    'eem_0':'eem_1', 'mmff94_0':'mmff94_1', 'gasteiger_0':'gasteiger_1', 
                                    'qeq_0':'qeq_1', 'qtpie_0':'qtpie_1', 'eem2015ha_0':'eem2015ha_1', 
                                    'eem2015hm_0':'eem2015hm_1', 'eem2015hn_0':'eem2015hn_1', 'eem2015ba_0':'eem2015ba_1',
                                    'eem2015bm_0':'eem2015bm_1', 'eem2015bn_0':'eem2015bn_1'}, axis=1)

    merged_1 = pd.merge(merged, structures, on=['molecule_name', 'atom_index_1'])

    structures = structures.rename({'atom_index_1': 'atom_index',
                                  'x_1':'x', 'y_1':'y', 'z_1':'z',
                                  'atom_1':'atom'}, axis=1)
    
#     dip_mom = dip_mom.rename({'X':'X_dipm', 'Y':'Y_dipm', 'Z':'Z_dipm'}, axis=1)
#     merged_2 = pd.merge(merged_1, dip_mom, on=['molecule_name',])
#     merged_3 = pd.merge(merged_2, pot_energ, on=['molecule_name',])
    
    merged_2 = pd.merge(merged_1, train_geom, on=['molecule_name', 'atom_index_0', 'atom_index_1'])
    
    assert train.shape[0] == merged.shape[0]

    return merged_2

In [21]:
merged = merge_train_structures(train, structures_train, trn_geom)

In [22]:
merged_test = merge_train_structures(test, structures_test, tst_geom)

In [23]:
len(merged_test) == len(test)

True

In [24]:
merged_test.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,C,0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0.0,0.0,90.0
1,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,C,0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0.0,0.0,90.0
2,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,C,-0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0.0,0.0,90.0
3,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,C,-0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0.0,0.0,90.0
4,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,H,1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,0.0,0.0,90.0
5,4658152,dsgdb9nsd_000015,3,0,1JHC,H,1.005284,1.810158,0.004656,2.2,0.43,1,1.102328,0.139347,0.0,0.087401,0.035403,-0.051906,0.010205,0.203472,0.17764,0.022974,0.20721,0.196381,C,-0.014821,1.392412,0.005671,2.55,0.82,4,1.175898,-0.200372,0.28,-0.018924,-0.438607,0.348798,0.462935,-0.319256,-0.257585,0.347919,-0.405719,-0.346147,0.396373,53.959413,3.242966
6,4658156,dsgdb9nsd_000015,4,0,1JHC,H,-0.546896,1.793435,-0.872511,2.2,0.43,1,1.102327,0.139342,0.0,0.087401,0.035459,-0.051938,0.010189,0.203467,0.177634,0.02296,0.207206,0.196376,C,-0.014821,1.392412,0.005671,2.55,0.82,4,1.175898,-0.200372,0.28,-0.018924,-0.438607,0.348798,0.462935,-0.319256,-0.257585,0.347919,-0.405719,-0.346147,0.396373,53.98408,3.218983
7,4658159,dsgdb9nsd_000015,5,0,1JHC,H,-0.530029,1.72292,0.911017,2.2,0.43,1,1.092852,0.151621,0.0,0.087401,-0.0481,-0.000702,0.034058,0.215045,0.192222,0.045308,0.216473,0.209592,C,-0.014821,1.392412,0.005671,2.55,0.82,4,1.175898,-0.200372,0.28,-0.018924,-0.438607,0.348798,0.462935,-0.319256,-0.257585,0.347919,-0.405719,-0.346147,0.396373,0.005499,51.26138
8,4658161,dsgdb9nsd_000015,6,0,3JHC,H,0.139938,-0.255993,-2.050984,2.2,0.43,1,1.102327,0.139346,0.0,0.087401,0.035407,-0.051908,0.010203,0.203472,0.177639,0.022972,0.207209,0.196381,C,-0.014821,1.392412,0.005671,2.55,0.82,4,1.175898,-0.200372,0.28,-0.018924,-0.438607,0.348798,0.462935,-0.319256,-0.257585,0.347919,-0.405719,-0.346147,0.396373,19.746778,65.364682
9,4658165,dsgdb9nsd_000015,7,0,3JHC,H,1.692653,-0.238684,-1.174777,2.2,0.43,1,1.102328,0.139343,0.0,0.087401,0.035454,-0.051936,0.010191,0.203468,0.177635,0.022961,0.207207,0.196377,C,-0.014821,1.392412,0.005671,2.55,0.82,4,1.175898,-0.200372,0.28,-0.018924,-0.438607,0.348798,0.462935,-0.319256,-0.257585,0.347919,-0.405719,-0.346147,0.396373,19.722896,65.361512


In [25]:
mulliken = mulliken_charges.rename({'atom_index': 'atom_index_0',
                          'mulliken_charge': 'mulliken_charge_0'}, axis=1)
merged = merged.merge(mulliken, on=['molecule_name', 'atom_index_0'])

mulliken = mulliken.rename({'atom_index_0': 'atom_index_1',
                          'mulliken_charge_0': 'mulliken_charge_1'}, axis=1)
merged = merged.merge(mulliken, on=['molecule_name', 'atom_index_1'])
len(merged) == len(train)

True

In [26]:
mulliken_charges_test.head()

Unnamed: 0,molecule_name,atom_index,mulliken_charge
0,dsgdb9nsd_000004,0,-0.382772
1,dsgdb9nsd_000004,1,-0.382772
2,dsgdb9nsd_000004,2,0.175373
3,dsgdb9nsd_000004,3,0.175373
4,dsgdb9nsd_000015,0,-0.227002


In [27]:
merged_test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,C,0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0.0,0.0,90.0
1,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,C,0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0.0,0.0,90.0
2,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,C,-0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0.0,0.0,90.0
3,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,C,-0.599539,0.0,1.0,2.55,0.82,2,1.130589,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0.0,0.0,90.0
4,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,H,1.661639,0.0,1.0,2.2,0.43,1,1.062099,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,0.0,0.0,90.0


In [28]:
test_mulliken = mulliken_charges_test.rename({'atom_index': 'atom_index_0',
                          'mulliken_charge': 'mulliken_charge_0'}, axis=1)
merged_test = merged_test.merge(test_mulliken, on=['molecule_name', 'atom_index_0'])

test_mulliken = test_mulliken.rename({'atom_index_0': 'atom_index_1',
                          'mulliken_charge_0': 'mulliken_charge_1'}, axis=1)
merged_test = merged_test.merge(test_mulliken, on=['molecule_name', 'atom_index_1'])

In [29]:
len(merged_test) == len(test)

True

## Feature Engineering

In [30]:
%%time
# This block is SPPED UP

train_p_0 = merged[['x_0', 'y_0', 'z_0']].values
train_p_1 = merged[['x_1', 'y_1', 'z_1']].values

test_p_0 = merged_test[['x_0', 'y_0', 'z_0']].values
test_p_1 = merged_test[['x_1', 'y_1', 'z_1']].values

merged['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
merged_test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
assert len(merged) == len(train)

CPU times: user 3.72 s, sys: 2.08 s, total: 5.79 s
Wall time: 2.05 s


In [31]:
len(merged_test) == len(test)

True

In [32]:
external_data_0 = pd.DataFrame(data={
    'atom_0': ['C', 'H', 'N', 'O', 'F'],
    'atom_0_en': [2.55, 2.20, 3.04, 3.44, 3.98],
    'atomic_mass_0': [12.0107, 1.00784, 14.0067, 15.999, 18.9984],
    'valence_electrons_0': [4, 1, 5, 6, 7]
})
external_data_1 = pd.DataFrame(data={
    'atom_1': ['C', 'H', 'N', 'O', 'F'],
    'atom_1_en': [2.55, 2.20, 3.04, 3.44, 3.98],
    'atomic_mass_1': [12.0107, 1.00784, 14.0067, 15.999, 18.9984],
    'valence_electrons_1': [4, 1, 5, 6, 7]
})

def create_features(df):
    df['bond'] = df['type'].str[2:]
    df['j_type'] = df['type'].str[:2]
    df['distance'] = ((df['x_1'].values - df['x_0'].values)**2 + 
                    (df['y_1'].values - df['y_0'].values)**2 + 
                    (df['z_1'].values - df['z_0'].values)**2)** 0.5
    df['mu_0'] = np.sqrt(df['x_0'].values**2 + df['y_0'].values**2 + df['z_0'].values**2)
    df['mu_1'] = np.sqrt(df['x_1'].values**2 + df['y_1'].values**2 + df['z_1'].values**2)
    df = df.merge(external_data_0, on='atom_0', how='left')
    df = df.merge(external_data_1, on='atom_1', how='left')
    df['delta_en'] = (df['atom_0_en'] - df['atom_1_en']).abs()
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    df['molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df['molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df['molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df['molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df['molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df['molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df['molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df['molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df['molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df['molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df['molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df['molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df['molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df['molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df['molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df['molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df['molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df['molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df['molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df['molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df['molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df['molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df['molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df['molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df['molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df['molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    
    df = reduce_mem_usage(df)
    return df

In [33]:
def map_atom_info(df_1,df_2, atom_idx):
    df = pd.merge(df_1, df_2, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)

    return df

def create_closest(df_train):
    #I apologize for my poor coding skill. Please make the better one.
    df_temp=df_train.loc[:,["molecule_name","atom_index_0","atom_index_1","dist","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
#     df_temp=pd.concat(objs=[df_temp,df_temp_],axis=0)

    df_temp["min_distance"]=df_temp.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df_temp= df_temp[df_temp["min_distance"]==df_temp["dist"]]

    df_temp=df_temp.drop(['x_0','y_0','z_0','min_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                     'atom_index_1': 'atom_index_closest',
                                     'distance': 'distance_closest',
                                     'x_1': 'x_closest',
                                     'y_1': 'y_closest',
                                     'z_1': 'z_closest'})

    for atom_idx in [0,1]:
        df_train = map_atom_info(df_train,df_temp, atom_idx)
        df_train = df_train.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                            'distance_closest': f'distance_closest_{atom_idx}',
                                            'x_closest': f'x_closest_{atom_idx}',
                                            'y_closest': f'y_closest_{atom_idx}',
                                            'z_closest': f'z_closest_{atom_idx}'})
    df_train = reduce_mem_usage(df_train)
    return df_train


In [34]:
merged = create_features(merged)
len(merged) == len(train)

Mem. usage decreased to 1003.97 Mb (70.3% reduction)


True

In [35]:
merged_test = create_features(merged_test)
len(merged_test) == len(test)

Mem. usage decreased to 535.24 Mb (70.2% reduction)


True

In [36]:
dtrain = create_closest(merged)

Mem. usage decreased to 1088.38 Mb (2.4% reduction)


In [37]:
dtrain_test = create_closest(merged_test)

Mem. usage decreased to 580.64 Mb (2.4% reduction)


In [38]:
len(dtrain) == len(train)

True

In [39]:
len(dtrain_test) == len(test)

True

In [40]:
def add_cos_features(df):
    df["distance_0"]=((df['x_0']-df['x_closest_0'])**2+(df['y_0']-df['y_closest_0'])**2+(df['z_0']-df['z_closest_0'])**2)**(1/2)
    df["distance_1"]=((df['x_1']-df['x_closest_1'])**2+(df['y_1']-df['y_closest_1'])**2+(df['z_1']-df['z_closest_1'])**2)**(1/2)
    df["vec_0_x"]=(df['x_0']-df['x_closest_0'])/df["distance_0"]
    df["vec_0_y"]=(df['y_0']-df['y_closest_0'])/df["distance_0"]
    df["vec_0_z"]=(df['z_0']-df['z_closest_0'])/df["distance_0"]
    df["vec_1_x"]=(df['x_1']-df['x_closest_1'])/df["distance_1"]
    df["vec_1_y"]=(df['y_1']-df['y_closest_1'])/df["distance_1"]
    df["vec_1_z"]=(df['z_1']-df['z_closest_1'])/df["distance_1"]
    df["vec_x"]=(df['x_1']-df['x_0'])/df["dist"]
    df["vec_y"]=(df['y_1']-df['y_0'])/df["dist"]
    df["vec_z"]=(df['z_1']-df['z_0'])/df["dist"]
    df["cos_0_1"]=df["vec_0_x"]*df["vec_1_x"]+df["vec_0_y"]*df["vec_1_y"]+df["vec_0_z"]*df["vec_1_z"]
    df["cos_0"]=df["vec_0_x"]*df["vec_x"]+df["vec_0_y"]*df["vec_y"]+df["vec_0_z"]*df["vec_z"]
    df["cos_1"]=df["vec_1_x"]*df["vec_x"]+df["vec_1_y"]*df["vec_y"]+df["vec_1_z"]*df["vec_z"]
    df=df.drop(['vec_0_x','vec_0_y','vec_0_z','vec_1_x','vec_1_y','vec_1_z','vec_x','vec_y','vec_z'], axis=1)
    df = reduce_mem_usage(df)
    return df
    
merged = add_cos_features(dtrain)

Mem. usage decreased to 1132.80 Mb (0.0% reduction)


In [41]:
len(merged) == len(train)

True

In [42]:
merged_test = add_cos_features(dtrain_test)

Mem. usage decreased to 604.54 Mb (0.0% reduction)


In [43]:
len(merged_test) == len(test)

True

In [44]:
merged.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atom_0_en,atomic_mass_0,valence_electrons_0,atom_1_en,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,dist_y,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,dist,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,52.09375,34.46875,0.133911,-0.535645,1.091797,HC,1J,1.091797,0.006702,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,4,4,1.358398,0.272949,1.463867,0.37793,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.0,1.0,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-7e-06,1.0,1.091797,1.091797,-7e-06,1.0,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,50.84375,36.0625,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.779297,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,3,4,1.324219,0.237915,1.447266,0.361816,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.0,1.0,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5e-06,1.0,1.091797,1.091797,-5e-06,1.0,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,13.90625,35.65625,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.776367,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,2,4,1.261719,0.176025,1.4375,0.352051,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,0.0,1.0,1.091797,3e-06,1.0,1.091797,7e-06,1.0,1.091797,0.0,1.0,1.091797,1.091797,0.0,1.0,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,13.125,34.875,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.77832,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,1,4,1.085938,0.0,1.085938,0.0,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,2e-06,1.0,1.091797,6e-06,1.0,1.091797,-1e-06,1.0,1.091797,1.091797,-1e-06,1.0,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,0.443848,73.375,0.804199,0.133911,0.133911,1.783203,HH,2J,1.783203,0.006702,1.779297,2.199219,1.007812,1,2.199219,1.007812,1,0.0,10,1.506836,1.091797,1.783203,4,1,1.358398,-0.10498,1.463867,0.0,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,-0.691406,0.612305,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,1.783203,0.0,1.0,0,1.091797,-0.012695,1.085938,0.008003,0.0,1.091797,-0.012695,1.085938,0.008003,1.091797,1.091797,-0.333496,-1.333008,1.333984


In [45]:
merged_test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atom_0_en,atomic_mass_0,valence_electrons_0,atom_1_en,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,dist_y,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,dist,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.662109,0.0,1.0,2.199219,0.429932,1,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,C,0.599609,0.0,1.0,2.550781,0.819824,2,1.130859,-0.140259,-0.177002,-0.195557,0.162109,-0.104797,0.186768,-0.244629,-0.126099,0.188354,-0.23291,-0.126099,0.0,0.0,90.0,0.175415,-0.382812,2.261719,HC,2J,2.261719,1.939453,1.166016,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,5,1.994141,1.0625,3.324219,3,2,0.0,0.0,0.0,0.0,2.214844,-0.045654,0.97998,3.324219,1.0625,1.469727,1.0625,-1.199219,0.469727,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,-1.199219,0.469727,1.662109,1.0625,-1.199219,0.469727,1,1.0625,-0.599609,0.0,1.0,,,,,,1.0625,,,,
1,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.662109,0.0,1.0,2.199219,0.429932,1,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,C,0.599609,0.0,1.0,2.550781,0.819824,2,1.130859,-0.140259,-0.177002,-0.195557,0.162109,-0.104797,0.186768,-0.244629,-0.126099,0.188354,-0.23291,-0.126099,0.0,0.0,90.0,0.175415,-0.382812,1.0625,HC,1J,1.0625,1.939453,1.166016,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,5,1.994141,1.0625,3.324219,2,2,0.0,0.0,0.0,0.0,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,0.0,1.0,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,0.0,1.0,1.662109,1.0625,0.0,1.0,0,1.0625,0.599609,0.0,1.0,,,,,,1.0625,,,,
2,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.662109,0.0,1.0,2.199219,0.429932,1,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,C,-0.599609,0.0,1.0,2.550781,0.819824,2,1.130859,-0.140259,-0.177002,-0.195557,0.162109,-0.104797,0.186768,-0.244629,-0.126099,0.188354,-0.23291,-0.126099,0.0,0.0,90.0,0.175415,-0.382812,1.0625,HC,1J,1.0625,1.939453,1.166016,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,5,1.994141,1.0625,3.324219,3,2,0.0,0.0,0.0,0.0,2.214844,1.15332,2.085938,3.324219,2.261719,3.128906,1.0625,0.0,1.0,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,0.0,1.0,1.662109,1.0625,0.0,1.0,1,1.0625,-0.599609,0.0,1.0,,,,,,1.0625,,,,
3,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.662109,0.0,1.0,2.199219,0.429932,1,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,C,-0.599609,0.0,1.0,2.550781,0.819824,2,1.130859,-0.140259,-0.177002,-0.195557,0.162109,-0.104797,0.186768,-0.244629,-0.126099,0.188354,-0.23291,-0.126099,0.0,0.0,90.0,0.175415,-0.382812,2.261719,HC,2J,2.261719,1.939453,1.166016,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,5,1.994141,1.0625,3.324219,2,2,0.0,0.0,0.0,0.0,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,-1.199219,0.469727,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,-1.199219,0.469727,1.662109,1.0625,-1.199219,0.469727,0,1.0625,0.599609,0.0,1.0,,,,,,1.0625,,,,
4,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.662109,0.0,1.0,2.199219,0.429932,1,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,H,1.662109,0.0,1.0,2.199219,0.429932,1,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,0.0,0.0,90.0,0.175415,0.175415,3.324219,HH,3J,3.324219,1.939453,1.939453,2.199219,1.007812,1,2.199219,1.007812,1,0.0,5,1.994141,1.0625,3.324219,3,1,0.0,0.0,0.0,0.0,2.214844,-1.107422,0.666504,3.324219,0.0,1.0,1.0625,-2.261719,0.31958,3.324219,0.0,1.0,3.324219,0.0,1.0,3.324219,0.0,1.0,3.324219,3.324219,0.0,1.0,1,1.0625,-0.599609,0.0,1.0,0.0,1.0625,0.599609,0.0,1.0,1.0625,1.0625,-1.0,-3.128906,3.128906


In [46]:
def add_extra_features(df):
    df['dip_cop_0'] = (3*(df['cos_0']**2)-1) / 2
    df['dip_cop_1'] = (3*(df['cos_1']**2)-1) / 2
    df['dip_cop_0_1'] = (3*(df['cos_0_1']**2)-1) / 2   
    df['dip_cop_plane'] = (3*(df['bond_angle_plane']**2)-1) / 200
    df['dip_cop_axis'] = (3*(df['bond_angle_axis']**2)-1) / 200
    df[['dip_cop_0', 'dip_cop_1', 'dip_cop_0_1']] = df[['dip_cop_0', 'dip_cop_1', 'dip_cop_0_1']].fillna(0)
    
    df = reduce_mem_usage(df)
    return df
    
merged = add_extra_features(merged)

Mem. usage decreased to 1177.22 Mb (0.0% reduction)


In [47]:
merged_test = add_extra_features(merged_test)

Mem. usage decreased to 628.43 Mb (0.0% reduction)


In [48]:
drop_feat = [
    'molecule_atom_index_0_dist_std_div',
    'atom_index_closest_1', 'dist', 'x_closest_1',
    'y_closest_1', 'z_closest_1', 'distance_1', 'mulliken_charge_0_x', 'mulliken_charge_0_y',
    'cos_0_1', 'cos_0', 'cos_1', 'EN_y', 'n_bonds_x',
    'molecule_atom_index_0_dist_min_diff',
    'molecule_atom_index_0_dist_min_div','rad_y', 'EN_x', 'rad_x',
    'dist_y', 'atom_1_en', 'atom_0_en', 'z_closest_0']

In [49]:
old_feats = merged.columns

In [50]:
old_feats_test = merged_test.columns

In [51]:
len(merged.columns)

114

In [52]:
new_cols = [f for f in old_feats if f not in drop_feat]

In [53]:
new_cols_test = [f for f in old_feats_test if f not in drop_feat]

In [54]:
len(new_cols)

94

In [55]:
len(new_cols_test)

93

In [56]:
st_new_cols_test = set(new_cols_test)
st_new_cols = set(new_cols)

In [59]:
st_new_cols - st_new_cols_test
# st_new_cols_test - st_new_cols

{'scalar_coupling_constant'}

In [60]:
merged_new = merged[new_cols]

In [61]:
merged_new_test = merged_test[new_cols_test]

In [62]:
merged_new.head(15)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atomic_mass_0,valence_electrons_0,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,x_closest_0,y_closest_0,distance_0,dip_cop_0,dip_cop_1,dip_cop_0_1,dip_cop_plane,dip_cop_axis
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00215,-0.006031,0.001976,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,52.09375,34.46875,0.133911,-0.535645,1.091797,HC,1J,1.091797,0.006702,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,4,4,1.358398,0.272949,1.463867,0.37793,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-6.67572e-06,1.0,1.091797,1.091797,-7e-06,1.0,0,-0.012695,1.085938,1.091797,0.0,0.0,0.0,40.71875,17.8125
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.011719,1.463867,0.000277,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,50.84375,36.0625,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.779297,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,3,4,1.324219,0.237915,1.447266,0.361816,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5.245209e-06,1.0,1.091797,1.091797,-5e-06,1.0,0,-0.012695,1.085938,1.091797,0.0,0.0,0.0,38.8125,19.515625
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,H,-0.541016,1.447266,-0.876465,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,13.90625,35.65625,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.776367,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,2,4,1.261719,0.176025,1.4375,0.352051,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,1.091797,3e-06,1.0,1.091797,7e-06,1.0,1.091797,0.0,1.0,1.091797,1.091797,0.0,1.0,0,-0.012695,1.085938,1.091797,0.0,0.0,0.0,2.894531,19.0625
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,H,-0.523926,1.4375,0.90625,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,13.125,34.875,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.77832,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,1,4,1.085938,0.0,1.085938,0.0,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,1.091797,2e-06,1.0,1.091797,6e-06,1.0,1.091797,-1.132488e-06,1.0,1.091797,1.091797,-1e-06,1.0,0,-0.012695,1.085938,1.091797,0.0,0.0,0.0,2.580078,18.234375
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,H,1.011719,1.463867,0.000277,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,0.443848,73.375,0.804199,0.133911,0.133911,1.783203,HH,2J,1.783203,0.006702,1.779297,1.007812,1,1.007812,1,0.0,10,1.506836,1.091797,1.783203,4,1,1.358398,-0.10498,1.463867,0.0,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,1.783203,0.0,1.0,0,-0.012695,1.085938,1.091797,2.166016,2.167969,-0.333252,80.75,0.004707
5,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,H,-0.541016,1.447266,-0.876465,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,0.443848,39.0625,44.6875,0.133911,0.133911,1.783203,HH,2J,1.783203,0.006702,1.776367,1.007812,1,1.007812,1,0.0,10,1.506836,1.091797,1.783203,4,2,1.358398,-0.088745,1.463867,0.01622,1.610352,-0.172852,0.90332,1.783203,9e-06,1.0,1.091797,1.783203,5e-06,1.0,1.783203,1e-05,1.0,1.783203,0.0,1.0,1.783203,1.783203,-2.8e-05,1.0,0,-0.012695,1.085938,1.091797,2.166016,2.166016,-0.333496,22.875,29.953125
6,5,dsgdb9nsd_000001,2,3,2JHH,-11.257812,H,1.011719,1.463867,0.000277,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,H,-0.541016,1.447266,-0.876465,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,0.443848,19.125,45.84375,0.133911,0.133911,1.783203,HH,2J,1.783203,1.779297,1.776367,1.007812,1,1.007812,1,0.0,10,1.506836,1.091797,1.783203,3,2,1.324219,-0.123779,1.447266,0.0,1.552734,-0.230347,0.870605,1.783203,0.0,1.0,1.091797,1.783203,-5e-06,1.0,1.783203,0.0,1.0,1.783203,-1.019239e-05,1.0,1.783203,1.783203,-3.8e-05,1.0,0,-0.012695,1.085938,1.091797,2.166016,2.167969,-0.333008,5.480469,31.515625
7,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,H,-0.523926,1.4375,0.90625,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,0.443848,20.125,44.15625,0.133911,0.133911,1.783203,HH,2J,1.783203,0.006702,1.77832,1.007812,1,1.007812,1,0.0,10,1.506836,1.091797,1.783203,4,3,1.358398,-0.079163,1.463867,0.025818,1.610352,-0.172852,0.90332,1.783203,0.0,1.0,1.091797,1.783203,-6e-06,1.0,1.783203,0.0,1.0,1.783203,-8.821487e-06,1.0,1.783203,1.783203,-3.7e-05,1.0,0,-0.012695,1.085938,1.091797,2.167969,2.162109,-0.333496,6.070312,29.234375
8,6,dsgdb9nsd_000001,2,4,2JHH,-11.257812,H,1.011719,1.463867,0.000277,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,H,-0.523926,1.4375,0.90625,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,0.443848,37.875,45.28125,0.133911,0.133911,1.783203,HH,2J,1.783203,1.779297,1.77832,1.007812,1,1.007812,1,0.0,10,1.506836,1.091797,1.783203,3,3,1.324219,-0.114197,1.447266,0.009598,1.552734,-0.230347,0.870605,1.783203,9e-06,1.0,1.091797,1.783203,3e-06,1.0,1.783203,8e-06,1.0,1.783203,-5.364418e-07,1.0,1.783203,1.783203,-2.9e-05,1.0,0,-0.012695,1.085938,1.091797,2.167969,2.167969,-0.333008,21.515625,30.765625
9,8,dsgdb9nsd_000001,3,4,2JHH,-11.257812,H,-0.541016,1.447266,-0.876465,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,H,-0.523926,1.4375,0.90625,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,0.443848,16.640625,0.382568,0.133911,0.133911,1.783203,HH,2J,1.783203,1.776367,1.77832,1.007812,1,1.007812,1,0.0,10,1.506836,1.091797,1.783203,2,3,1.261719,-0.176025,1.4375,0.0,1.4375,-0.345703,0.806152,1.783203,0.0,1.0,1.091797,1.783203,3e-06,1.0,1.783203,9e-06,1.0,1.783203,0.0,1.0,1.783203,1.783203,-2.8e-05,1.0,0,-0.012695,1.085938,1.091797,2.166016,2.167969,-0.333496,4.148438,-0.002806


In [63]:
merged_new_test.head(15)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atomic_mass_0,valence_electrons_0,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,x_closest_0,y_closest_0,distance_0,dip_cop_0,dip_cop_1,dip_cop_0_1,dip_cop_plane,dip_cop_axis
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.662109,0.0,1.0,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,C,0.599609,0.0,1.0,2,1.130859,-0.140259,-0.177002,-0.195557,0.162109,-0.104797,0.186768,-0.244629,-0.126099,0.188354,-0.23291,-0.126099,0.0,0.0,90.0,0.175415,-0.382812,2.261719,HC,2J,2.261719,1.939453,1.166016,1.007812,1,12.007812,4,0.350098,5,1.994141,1.0625,3.324219,3,2,0.0,0.0,0.0,0.0,2.214844,-0.045654,0.97998,3.324219,1.0625,1.469727,1.0625,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,-1.199219,0.469727,1.662109,1.0625,-1.199219,0.469727,1,-0.599609,0.0,1.0625,0.0,0.0,0.0,-0.005001,121.5
1,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.662109,0.0,1.0,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,C,0.599609,0.0,1.0,2,1.130859,-0.140259,-0.177002,-0.195557,0.162109,-0.104797,0.186768,-0.244629,-0.126099,0.188354,-0.23291,-0.126099,0.0,0.0,90.0,0.175415,-0.382812,1.0625,HC,1J,1.0625,1.939453,1.166016,1.007812,1,12.007812,4,0.350098,5,1.994141,1.0625,3.324219,2,2,0.0,0.0,0.0,0.0,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,0.0,1.0,1.662109,1.0625,0.0,1.0,0,0.599609,0.0,1.0625,0.0,0.0,0.0,-0.005001,121.5
2,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.662109,0.0,1.0,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,C,-0.599609,0.0,1.0,2,1.130859,-0.140259,-0.177002,-0.195557,0.162109,-0.104797,0.186768,-0.244629,-0.126099,0.188354,-0.23291,-0.126099,0.0,0.0,90.0,0.175415,-0.382812,1.0625,HC,1J,1.0625,1.939453,1.166016,1.007812,1,12.007812,4,0.350098,5,1.994141,1.0625,3.324219,3,2,0.0,0.0,0.0,0.0,2.214844,1.15332,2.085938,3.324219,2.261719,3.128906,1.0625,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,0.0,1.0,1.662109,1.0625,0.0,1.0,1,-0.599609,0.0,1.0625,0.0,0.0,0.0,-0.005001,121.5
3,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.662109,0.0,1.0,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,C,-0.599609,0.0,1.0,2,1.130859,-0.140259,-0.177002,-0.195557,0.162109,-0.104797,0.186768,-0.244629,-0.126099,0.188354,-0.23291,-0.126099,0.0,0.0,90.0,0.175415,-0.382812,2.261719,HC,2J,2.261719,1.939453,1.166016,1.007812,1,12.007812,4,0.350098,5,1.994141,1.0625,3.324219,2,2,0.0,0.0,0.0,0.0,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,-1.199219,0.469727,1.662109,1.0625,-1.199219,0.469727,0,0.599609,0.0,1.0625,0.0,0.0,0.0,-0.005001,121.5
4,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.662109,0.0,1.0,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,H,1.662109,0.0,1.0,1,1.0625,0.140259,0.177002,0.195557,-0.162109,0.104797,-0.186768,0.244629,0.126099,-0.188354,0.23291,0.126099,0.0,0.0,90.0,0.175415,0.175415,3.324219,HH,3J,3.324219,1.939453,1.939453,1.007812,1,1.007812,1,0.0,5,1.994141,1.0625,3.324219,3,1,0.0,0.0,0.0,0.0,2.214844,-1.107422,0.666504,3.324219,0.0,1.0,1.0625,3.324219,0.0,1.0,3.324219,0.0,1.0,3.324219,0.0,1.0,3.324219,3.324219,0.0,1.0,1,-0.599609,0.0,1.0625,14.1875,14.1875,1.0,-0.005001,121.5
5,4658152,dsgdb9nsd_000015,3,0,1JHC,H,1.004883,1.810547,0.004658,1.102539,0.139404,0.0,0.087402,0.0354,-0.05191,0.010208,0.203491,0.177612,0.02298,0.207153,0.196411,C,-0.014824,1.392578,0.005672,4,1.175781,-0.200317,0.280029,-0.018921,-0.438721,0.348877,0.462891,-0.319336,-0.257568,0.3479,-0.405762,-0.346191,0.396484,53.96875,3.242188,0.106445,-0.227051,1.102539,HC,1J,1.102539,2.070312,1.392578,1.007812,1,12.007812,4,0.350098,18,1.910156,1.092773,3.261719,4,6,1.088867,-0.303467,1.792969,0.401123,1.828125,0.725098,1.658203,2.640625,1.538086,2.394531,1.102539,1.973633,0.871094,1.790039,3.261719,2.160156,2.958984,1.092773,-0.009476,0.991211,1.973633,1.092773,-0.009476,0.991211,0,-0.014824,1.392578,1.101562,0.0,0.0,0.0,43.6875,0.15271
6,4658156,dsgdb9nsd_000015,4,0,1JHC,H,-0.546875,1.792969,-0.872559,1.102539,0.139282,0.0,0.087402,0.035461,-0.051941,0.010193,0.203491,0.177612,0.022964,0.207153,0.196411,C,-0.014824,1.392578,0.005672,4,1.175781,-0.200317,0.280029,-0.018921,-0.438721,0.348877,0.462891,-0.319336,-0.257568,0.3479,-0.405762,-0.346191,0.396484,53.96875,3.21875,0.106445,-0.227051,1.102539,HC,1J,1.102539,2.068359,1.392578,1.007812,1,12.007812,4,0.350098,18,1.910156,1.092773,3.261719,3,6,0.854004,-0.538574,1.722656,0.330566,1.842773,0.740234,1.670898,2.640625,1.537109,2.394531,1.102539,1.973633,0.871094,1.790039,3.261719,2.160156,2.958984,1.092773,-0.009476,0.991211,1.973633,1.092773,-0.009476,0.991211,0,-0.014824,1.392578,1.102539,0.0,0.0,0.0,43.6875,0.150391
7,4658159,dsgdb9nsd_000015,5,0,1JHC,H,-0.529785,1.722656,0.911133,1.092773,0.151611,0.0,0.087402,-0.048096,-0.000701,0.034058,0.215088,0.192261,0.045319,0.216431,0.209595,C,-0.014824,1.392578,0.005672,4,1.175781,-0.200317,0.280029,-0.018921,-0.438721,0.348877,0.462891,-0.319336,-0.257568,0.3479,-0.405762,-0.346191,0.396484,0.005501,51.25,0.132446,-0.227051,1.092773,HC,1J,1.092773,2.019531,1.392578,1.007812,1,12.007812,4,0.350098,18,1.910156,1.092773,3.261719,2,6,0.419678,-0.972656,1.392578,0.0,2.177734,1.084961,1.993164,3.261719,2.169922,2.986328,1.092773,1.973633,0.880371,1.805664,3.261719,2.169922,2.986328,1.092773,0.0,1.0,1.973633,1.092773,0.0,1.0,0,-0.014824,1.392578,1.092773,0.0,0.0,0.0,-0.005001,39.40625
8,4658161,dsgdb9nsd_000015,6,0,3JHC,H,0.139893,-0.256104,-2.050781,1.102539,0.139404,0.0,0.087402,0.0354,-0.05191,0.010201,0.203491,0.177612,0.02298,0.207153,0.196411,C,-0.014824,1.392578,0.005672,4,1.175781,-0.200317,0.280029,-0.018921,-0.438721,0.348877,0.462891,-0.319336,-0.257568,0.3479,-0.405762,-0.346191,0.396484,19.75,65.375,0.106445,-0.227051,2.640625,HC,3J,2.640625,2.072266,1.392578,1.007812,1,12.007812,4,0.350098,18,1.910156,1.092773,3.261719,4,6,-0.260254,-1.652344,1.392578,0.0,1.828125,-0.8125,0.692383,2.640625,0.0,1.0,1.102539,1.973633,-0.666992,0.747559,3.261719,0.62207,1.235352,1.092773,-1.547852,0.413818,1.973633,1.092773,-1.547852,0.413818,2,0.638184,-0.553223,1.102539,0.0,0.0,0.0,5.84375,64.0625
9,4658165,dsgdb9nsd_000015,7,0,3JHC,H,1.692383,-0.238647,-1.174805,1.102539,0.139282,0.0,0.087402,0.035461,-0.051941,0.010193,0.203491,0.177612,0.022964,0.207153,0.196411,C,-0.014824,1.392578,0.005672,4,1.175781,-0.200317,0.280029,-0.018921,-0.438721,0.348877,0.462891,-0.319336,-0.257568,0.3479,-0.405762,-0.346191,0.396484,19.71875,65.375,0.106445,-0.227051,2.640625,HC,3J,2.640625,2.074219,1.392578,1.007812,1,12.007812,4,0.350098,18,1.910156,1.092773,3.261719,3,6,-0.267578,-1.660156,1.392578,0.0,1.842773,-0.797363,0.697754,2.640625,0.0,1.0,1.102539,1.973633,-0.666504,0.747559,3.261719,0.622559,1.235352,1.092773,-1.546875,0.414062,1.973633,1.092773,-1.546875,0.414062,2,0.638184,-0.553223,1.102539,0.0,0.0,0.0,5.824219,64.0625


In [64]:
merged_new = merged_new.fillna(0)

In [65]:
train_dist = pd.read_csv(f'./data/train_distance_data.csv')
test_dist = pd.read_csv(f'./data/test_distance_data.csv')

In [66]:
train_dist = reduce_mem_usage(train_dist)

Mem. usage decreased to 653.03 Mb (75.2% reduction)


In [67]:
test_dist = reduce_mem_usage(test_dist)

Mem. usage decreased to 653.03 Mb (75.2% reduction)


In [68]:
train_dist.drop(['Unnamed: 0', 'molecule_index', 'scalar_coupling_constant', 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'], axis=1, inplace=True)
test_dist.drop(['Unnamed: 0', 'molecule_index', 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'], axis=1, inplace=True)

In [69]:
train_dist.head()

Unnamed: 0,id,atom_index_0,atom_index_1,atom_2,atom_3,atom_4,atom_5,atom_6,x_2,x_3,x_4,x_5,x_6,y_2,y_3,y_4,y_5,y_6,z_2,z_3,z_4,z_5,z_6,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,atom_7,atom_8,atom_9,x_7,x_8,x_9,y_7,y_8,y_9,z_7,z_8,z_9,d_7_0,d_7_1,d_7_2,d_7_3,d_8_0,d_8_1,d_8_2,d_8_3,d_9_0,d_9_1,d_9_2,d_9_3
0,10,1,0,1,1,0,0,0,0.916016,-0.520508,,,,1.358398,1.34375,,,,-0.028763,-0.775391,,,,1.017578,1.618164,1.017578,1.619141,1.017578,1.619141,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,13,2,0,1,1,0,0,0,0.017258,-0.520508,,,,0.012543,1.34375,,,,-0.027374,-0.775391,,,,1.017578,1.618164,1.017578,1.619141,1.017578,1.619141,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,15,3,0,1,1,0,0,0,0.916016,0.017258,,,,1.358398,0.012543,,,,-0.028763,-0.027374,,,,1.017578,1.619141,1.017578,1.619141,1.017578,1.618164,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,97,3,0,1,6,8,1,0,-0.908203,0.046478,1.072266,-0.961426,,1.827148,-0.011742,-0.652344,-0.475098,,0.018921,0.001204,-0.011131,0.008072,,1.007812,1.734375,1.004883,2.050781,1.359375,2.072266,2.548828,2.28125,3.173828,1.208984,2.960938,2.046875,2.302734,1.109375,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,101,4,0,1,6,1,8,0,0.825195,0.046478,-0.961426,1.072266,,1.884766,-0.011742,-0.475098,-0.652344,,0.003738,0.001204,0.008072,-0.011131,,1.004883,1.734375,1.007812,2.072266,1.359375,2.050781,2.302734,2.046875,2.960938,1.109375,3.173828,2.28125,2.548828,1.208984,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [70]:
merged_trn_dist = pd.merge(merged_new, train_dist, on=['id', 'atom_index_0', 'atom_index_1'], how='left')

In [71]:
merged_test_dist = pd.merge(merged_new_test, test_dist, on=['id', 'atom_index_0', 'atom_index_1'], how='left') 

In [72]:
merged_trn_dist.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atomic_mass_0,valence_electrons_0,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,...,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,x_closest_0,y_closest_0,distance_0,dip_cop_0,dip_cop_1,dip_cop_0_1,dip_cop_plane,dip_cop_axis,atom_2,atom_3,atom_4,atom_5,atom_6,x_2,x_3,x_4,x_5,x_6,y_2,y_3,y_4,y_5,y_6,z_2,z_3,z_4,z_5,z_6,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,atom_7,atom_8,atom_9,x_7,x_8,x_9,y_7,y_8,y_9,z_7,z_8,z_9,d_7_0,d_7_1,d_7_2,d_7_3,d_8_0,d_8_1,d_8_2,d_8_3,d_9_0,d_9_1,d_9_2,d_9_3
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00215,-0.006031,0.001976,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,52.09375,34.46875,0.133911,-0.535645,1.091797,HC,1J,1.091797,0.006702,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,4,4,1.358398,0.272949,1.463867,0.37793,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,1.091797,-3e-06,1.0,...,1.091797,1.091797,-7e-06,1.0,0,-0.012695,1.085938,1.091797,0.0,0.0,0.0,40.71875,17.8125,1,1,1,0,0,1.011719,-0.541016,-0.523926,,,1.463867,1.447266,1.4375,,,0.000277,-0.876465,0.90625,,,1.091797,1.783203,1.091797,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,,,,,,,,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.011719,1.463867,0.000277,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,50.84375,36.0625,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.779297,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,3,4,1.324219,0.237915,1.447266,0.361816,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,1.091797,-2e-06,1.0,...,1.091797,1.091797,-5e-06,1.0,0,-0.012695,1.085938,1.091797,0.0,0.0,0.0,38.8125,19.515625,1,1,1,0,0,0.00215,-0.523926,-0.541016,,,-0.006031,1.4375,1.447266,,,0.001976,0.90625,-0.876465,,,1.091797,1.783203,1.091797,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,,,,,,,,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,H,-0.541016,1.447266,-0.876465,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,13.90625,35.65625,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.776367,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,2,4,1.261719,0.176025,1.4375,0.352051,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,1.091797,3e-06,1.0,...,1.091797,1.091797,0.0,1.0,0,-0.012695,1.085938,1.091797,0.0,0.0,0.0,2.894531,19.0625,1,1,1,0,0,-0.523926,0.00215,1.011719,,,1.4375,-0.006031,1.463867,,,0.90625,0.001976,0.000277,,,1.091797,1.783203,1.091797,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,,,,,,,,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,H,-0.523926,1.4375,0.90625,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,C,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,13.125,34.875,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.77832,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,1,4,1.085938,0.0,1.085938,0.0,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,1.091797,2e-06,1.0,...,1.091797,1.091797,-1e-06,1.0,0,-0.012695,1.085938,1.091797,0.0,0.0,0.0,2.580078,18.234375,1,1,1,0,0,-0.541016,1.011719,0.00215,,,1.447266,1.463867,-0.006031,,,-0.876465,0.000277,0.001976,,,1.091797,1.783203,1.091797,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,,,,,,,,,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,H,1.011719,1.463867,0.000277,1,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,0.443848,73.375,0.804199,0.133911,0.133911,1.783203,HH,2J,1.783203,0.006702,1.779297,1.007812,1,1.007812,1,0.0,10,1.506836,1.091797,1.783203,4,1,1.358398,-0.10498,1.463867,0.0,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,1.783203,0.0,1.0,...,1.783203,1.783203,0.0,1.0,0,-0.012695,1.085938,1.091797,2.166016,2.167969,-0.333252,80.75,0.004707,6,1,1,0,0,-0.012695,-0.523926,-0.541016,,,1.085938,1.4375,1.447266,,,0.008003,0.90625,-0.876465,,,1.783203,1.091797,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,


In [78]:
merged_test_dist.drop('scalar_coupling_constant', axis=1, inplace=True)  # ['scalar_coupling_constant']

In [None]:
# merged_trn_dist.isna().any()

## Scoring Function

In [74]:
groups = merged['type']

def group_lmae(y_true, y_pred, groups, floor=1e-9):
    maes = (y_true - y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

## Splitting

In [102]:
trainval_1JHC = merged_trn_dist[merged_trn_dist['type'] == '1JHC']
trainval_1JHN = merged_trn_dist[merged_trn_dist['type'] == '1JHN']
trainval_2JHH = merged_trn_dist[merged_trn_dist['type'] == '2JHH']
trainval_2JHN = merged_trn_dist[merged_trn_dist['type'] == '2JHN']
trainval_2JHC = merged_trn_dist[merged_trn_dist['type'] == '2JHC']
trainval_3JHH = merged_trn_dist[merged_trn_dist['type'] == '3JHH']
trainval_3JHC = merged_trn_dist[merged_trn_dist['type'] == '3JHC']
trainval_3JHN = merged_trn_dist[merged_trn_dist['type'] == '3JHN']

In [103]:
test_1JHC = merged_test_dist[merged_test_dist['type'] == '1JHC']
test_1JHN = merged_test_dist[merged_test_dist['type'] == '1JHN']
test_2JHH = merged_test_dist[merged_test_dist['type'] == '2JHH']
test_2JHN = merged_test_dist[merged_test_dist['type'] == '2JHN']
test_2JHC = merged_test_dist[merged_test_dist['type'] == '2JHC']
test_3JHH = merged_test_dist[merged_test_dist['type'] == '3JHH']
test_3JHC = merged_test_dist[merged_test_dist['type'] == '3JHC']
test_3JHN = merged_test_dist[merged_test_dist['type'] == '3JHN']

In [104]:
def split_col_names(lst):
    return reduce(lambda x,y: x+y, [s.split('\t') for s in lst])

In [105]:
cols_to_del_1JHC = ['dip_cop_0	dip_cop_1	dip_cop_0_1', ]

In [106]:
trainval_1JHC.drop(split_col_names(cols_to_del_1JHC), axis=1, inplace=True)
test_1JHC.drop(split_col_names(cols_to_del_1JHC), axis=1, inplace=True)

In [107]:
# test_dist.drop(['Unnamed: 0', 'molecule_index', 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'], axis=1, inplace=True)

In [108]:
cols_to_del_1JHN = ['dip_cop_0	dip_cop_1	dip_cop_0_1', 'atom_7	atom_8	atom_9	x_7	x_8	x_9	y_7	y_8	y_9	z_7	z_8	z_9	d_7_0	d_7_1	d_7_2	d_7_3	d_8_0	d_8_1	d_8_2	d_8_3	d_9_0	d_9_1	d_9_2	d_9_3']

In [109]:
trainval_1JHN.drop(split_col_names(cols_to_del_1JHN), axis=1, inplace=True)
test_1JHN.drop(split_col_names(cols_to_del_1JHN), axis=1, inplace=True)

In [110]:
cols_to_del_2JHH = ['x_9', 'y_9', 'z_9', 'd_9_0	d_9_1	d_9_2	d_9_3']

In [111]:
trainval_2JHH.drop(split_col_names(cols_to_del_2JHH), axis=1, inplace=True)
test_2JHH.drop(split_col_names(cols_to_del_2JHH), axis=1, inplace=True)

In [112]:
cols_to_del_2JHN = ['dip_cop_0	dip_cop_1	dip_cop_0_1', 'x_9', 'y_9', 'z_9', 'd_9_0	d_9_1	d_9_2	d_9_3']

In [113]:
trainval_2JHN.drop(split_col_names(cols_to_del_2JHN), axis=1, inplace=True)
test_2JHN.drop(split_col_names(cols_to_del_2JHN), axis=1, inplace=True)

In [114]:
cols_to_del_2JHC = ['dip_cop_0	dip_cop_1	dip_cop_0_1', 'x_9', 'y_9', 'z_9', 'd_9_0	d_9_1	d_9_2	d_9_3']

In [115]:
trainval_2JHC.drop(split_col_names(cols_to_del_2JHC), axis=1, inplace=True)
test_2JHC.drop(split_col_names(cols_to_del_2JHC), axis=1, inplace=True)

In [116]:
cols_to_del_3JHH = ['x_9', 'y_9', 'z_9', 'd_9_0	d_9_1	d_9_2	d_9_3']

In [117]:
trainval_3JHH.drop(split_col_names(cols_to_del_3JHH), axis=1, inplace=True)
test_3JHH.drop(split_col_names(cols_to_del_3JHH), axis=1, inplace=True)

In [96]:
cols_to_del_3JHC = ['dip_cop_0	dip_cop_1	dip_cop_0_1',]

In [118]:
trainval_3JHC.drop(split_col_names(cols_to_del_3JHC), axis=1, inplace=True)
test_3JHC.drop(split_col_names(cols_to_del_3JHC), axis=1, inplace=True)

In [119]:
cols_to_del_3JHN = ['dip_cop_0	dip_cop_1	dip_cop_0_1',]

In [120]:
trainval_3JHN.drop(split_col_names(cols_to_del_3JHN), axis=1, inplace=True)
test_3JHN.drop(split_col_names(cols_to_del_3JHN), axis=1, inplace=True)

In [122]:
trainval_1JHC.tail(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atomic_mass_0,valence_electrons_0,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,...,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,x_closest_0,y_closest_0,distance_0,dip_cop_plane,dip_cop_axis,atom_2,atom_3,atom_4,atom_5,atom_6,x_2,x_3,x_4,x_5,x_6,y_2,y_3,y_4,y_5,y_6,z_2,z_3,z_4,z_5,z_6,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,atom_7,atom_8,atom_9,x_7,x_8,x_9,y_7,y_8,y_9,z_7,z_8,z_9,d_7_0,d_7_1,d_7_2,d_7_3,d_8_0,d_8_1,d_8_2,d_8_3,d_9_0,d_9_1,d_9_2,d_9_3
4658063,4658057,dsgdb9nsd_133882,14,5,1JHC,116.125,H,-1.71582,0.056427,2.037109,1.09082,0.134888,0.0,0.098999,0.109375,-0.163818,0.037506,0.210693,0.175781,0.047668,0.215454,0.191772,C,-1.087891,-0.09137,1.158203,4,1.427734,0.044464,0.257812,0.051483,-1.021484,0.899414,0.334229,0.001105,0.045929,0.271729,-0.069458,0.000901,0.767578,0.000191,71.25,0.097961,-0.012283,1.09082,HC,1J,1.09082,2.664062,1.591797,1.007812,1,12.007812,4,0.350098,58,2.619141,1.082031,3.474609,7,5,0.095032,0.186401,1.547852,1.639648,2.650391,1.55957,2.429688,3.431641,2.339844,3.146484,1.09082,2.523438,1.432617,2.314453,...,1.09082,0.0,1.0,2.626953,1.082031,-0.008460999,0.992188,5,-1.087891,-0.09137,1.089844,-0.005001,76.1875,8,6,6,6,1,-1.421875,0.433105,-0.929688,-0.292236,0.847656,-1.317383,-0.393311,0.748047,-0.651855,-1.245117,0.347168,1.021484,-0.143311,-0.321777,1.554688,1.09082,2.199219,1.507812,2.419922,1.556641,2.179688,2.419922,1.556641,2.179688,2.125,2.845703,1.771484,1.47168,1.548828,2.916016,2.289062,2.572266,1.087891,1.0,6.0,6.0,-1.817383,0.344482,1.295898,0.987305,1.547852,0.750977,-0.723145,-0.265137,0.54834,2.916016,2.289062,2.572266,3.164062,3.431641,2.601562,3.421875,2.330078,3.431641,2.601562,3.421875,1.508789
4658069,4658069,dsgdb9nsd_133884,9,0,1JHC,103.625,H,-2.089844,1.407227,1.185547,1.09375,0.138306,0.0,0.068115,-0.203979,0.150757,0.005077,0.202393,0.176147,0.020813,0.208252,0.19397,C,-1.435547,1.363281,0.311035,4,1.320312,-0.199219,0.209595,-0.016083,0.466797,-0.400879,0.275635,-0.24292,-0.253418,0.186035,-0.292236,-0.314697,0.772461,55.625,33.125,0.101868,-0.276367,1.09375,HC,1J,1.09375,2.785156,2.003906,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,7,6,0.685059,-0.678223,2.259766,0.896973,2.371094,1.276367,2.167969,3.601562,2.507812,3.292969,1.09375,2.380859,1.287109,2.177734,...,1.09375,0.0,1.0,2.625,1.081055,-0.01229858,0.98877,0,-1.435547,1.363281,1.09375,46.40625,16.453125,1,7,6,6,1,-1.539062,-1.571289,-0.027069,-0.750977,-1.327148,2.259766,0.047943,0.74707,-0.602051,-0.814453,-0.30542,-0.491699,0.478516,0.666016,1.567383,1.09375,1.804688,1.09375,2.220703,1.546875,2.220703,2.277344,1.546875,2.277344,1.953125,2.46875,2.111328,3.123047,1.560547,2.378906,2.515625,3.605469,2.246094,6.0,1.0,6.0,-0.131958,-0.084534,1.416016,0.356934,1.110352,0.620605,-1.009766,-1.796875,0.938965,3.123047,2.111328,2.46875,1.560547,3.605469,2.515625,2.378906,2.246094,3.601562,3.013672,3.601562,3.361328
4658070,4658076,dsgdb9nsd_133884,10,0,1JHC,103.625,H,-1.539062,2.259766,-0.30542,1.09375,0.138306,0.0,0.068115,-0.203979,0.150757,0.005077,0.202393,0.176147,0.020813,0.208252,0.19397,C,-1.435547,1.363281,0.311035,4,1.320312,-0.199219,0.209595,-0.016083,0.466797,-0.400879,0.275635,-0.24292,-0.253418,0.186035,-0.292236,-0.314697,0.772461,55.625,33.125,0.101868,-0.276367,1.09375,HC,1J,1.09375,2.751953,2.003906,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,6,6,0.422363,-0.940918,1.363281,0.0,2.464844,1.371094,2.253906,3.601562,2.507812,3.292969,1.09375,2.380859,1.287109,2.177734,...,1.09375,-1.788139e-07,1.0,2.625,1.081055,-0.01229858,0.98877,0,-1.435547,1.363281,1.092773,46.40625,16.453125,1,7,6,6,1,-2.089844,-1.571289,-0.027069,-0.131958,-0.084534,1.407227,0.047943,0.74707,0.356934,1.110352,1.185547,-0.491699,0.478516,-1.009766,-1.796875,1.09375,1.804688,1.09375,2.220703,1.546875,2.220703,2.277344,1.546875,2.277344,1.953125,2.46875,2.111328,3.123047,1.560547,2.378906,2.515625,3.605469,2.246094,6.0,1.0,6.0,-0.750977,-1.327148,1.416016,-0.602051,-0.814453,0.620605,0.666016,1.567383,0.938965,3.123047,2.111328,2.46875,1.560547,3.605469,2.515625,2.378906,2.246094,3.601562,3.013672,3.601562,3.361328
4658083,4658084,dsgdb9nsd_133884,11,2,1JHC,99.6875,H,-1.327148,-0.814453,1.567383,1.09082,0.11792,0.0,0.081238,0.008957,-0.036072,0.005108,0.188477,0.153931,0.018967,0.195679,0.17041,C,-0.750977,-0.602051,0.666016,4,1.425781,-0.016022,0.285645,-0.021439,-0.330811,0.280518,0.253174,-0.022583,-0.030777,0.190186,-0.074707,-0.071045,0.772461,69.4375,20.328125,0.067139,0.01004,1.09082,HC,1J,1.09082,2.208984,1.170898,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,10,9,-0.32373,0.27832,1.363281,1.96582,2.599609,1.509766,2.384766,3.359375,2.267578,3.080078,1.09082,2.794922,1.704102,2.5625,...,1.09082,0.0,1.0,2.625,1.081055,-0.009483337,0.991211,2,-0.750977,-0.602051,1.09082,72.3125,6.195312,6,7,6,6,1,0.326904,-1.571289,-0.027069,-1.435547,-2.089844,-1.553711,0.047943,0.74707,1.363281,1.407227,0.205688,-0.491699,0.478516,0.311035,1.185547,1.09082,2.265625,1.509766,2.246094,1.560547,2.580078,2.304688,1.542969,2.34375,1.953125,2.515625,2.111328,3.410156,1.546875,2.378906,2.46875,3.945312,2.220703,1.0,6.0,6.0,0.299561,-0.131958,1.416016,-2.630859,0.356934,0.620605,0.305664,-1.009766,0.938965,2.744141,2.3125,1.081055,3.363281,3.072266,2.027344,2.310547,1.560547,3.158203,2.503906,2.541016,3.361328
4658093,4658116,dsgdb9nsd_133884,14,5,1JHC,87.875,H,2.087891,1.391602,0.548828,1.094727,0.142944,0.0,0.04245,-0.308838,0.265381,0.007446,0.203491,0.180054,0.025406,0.207764,0.196289,C,1.416016,0.620605,0.938965,4,1.31543,-0.294678,0.094971,-0.036041,0.979004,-0.894531,0.00872,-0.380615,-0.362549,-0.030807,-0.37915,-0.394531,0.772461,54.625,3.597656,0.104858,-0.367676,1.094727,HC,1J,1.094727,2.568359,1.808594,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,9,9,-0.012024,-0.632812,1.363281,0.742676,2.632812,1.537109,2.404297,3.53125,2.4375,3.226562,1.094727,2.777344,1.683594,2.539062,...,1.094727,0.0,1.0,2.625,1.081055,-0.01329803,0.987793,5,1.416016,0.620605,1.094727,44.75,0.189209,1,6,6,1,6,1.542969,-0.027069,1.629883,2.582031,-0.131958,0.547363,0.74707,-0.74707,-1.259766,0.356934,2.023438,0.478516,0.235229,0.293457,-1.009766,1.094727,1.78418,1.094727,2.210938,1.519531,2.210938,2.208984,1.553711,2.208984,2.244141,2.708984,2.304688,2.708984,3.296875,2.902344,2.503906,3.470703,1.542969,6.0,6.0,6.0,0.787598,-0.750977,0.326904,-0.840332,-0.602051,-1.553711,-1.041992,0.666016,0.205688,3.033203,2.541016,3.449219,2.34375,3.470703,2.503906,2.902344,1.542969,3.449219,2.541016,3.033203,2.34375
4658094,4658125,dsgdb9nsd_133884,15,5,1JHC,87.875,H,1.542969,0.547363,2.023438,1.094727,0.142944,0.0,0.04245,-0.308838,0.265381,0.007446,0.203491,0.180054,0.025406,0.207764,0.196289,C,1.416016,0.620605,0.938965,4,1.31543,-0.294678,0.094971,-0.036041,0.979004,-0.894531,0.00872,-0.380615,-0.362549,-0.030807,-0.37915,-0.394531,0.772461,54.625,3.597656,0.104858,-0.367676,1.094727,HC,1J,1.094727,2.603516,1.808594,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,8,9,-0.08197,-0.702637,1.363281,0.742676,2.738281,1.643555,2.501953,3.53125,2.4375,3.226562,1.094727,2.777344,1.683594,2.539062,...,1.094727,-6.556511e-07,1.0,2.625,1.081055,-0.01329803,0.987793,5,1.416016,0.620605,1.09375,44.75,0.189209,1,6,6,1,6,2.087891,-0.027069,1.629883,2.582031,-0.750977,1.391602,0.74707,-0.74707,-1.259766,-0.602051,0.548828,0.478516,0.235229,0.293457,0.666016,1.094727,1.78418,1.094727,2.210938,1.519531,2.210938,2.208984,1.553711,2.208984,2.244141,2.708984,2.304688,2.708984,3.296875,2.902344,2.503906,3.470703,1.542969,6.0,6.0,6.0,0.326904,-0.131958,0.787598,-1.553711,0.356934,-0.840332,0.205688,-1.009766,-1.041992,3.033203,2.541016,3.449219,2.34375,3.470703,2.503906,2.902344,1.542969,3.449219,2.541016,3.033203,2.34375
4658113,4658136,dsgdb9nsd_133884,16,7,1JHC,99.6875,H,-0.084534,1.110352,-1.796875,1.09082,0.11792,0.0,0.081238,0.008965,-0.036072,0.005108,0.188477,0.153931,0.018967,0.195679,0.17041,C,-0.131958,0.356934,-1.009766,4,1.425781,-0.016022,0.285645,-0.021439,-0.330811,0.280518,0.253174,-0.022583,-0.030777,0.190186,-0.074707,-0.071045,0.772461,69.4375,20.328125,0.067139,0.01004,1.09082,HC,1J,1.09082,2.113281,1.079102,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,10,9,-0.195679,-0.552734,1.363281,1.005859,2.599609,1.509766,2.384766,3.359375,2.267578,3.080078,1.09082,2.794922,1.704102,2.5625,...,1.09082,0.0,1.0,2.625,1.081055,-0.009483337,0.991211,7,-0.131958,0.356934,1.09082,72.3125,6.195312,6,7,6,6,1,0.787598,-1.571289,-0.027069,-1.435547,-1.539062,-0.840332,0.047943,0.74707,1.363281,2.259766,-1.041992,-0.491699,0.478516,0.311035,-0.30542,1.09082,2.265625,1.509766,2.246094,1.560547,2.580078,2.304688,1.542969,2.34375,1.953125,2.515625,2.111328,3.410156,1.546875,2.378906,2.46875,3.945312,2.220703,1.0,6.0,6.0,1.126953,-0.750977,1.416016,-1.348633,-0.602051,0.620605,-1.933594,0.666016,0.938965,2.744141,2.3125,1.081055,3.363281,3.072266,2.027344,2.310547,1.560547,3.158203,2.503906,2.541016,3.361328
4658122,4658094,dsgdb9nsd_133884,12,3,1JHC,117.9375,H,0.299561,-2.630859,0.305664,1.081055,0.124573,0.099976,0.065552,-0.168091,0.125122,0.019287,0.192017,0.160034,0.033417,0.196045,0.173584,C,0.326904,-1.553711,0.205688,4,1.408203,-0.123352,-0.176025,-0.045807,0.349609,-0.296875,0.011826,-0.178467,-0.156372,-0.011955,-0.184814,-0.170898,0.772461,33.71875,36.53125,0.090759,-0.097168,1.081055,HC,1J,1.081055,2.664062,1.601562,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,10,7,-0.458008,1.09668,0.74707,2.300781,2.691406,1.610352,2.490234,3.496094,2.414062,3.232422,1.081055,2.537109,1.456055,2.347656,...,1.081055,0.0,1.0,2.625,1.081055,0.0,1.0,3,0.326904,-1.553711,1.082031,17.0625,20.015625,6,6,6,1,1,0.787598,-0.750977,1.629883,2.582031,1.126953,-0.840332,-0.602051,-0.74707,-1.259766,-1.348633,-1.041992,0.666016,0.235229,0.293457,-1.933594,1.081055,2.292969,1.509766,2.3125,1.509766,2.310547,2.306641,1.533203,1.533203,2.423828,2.662109,2.275391,2.275391,3.417969,2.708984,2.292969,1.081055,3.292969,1.0,6.0,6.0,-1.327148,-0.131958,-0.027069,-0.814453,0.356934,0.74707,1.567383,-1.009766,0.478516,2.744141,2.265625,3.359375,1.09082,3.292969,2.310547,1.509766,2.027344,3.396484,2.34375,2.34375,1.542969
4658131,4658104,dsgdb9nsd_133884,13,4,1JHC,107.0,H,2.582031,-1.259766,0.293457,1.083008,0.116272,0.099976,0.055267,-0.030075,-0.005348,0.010818,0.185059,0.151123,0.025406,0.190063,0.165283,C,1.629883,-0.74707,0.235229,4,1.425781,-0.099915,-0.194946,-0.056671,-0.265625,0.299805,0.004776,-0.173706,-0.134644,-0.013321,-0.184692,-0.150513,0.772461,0.000368,77.375,0.087646,-0.064209,1.083008,HC,1J,1.083008,2.888672,1.808594,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,10,7,-0.142822,0.604492,1.391602,2.138672,2.615234,1.532227,2.414062,3.417969,2.333984,3.154297,1.083008,2.371094,1.288086,2.189453,...,1.083008,0.0,1.0,2.625,1.081055,-0.00242424,0.997559,4,1.629883,-0.74707,1.083008,-0.005001,89.8125,6,6,6,1,1,0.787598,0.326904,1.416016,1.542969,2.087891,-0.840332,-1.553711,0.620605,0.547363,1.391602,-1.041992,0.205688,0.938965,2.023438,0.548828,1.083008,2.275391,1.533203,2.275391,1.533203,1.509766,2.304688,1.553711,2.541016,2.541016,2.708984,2.208984,3.449219,3.033203,2.708984,2.208984,3.033203,3.449219,1.0,1.0,6.0,0.299561,1.126953,-0.027069,-2.630859,-1.348633,0.74707,0.305664,-1.933594,0.478516,2.662109,2.306641,2.292969,1.081055,2.662109,2.306641,1.081055,2.292969,3.296875,2.244141,2.34375,2.34375
4658137,4658146,dsgdb9nsd_133884,17,8,1JHC,117.9375,H,1.126953,-1.348633,-1.933594,1.081055,0.124573,0.099976,0.065552,-0.168091,0.125122,0.019287,0.192017,0.160034,0.033417,0.196045,0.173584,C,0.787598,-0.840332,-1.041992,4,1.408203,-0.123352,-0.176025,-0.045807,0.349609,-0.296875,0.011826,-0.178467,-0.156372,-0.011955,-0.184814,-0.170898,0.772461,33.71875,36.53125,0.090759,-0.097168,1.081055,HC,1J,1.081055,2.613281,1.552734,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,8,7,-0.246338,0.59375,0.74707,1.586914,2.693359,1.611328,2.490234,3.496094,2.414062,3.232422,1.081055,2.537109,1.456055,2.347656,...,1.081055,0.0,1.0,2.625,1.081055,-1.192093e-07,1.0,8,0.787598,-0.840332,1.081055,17.0625,20.015625,6,6,6,1,1,0.326904,-0.131958,1.629883,2.582031,0.299561,-1.553711,0.356934,-0.74707,-1.259766,-2.630859,0.205688,-1.009766,0.235229,0.293457,0.305664,1.081055,2.292969,1.509766,2.3125,1.509766,2.310547,2.306641,1.533203,1.533203,2.423828,2.662109,2.275391,2.275391,3.417969,2.708984,2.292969,1.081055,3.292969,1.0,6.0,6.0,-0.084534,-0.750977,-0.027069,1.110352,-0.602051,0.74707,-1.796875,0.666016,0.478516,2.744141,2.265625,3.359375,1.09082,3.292969,2.310547,1.509766,2.027344,3.396484,2.34375,2.34375,1.542969


In [123]:
from sklearn.model_selection import train_test_split

In [124]:
train_1JHC, val_1JHC = train_test_split(trainval_1JHC, shuffle=False, random_state=47)

train_1JHC_molecules = train_1JHC['molecule_name'].unique()
val_1JHC_molecules = np.delete(val_1JHC['molecule_name'].unique(), 0)

train_1JHC = train_1JHC[train_1JHC['molecule_name'].isin(train_1JHC_molecules)]
val_1JHC = val_1JHC[val_1JHC['molecule_name'].isin(val_1JHC_molecules)]

###################################

train_1JHN, val_1JHN = train_test_split(trainval_1JHN, shuffle=False, random_state=47)

train_1JHN_molecules = train_1JHN['molecule_name'].unique()
val_1JHN_molecules = np.delete(val_1JHN['molecule_name'].unique(), 0)

train_1JHN = train_1JHN[train_1JHN['molecule_name'].isin(train_1JHN_molecules)]
val_1JHN = val_1JHN[val_1JHN['molecule_name'].isin(val_1JHN_molecules)]

#####################################

train_2JHH, val_2JHH = train_test_split(trainval_2JHH, shuffle=False, random_state=47)

train_2JHH_molecules = train_2JHH['molecule_name'].unique()
val_2JHH_molecules = np.delete(val_2JHH['molecule_name'].unique(), 0)

train_2JHH = train_2JHH[train_2JHH['molecule_name'].isin(train_2JHH_molecules)]
val_2JHH = val_2JHH[val_2JHH['molecule_name'].isin(val_2JHH_molecules)]

###################################

train_2JHN, val_2JHN = train_test_split(trainval_2JHN, shuffle=False, random_state=47)

train_2JHN_molecules = train_2JHN['molecule_name'].unique()
val_2JHN_molecules = np.delete(val_2JHN['molecule_name'].unique(), 0)

train_2JHN = train_2JHN[train_2JHN['molecule_name'].isin(train_2JHN_molecules)]
val_2JHN = val_2JHN[val_2JHN['molecule_name'].isin(val_2JHN_molecules)]

####################################

train_2JHC, val_2JHC = train_test_split(trainval_2JHC, shuffle=False, random_state=47)

train_2JHC_molecules = train_2JHC['molecule_name'].unique()
val_2JHC_molecules = np.delete(val_2JHC['molecule_name'].unique(), 0)

train_2JHC = train_2JHC[train_2JHC['molecule_name'].isin(train_2JHC_molecules)]
val_2JHC = val_2JHC[val_2JHC['molecule_name'].isin(val_2JHC_molecules)]

####################################

train_3JHH, val_3JHH = train_test_split(trainval_3JHH, shuffle=False, random_state=47)

train_3JHH_molecules = train_3JHH['molecule_name'].unique()
val_3JHH_molecules = np.delete(val_3JHH['molecule_name'].unique(), 0)

train_3JHH = train_3JHH[train_3JHH['molecule_name'].isin(train_3JHH_molecules)]
val_3JHH = val_3JHH[val_3JHH['molecule_name'].isin(val_3JHH_molecules)]

#######################################

train_3JHC, val_3JHC = train_test_split(trainval_3JHC, shuffle=False, random_state=47)

train_3JHC_molecules = train_3JHC['molecule_name'].unique()
val_3JHC_molecules = np.delete(val_3JHC['molecule_name'].unique(), 0)

train_3JHC = train_3JHC[train_3JHC['molecule_name'].isin(train_3JHC_molecules)]
val_3JHC = val_3JHC[val_3JHC['molecule_name'].isin(val_3JHC_molecules)]

#######################################

train_3JHN, val_3JHN = train_test_split(trainval_3JHN, shuffle=False, random_state=47)

train_3JHN_molecules = train_3JHN['molecule_name'].unique()
val_3JHN_molecules = np.delete(val_3JHN['molecule_name'].unique(), 0)

train_3JHN = train_3JHN[train_3JHN['molecule_name'].isin(train_3JHN_molecules)]
val_3JHN = val_3JHN[val_3JHN['molecule_name'].isin(val_3JHN_molecules)]

#########################################

In [125]:
target = 'scalar_coupling_constant'

In [126]:
# split data
y_train_1JHC = train_1JHC[target]
y_train_1JHN = train_1JHN[target]
y_train_2JHH = train_2JHH[target]
y_train_2JHN = train_2JHN[target]
y_train_2JHC = train_2JHC[target]
y_train_3JHH = train_3JHH[target]
y_train_3JHC = train_3JHC[target]
y_train_3JHN = train_3JHN[target]

y_val_1JHC = val_1JHC[target]
y_val_1JHN = val_1JHN[target]
y_val_2JHH = val_2JHH[target]
y_val_2JHN = val_2JHN[target]
y_val_2JHC = val_2JHC[target]
y_val_3JHH = val_3JHH[target]
y_val_3JHC = val_3JHC[target]
y_val_3JHN = val_3JHN[target]

X_train_1JHC = train_1JHC.drop([target,], axis=1).fillna(0)
X_train_1JHN = train_1JHN.drop([target,], axis=1).fillna(0)
X_train_2JHH = train_2JHH.drop([target,], axis=1).fillna(0)
X_train_2JHN = train_2JHN.drop([target,], axis=1).fillna(0)
X_train_2JHC = train_2JHC.drop([target,], axis=1).fillna(0)
X_train_3JHH = train_3JHH.drop([target,], axis=1).fillna(0)
X_train_3JHC = train_3JHC.drop([target,], axis=1).fillna(0)
X_train_3JHN = train_3JHN.drop([target,], axis=1).fillna(0)

X_val_1JHC = val_1JHC.drop([target,], axis=1).fillna(0)
X_val_1JHN = val_1JHN.drop([target,], axis=1).fillna(0)
X_val_2JHH = val_2JHH.drop([target,], axis=1).fillna(0)
X_val_2JHN = val_2JHN.drop([target,], axis=1).fillna(0)
X_val_2JHC = val_2JHC.drop([target,], axis=1).fillna(0)
X_val_3JHH = val_3JHH.drop([target,], axis=1).fillna(0)
X_val_3JHC = val_3JHC.drop([target,], axis=1).fillna(0)
X_val_3JHN = val_3JHN.drop([target,], axis=1).fillna(0)

In [127]:
test_1JHC = test_1JHC.fillna(0)
test_1JHN = test_1JHN.fillna(0)
test_2JHH = test_2JHH.fillna(0)
test_2JHN = test_2JHN.fillna(0)
test_2JHC = test_2JHC.fillna(0)
test_3JHH = test_3JHH.fillna(0)
test_3JHC = test_3JHC.fillna(0)
test_3JHN = test_3JHN.fillna(0)

In [128]:
print(len(y_train_1JHC) - len(X_train_1JHC))

0


## Encoding

In [129]:
def encode(df):
  df = OrdinalEncoder.fit_transform(OrdinalEncoder(df), df)
  return df

def imp(df):
  df = SimpleImputer.fit_transform(SimpleImputer(df), df)
  return df

In [130]:
X_train_1JHC_encoded = encode(X_train_1JHC)
X_train_1JHN_encoded = encode(X_train_1JHN)
X_train_2JHH_encoded = encode(X_train_2JHH)
X_train_2JHC_encoded = encode(X_train_2JHC)
X_train_2JHN_encoded = encode(X_train_2JHN)
X_train_3JHH_encoded = encode(X_train_3JHH)
X_train_3JHC_encoded = encode(X_train_3JHC)
X_train_3JHN_encoded = encode(X_train_3JHN)

X_val_1JHC_encoded = encode(X_val_1JHC)
X_val_1JHN_encoded = encode(X_val_1JHN)
X_val_2JHH_encoded = encode(X_val_2JHH)
X_val_2JHN_encoded = encode(X_val_2JHN)
X_val_2JHC_encoded = encode(X_val_2JHC)
X_val_3JHH_encoded = encode(X_val_3JHH)
X_val_3JHC_encoded = encode(X_val_3JHC)
X_val_3JHN_encoded = encode(X_val_3JHN)

test_1JHC_encoded = encode(test_1JHC)
test_1JHN_encoded = encode(test_1JHN)
test_2JHH_encoded = encode(test_2JHH)
test_2JHN_encoded = encode(test_2JHN)
test_2JHC_encoded = encode(test_2JHC)
test_3JHH_encoded = encode(test_3JHH)
test_3JHC_encoded = encode(test_3JHC)
test_3JHN_encoded = encode(test_3JHN)

#full data
X_trainval_1JHC_encoded = encode(trainval_1JHC.drop([target,], axis=1).fillna(0))
X_trainval_1JHN_encoded = encode(trainval_1JHN.drop([target,], axis=1).fillna(0))
X_trainval_2JHH_encoded = encode(trainval_2JHH.drop([target,], axis=1).fillna(0))
X_trainval_2JHC_encoded = encode(trainval_2JHC.drop([target,], axis=1).fillna(0))
X_trainval_2JHN_encoded = encode(trainval_2JHN.drop([target,], axis=1).fillna(0))
X_trainval_3JHH_encoded = encode(trainval_3JHH.drop([target,], axis=1).fillna(0))
X_trainval_3JHC_encoded = encode(trainval_3JHC.drop([target,], axis=1).fillna(0))
X_trainval_3JHN_encoded = encode(trainval_3JHN.drop([target,], axis=1).fillna(0))

In [131]:
y_1JHC = trainval_1JHC[target]
y_1JHN = trainval_1JHN[target]
y_2JHH = trainval_2JHH[target]
y_2JHN = trainval_2JHN[target]
y_2JHC = trainval_2JHC[target]
y_3JHH = trainval_3JHH[target]
y_3JHC = trainval_3JHC[target]
y_3JHN = trainval_3JHN[target]

In [132]:
X_train_1JHC_encoded.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atomic_mass_0,valence_electrons_0,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,...,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,x_closest_0,y_closest_0,distance_0,dip_cop_plane,dip_cop_axis,atom_2,atom_3,atom_4,atom_5,atom_6,x_2,x_3,x_4,x_5,x_6,y_2,y_3,y_4,y_5,y_6,z_2,z_3,z_4,z_5,z_6,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,atom_7,atom_8,atom_9,x_7,x_8,x_9,y_7,y_8,y_9,z_7,z_8,z_9,d_7_0,d_7_1,d_7_2,d_7_3,d_8_0,d_8_1,d_8_2,d_8_3,d_9_0,d_9_1,d_9_2,d_9_3
0,0,1,1,0,1,1,0.00215,-0.006031,0.001976,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,1,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,52.09375,34.46875,0.133911,-0.535645,1.091797,1,1,1.091797,0.006702,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,4,4,1.358398,0.272949,1.463867,0.37793,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,1.091797,-3e-06,1.0,1.091797,...,1.091797,-7e-06,1.0,1.091797,1.091797,-7e-06,1.0,0,-0.012695,1.085938,1.091797,40.71875,17.8125,1,1,1,0,0,1.011719,-0.541016,-0.523926,0.0,0.0,1.463867,1.447266,1.4375,0.0,0.0,0.000277,-0.876465,0.90625,0.0,0.0,1.091797,1.783203,1.091797,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,1,2,0,1,1,1.011719,1.463867,0.000277,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,1,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,50.84375,36.0625,0.133911,-0.535645,1.091797,1,1,1.091797,1.779297,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,3,4,1.324219,0.237915,1.447266,0.361816,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,1.091797,-2e-06,1.0,1.091797,...,1.091797,-5e-06,1.0,1.091797,1.091797,-5e-06,1.0,0,-0.012695,1.085938,1.091797,38.8125,19.515625,1,1,1,0,0,0.00215,-0.523926,-0.541016,0.0,0.0,-0.006031,1.4375,1.447266,0.0,0.0,0.001976,0.90625,-0.876465,0.0,0.0,1.091797,1.783203,1.091797,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,1,3,0,1,1,-0.541016,1.447266,-0.876465,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,1,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,13.90625,35.65625,0.133911,-0.535645,1.091797,1,1,1.091797,1.776367,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,2,4,1.261719,0.176025,1.4375,0.352051,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,1.091797,3e-06,1.0,1.091797,...,1.091797,0.0,1.0,1.091797,1.091797,0.0,1.0,0,-0.012695,1.085938,1.091797,2.894531,19.0625,1,1,1,0,0,-0.523926,0.00215,1.011719,0.0,0.0,1.4375,-0.006031,1.463867,0.0,0.0,0.90625,0.001976,0.000277,0.0,0.0,1.091797,1.783203,1.091797,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,1,4,0,1,1,-0.523926,1.4375,0.90625,1.091797,0.161133,0.0,0.019394,-0.812988,0.773438,-0.003651,0.203247,0.196289,0.01683,0.201538,0.212769,1,-0.012695,1.085938,0.008003,4,1.091797,-0.644531,0.0,-0.077576,3.251953,-3.09375,0.014603,-0.812988,-0.785156,-0.067322,-0.806152,-0.851074,0.443848,13.125,34.875,0.133911,-0.535645,1.091797,1,1,1.091797,1.77832,1.085938,1.007812,1,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,1,4,1.085938,0.0,1.085938,0.0,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,1.091797,2e-06,1.0,1.091797,...,1.091797,-1e-06,1.0,1.091797,1.091797,-1e-06,1.0,0,-0.012695,1.085938,1.091797,2.580078,18.234375,1,1,1,0,0,-0.541016,1.011719,0.00215,0.0,0.0,1.447266,1.463867,-0.006031,0.0,0.0,-0.876465,0.000277,0.001976,0.0,0.0,1.091797,1.783203,1.091797,1.783203,1.091797,1.783203,1.783203,1.091797,1.783203,1.783203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,17,2,2,0,1,1,-0.027802,2.199219,0.014153,1.066406,0.141846,0.177002,0.376953,0.025894,-0.097473,0.071045,0.259033,0.136963,0.052917,0.251953,0.144287,1,-0.013321,1.132812,0.008278,2,1.109375,0.141846,0.380127,0.066223,-0.450439,0.366699,0.884766,-0.130005,0.123596,0.727539,-0.220947,0.09137,0.0,0.0,0.0,0.238159,-0.049652,1.066406,1,1,1.066406,2.199219,1.132812,1.007812,1,12.007812,4,0.350098,2,1.642578,1.066406,2.21875,2,1,0.556641,-0.575684,1.132812,0.0,1.642578,0.575684,1.540039,2.21875,1.151367,2.080078,1.066406,1.066406,0.0,1.0,1.066406,...,1.066406,0.0,1.0,1.066406,1.066406,0.0,1.0,0,-0.013321,1.132812,1.066406,0.0,0.0,7,0,0,0,0,0.00231,0.0,0.0,0.0,0.0,-0.019165,0.0,0.0,0.0,0.0,0.001928,0.0,0.0,0.0,0.0,1.066406,2.21875,1.151367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
X_trainval_1JHC_encoded.tail()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atomic_mass_0,valence_electrons_0,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,...,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,x_closest_0,y_closest_0,distance_0,dip_cop_plane,dip_cop_axis,atom_2,atom_3,atom_4,atom_5,atom_6,x_2,x_3,x_4,x_5,x_6,y_2,y_3,y_4,y_5,y_6,z_2,z_3,z_4,z_5,z_6,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,atom_7,atom_8,atom_9,x_7,x_8,x_9,y_7,y_8,y_9,z_7,z_8,z_9,d_7_0,d_7_1,d_7_2,d_7_3,d_8_0,d_8_1,d_8_2,d_8_3,d_9_0,d_9_1,d_9_2,d_9_3
4658094,4658125,84747,15,5,1,1,1.542969,0.547363,2.023438,1.094727,0.142944,0.0,0.04245,-0.308838,0.265381,0.007446,0.203491,0.180054,0.025406,0.207764,0.196289,1,1.416016,0.620605,0.938965,4,1.31543,-0.294678,0.094971,-0.036041,0.979004,-0.894531,0.00872,-0.380615,-0.362549,-0.030807,-0.37915,-0.394531,0.772461,54.625,3.597656,0.104858,-0.367676,1.094727,1,1,1.094727,2.603516,1.808594,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,8,9,-0.08197,-0.702637,1.363281,0.742676,2.738281,1.643555,2.501953,3.53125,2.4375,3.226562,1.094727,2.777344,1.683594,2.539062,3.601562,...,1.094727,-6.556511e-07,1.0,2.625,1.081055,-0.01329803,0.987793,5,1.416016,0.620605,1.09375,44.75,0.189209,1,6,6,1,6,2.087891,-0.027069,1.629883,2.582031,-0.750977,1.391602,0.74707,-0.74707,-1.259766,-0.602051,0.548828,0.478516,0.235229,0.293457,0.666016,1.094727,1.78418,1.094727,2.210938,1.519531,2.210938,2.208984,1.553711,2.208984,2.244141,2.708984,2.304688,2.708984,3.296875,2.902344,2.503906,3.470703,1.542969,6.0,6.0,6.0,0.326904,-0.131958,0.787598,-1.553711,0.356934,-0.840332,0.205688,-1.009766,-1.041992,3.033203,2.541016,3.449219,2.34375,3.470703,2.503906,2.902344,1.542969,3.449219,2.541016,3.033203,2.34375
4658113,4658136,84747,16,7,1,1,-0.084534,1.110352,-1.796875,1.09082,0.11792,0.0,0.081238,0.008965,-0.036072,0.005108,0.188477,0.153931,0.018967,0.195679,0.17041,1,-0.131958,0.356934,-1.009766,4,1.425781,-0.016022,0.285645,-0.021439,-0.330811,0.280518,0.253174,-0.022583,-0.030777,0.190186,-0.074707,-0.071045,0.772461,69.4375,20.328125,0.067139,0.01004,1.09082,1,1,1.09082,2.113281,1.079102,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,10,9,-0.195679,-0.552734,1.363281,1.005859,2.599609,1.509766,2.384766,3.359375,2.267578,3.080078,1.09082,2.794922,1.704102,2.5625,3.470703,...,1.09082,0.0,1.0,2.625,1.081055,-0.009483337,0.991211,7,-0.131958,0.356934,1.09082,72.3125,6.195312,6,7,6,6,1,0.787598,-1.571289,-0.027069,-1.435547,-1.539062,-0.840332,0.047943,0.74707,1.363281,2.259766,-1.041992,-0.491699,0.478516,0.311035,-0.30542,1.09082,2.265625,1.509766,2.246094,1.560547,2.580078,2.304688,1.542969,2.34375,1.953125,2.515625,2.111328,3.410156,1.546875,2.378906,2.46875,3.945312,2.220703,1.0,6.0,6.0,1.126953,-0.750977,1.416016,-1.348633,-0.602051,0.620605,-1.933594,0.666016,0.938965,2.744141,2.3125,1.081055,3.363281,3.072266,2.027344,2.310547,1.560547,3.158203,2.503906,2.541016,3.361328
4658122,4658094,84747,12,3,1,1,0.299561,-2.630859,0.305664,1.081055,0.124573,0.099976,0.065552,-0.168091,0.125122,0.019287,0.192017,0.160034,0.033417,0.196045,0.173584,1,0.326904,-1.553711,0.205688,4,1.408203,-0.123352,-0.176025,-0.045807,0.349609,-0.296875,0.011826,-0.178467,-0.156372,-0.011955,-0.184814,-0.170898,0.772461,33.71875,36.53125,0.090759,-0.097168,1.081055,1,1,1.081055,2.664062,1.601562,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,10,7,-0.458008,1.09668,0.74707,2.300781,2.691406,1.610352,2.490234,3.496094,2.414062,3.232422,1.081055,2.537109,1.456055,2.347656,3.449219,...,1.081055,0.0,1.0,2.625,1.081055,0.0,1.0,3,0.326904,-1.553711,1.082031,17.0625,20.015625,6,6,6,1,1,0.787598,-0.750977,1.629883,2.582031,1.126953,-0.840332,-0.602051,-0.74707,-1.259766,-1.348633,-1.041992,0.666016,0.235229,0.293457,-1.933594,1.081055,2.292969,1.509766,2.3125,1.509766,2.310547,2.306641,1.533203,1.533203,2.423828,2.662109,2.275391,2.275391,3.417969,2.708984,2.292969,1.081055,3.292969,1.0,6.0,6.0,-1.327148,-0.131958,-0.027069,-0.814453,0.356934,0.74707,1.567383,-1.009766,0.478516,2.744141,2.265625,3.359375,1.09082,3.292969,2.310547,1.509766,2.027344,3.396484,2.34375,2.34375,1.542969
4658131,4658104,84747,13,4,1,1,2.582031,-1.259766,0.293457,1.083008,0.116272,0.099976,0.055267,-0.030075,-0.005348,0.010818,0.185059,0.151123,0.025406,0.190063,0.165283,1,1.629883,-0.74707,0.235229,4,1.425781,-0.099915,-0.194946,-0.056671,-0.265625,0.299805,0.004776,-0.173706,-0.134644,-0.013321,-0.184692,-0.150513,0.772461,0.000368,77.375,0.087646,-0.064209,1.083008,1,1,1.083008,2.888672,1.808594,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,10,7,-0.142822,0.604492,1.391602,2.138672,2.615234,1.532227,2.414062,3.417969,2.333984,3.154297,1.083008,2.371094,1.288086,2.189453,3.244141,...,1.083008,0.0,1.0,2.625,1.081055,-0.00242424,0.997559,4,1.629883,-0.74707,1.083008,-0.005001,89.8125,6,6,6,1,1,0.787598,0.326904,1.416016,1.542969,2.087891,-0.840332,-1.553711,0.620605,0.547363,1.391602,-1.041992,0.205688,0.938965,2.023438,0.548828,1.083008,2.275391,1.533203,2.275391,1.533203,1.509766,2.304688,1.553711,2.541016,2.541016,2.708984,2.208984,3.449219,3.033203,2.708984,2.208984,3.033203,3.449219,1.0,1.0,6.0,0.299561,1.126953,-0.027069,-2.630859,-1.348633,0.74707,0.305664,-1.933594,0.478516,2.662109,2.306641,2.292969,1.081055,2.662109,2.306641,1.081055,2.292969,3.296875,2.244141,2.34375,2.34375
4658137,4658146,84747,17,8,1,1,1.126953,-1.348633,-1.933594,1.081055,0.124573,0.099976,0.065552,-0.168091,0.125122,0.019287,0.192017,0.160034,0.033417,0.196045,0.173584,1,0.787598,-0.840332,-1.041992,4,1.408203,-0.123352,-0.176025,-0.045807,0.349609,-0.296875,0.011826,-0.178467,-0.156372,-0.011955,-0.184814,-0.170898,0.772461,33.71875,36.53125,0.090759,-0.097168,1.081055,1,1,1.081055,2.613281,1.552734,1.007812,1,12.007812,4,0.350098,78,2.609375,1.081055,3.601562,8,7,-0.246338,0.59375,0.74707,1.586914,2.693359,1.611328,2.490234,3.496094,2.414062,3.232422,1.081055,2.537109,1.456055,2.347656,3.449219,...,1.081055,0.0,1.0,2.625,1.081055,-1.192093e-07,1.0,8,0.787598,-0.840332,1.081055,17.0625,20.015625,6,6,6,1,1,0.326904,-0.131958,1.629883,2.582031,0.299561,-1.553711,0.356934,-0.74707,-1.259766,-2.630859,0.205688,-1.009766,0.235229,0.293457,0.305664,1.081055,2.292969,1.509766,2.3125,1.509766,2.310547,2.306641,1.533203,1.533203,2.423828,2.662109,2.275391,2.275391,3.417969,2.708984,2.292969,1.081055,3.292969,1.0,6.0,6.0,-0.084534,-0.750977,-0.027069,1.110352,-0.602051,0.74707,-1.796875,0.666016,0.478516,2.744141,2.265625,3.359375,1.09082,3.292969,2.310547,1.509766,2.027344,3.396484,2.34375,2.34375,1.542969


In [134]:
xgb_model_1JHC = XGBRegressor(nthread=7)
# xgb_model = XGBRegressor(gpu_id=0, max_bin=16, tree_method='gpu_hist')

In [135]:
xgb_model_1JHC.fit(X_train_1JHC_encoded, y_train_1JHC)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=7, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [147]:
# xgb.plot_importance(xgb_model_1JHC)
# # plt['figsize'] = [30, 50]
# plt.show()

In [181]:
len(xgb_model_1JHC.feature_importances_)

152

In [182]:
len(X_train_1JHC_encoded.columns)

152

In [223]:
print(dict((zip(list(X_train_1JHC_encoded.columns), list(xgb_model_1JHC.feature_importances_)))))

{'id': 0.0, 'molecule_name': 0.0, 'atom_index_0': 0.0, 'atom_index_1': 0.0, 'type': 0.0, 'atom_0': 0.0, 'x_0': 0.0, 'y_0': 0.0, 'z_0': 0.0, 'bond_lengths_mean_x': 0.058908045, 'eem_0': 0.0, 'mmff94_0': 0.024425287, 'gasteiger_0': 0.041666668, 'qeq_0': 0.0, 'qtpie_0': 0.0028735632, 'eem2015ha_0': 0.0, 'eem2015hm_0': 0.0057471264, 'eem2015hn_0': 0.0, 'eem2015ba_0': 0.0, 'eem2015bm_0': 0.033045977, 'eem2015bn_0': 0.0, 'atom_1': 0.0, 'x_1': 0.0, 'y_1': 0.0, 'z_1': 0.0, 'n_bonds_y': 0.0, 'bond_lengths_mean_y': 0.024425287, 'eem_1': 0.004310345, 'mmff94_1': 0.050287355, 'gasteiger_1': 0.012931035, 'qeq_1': 0.02586207, 'qtpie_1': 0.007183908, 'eem2015ha_1': 0.012931035, 'eem2015hm_1': 0.007183908, 'eem2015hn_1': 0.0028735632, 'eem2015ba_1': 0.007183908, 'eem2015bm_1': 0.0028735632, 'eem2015bn_1': 0.010057472, 'flatness_metric': 0.0, 'bond_angle_plane': 0.0, 'bond_angle_axis': 0.004310345, 'mulliken_charge_0': 0.07183908, 'mulliken_charge_1': 0.004310345, 'dist_x': 0.01724138, 'bond': 0.0, 'j_

In [184]:
imp_1JHC_mean = np.mean(xgb_model_1JHC.feature_importances_); imp_1JHC_mean

0.0065789474

In [221]:
# tmp_cols = [_[0] for _ in list(zip(list(X_train_1JHC_encoded.columns), list(xgb_model_1JHC.feature_importances_))) if _[1] > imp_1JHC_mean]
tmp_cols = [_[0] for _ in tuple(zip(list(X_train_1JHC_encoded.columns), list(xgb_model_1JHC.feature_importances_))) if _[1] > imp_1JHC_mean/2]

In [217]:
len(tmp_cols)

52

In [222]:
tmp_cols

['bond_lengths_mean_x',
 'mmff94_0',
 'gasteiger_0',
 'eem2015hm_0',
 'eem2015bm_0',
 'bond_lengths_mean_y',
 'eem_1',
 'mmff94_1',
 'gasteiger_1',
 'qeq_1',
 'qtpie_1',
 'eem2015ha_1',
 'eem2015hm_1',
 'eem2015ba_1',
 'eem2015bn_1',
 'bond_angle_axis',
 'mulliken_charge_0',
 'mulliken_charge_1',
 'dist_x',
 'molecule_dist_max',
 'molecule_atom_index_0_dist_mean',
 'molecule_atom_index_0_dist_max',
 'molecule_atom_index_0_dist_max_diff',
 'molecule_atom_index_0_dist_max_div',
 'molecule_atom_index_1_dist_max',
 'molecule_atom_index_1_dist_max_div',
 'molecule_atom_index_1_dist_min',
 'molecule_atom_index_1_dist_min_diff',
 'atom_3',
 'atom_5',
 'atom_6',
 'd_2_0',
 'd_2_1',
 'd_3_0',
 'd_3_1',
 'd_3_2',
 'd_4_0',
 'd_4_1',
 'd_4_2',
 'd_4_3',
 'd_5_0',
 'd_5_1',
 'd_5_2',
 'd_6_0',
 'd_6_1',
 'atom_9',
 'd_7_0',
 'd_7_2',
 'd_8_0',
 'd_9_0',
 'd_9_1',
 'd_9_2']

In [137]:
xgb_model_1JHC = XGBRegressor(nthread=8)
xgb_model_1JHN = XGBRegressor(nthread=8)
xgb_model_2JHH = XGBRegressor(nthread=8)
xgb_model_2JHN = XGBRegressor(nthread=8)
xgb_model_2JHC = XGBRegressor(nthread=8)
xgb_model_3JHH = XGBRegressor(nthread=8)
xgb_model_3JHN = XGBRegressor(nthread=8)
xgb_model_3JHC = XGBRegressor(nthread=8)

In [138]:
# xgb_model_1JHN.fit(X_train_1JHN_encoded, y_train_1JHN)
xgb_model_1JHC.fit(X_trainval_1JHC_encoded, y_1JHC)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=8, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [139]:
xgb_model_1JHN.fit(X_trainval_1JHN_encoded, y_1JHN)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=8, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [140]:
xgb_model_2JHH.fit(X_trainval_2JHH_encoded, y_2JHH)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=8, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [141]:
xgb_model_2JHN.fit(X_trainval_2JHN_encoded, y_2JHN)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=8, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [142]:
xgb_model_2JHC.fit(X_trainval_2JHC_encoded, y_2JHC)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=8, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [143]:
xgb_model_3JHH.fit(X_trainval_3JHH_encoded, y_3JHH)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=8, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [144]:
xgb_model_3JHN.fit(X_trainval_3JHN_encoded, y_3JHN)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=8, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [145]:
xgb_model_3JHC.fit(X_trainval_3JHC_encoded, y_3JHC)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=8, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [146]:
test_2JHH.tail()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,bond_lengths_mean_x,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_1,x_1,y_1,z_1,n_bonds_y,bond_lengths_mean_y,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,flatness_metric,bond_angle_plane,bond_angle_axis,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atomic_mass_0,valence_electrons_0,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,atom_index_closest_0,x_closest_0,y_closest_0,distance_0,dip_cop_0,dip_cop_1,dip_cop_0_1,dip_cop_plane,dip_cop_axis,atom_2,atom_3,atom_4,atom_5,atom_6,x_2,x_3,x_4,x_5,x_6,y_2,y_3,y_4,y_5,y_6,z_2,z_3,z_4,z_5,z_6,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,atom_7,atom_8,atom_9,x_7,x_8,y_7,y_8,z_7,z_8,d_7_0,d_7_1,d_7_2,d_7_3,d_8_0,d_8_1,d_8_2,d_8_3
2505417,7163562,dsgdb9nsd_133872,16,17,2JHH,H,-1.770508,-1.713867,-0.225098,1.091797,0.160156,0.0,0.048126,-0.510254,0.430908,0.038788,0.218262,0.200317,0.054688,0.219116,0.214722,H,-1.235352,-1.703125,1.491211,1,1.091797,0.156006,0.0,0.048126,-0.508789,0.442627,0.03186,0.216187,0.194824,0.047119,0.217651,0.209229,0.753906,57.5625,12.554688,0.111084,0.111328,1.797852,HH,2J,1.797852,2.474609,2.578125,1.007812,1,1.007812,1,0.0,67,2.513672,1.088867,3.601562,8,2,-0.327393,1.375,1.703125,3.40625,2.564453,0.76709,1.426758,3.548828,1.751953,1.974609,1.091797,2.261719,0.463623,1.257812,2.724609,0.927246,1.515625,1.797852,0.0,1.0,2.486328,1.780273,-0.017761,0.990234,8,-1.047852,-1.314453,1.09082,2.255859,2.259766,-0.310059,49.71875,2.359375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2505419,7163548,dsgdb9nsd_133872,14,15,2JHH,H,2.283203,-0.581055,0.627441,1.09375,0.154785,0.0,0.088623,-0.277588,0.182007,0.043671,0.220093,0.197388,0.056427,0.223022,0.213745,H,1.368164,-1.249023,2.0,1,1.095703,0.137329,0.0,0.088623,-0.224854,0.17395,0.01886,0.203857,0.175781,0.031982,0.208618,0.192383,0.753906,59.15625,16.921875,0.115051,0.113159,1.780273,HH,2J,1.780273,2.4375,2.726562,1.007812,1,1.007812,1,0.0,67,2.513672,1.088867,3.601562,7,2,-0.397949,0.851562,1.475586,2.724609,2.380859,0.601562,1.337891,3.412109,1.632812,1.916992,1.09375,2.208984,0.429932,1.241211,2.640625,0.859863,1.483398,1.780273,0.0,1.0,2.486328,1.780273,0.0,1.0,5,1.279297,-0.770996,1.09375,2.115234,2.115234,-0.344727,52.46875,4.289062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2505467,7163576,dsgdb9nsd_133883,9,10,2JHH,H,-0.837402,2.376953,1.262695,1.092773,0.143799,0.0,0.068848,-0.16272,0.107483,0.016739,0.210449,0.182861,0.030548,0.215576,0.200439,H,-2.265625,1.405273,0.73877,1,1.092773,0.143799,0.0,0.068848,-0.16272,0.107483,0.016739,0.210449,0.182861,0.030548,0.215576,0.200439,0.819336,90.0,0.000239,0.112061,0.112061,1.805664,HH,2J,1.805664,2.818359,2.765625,1.007812,1,1.007812,1,0.0,67,2.59375,1.081055,3.625,7,1,0.562988,-0.841797,1.415039,0.010254,2.373047,0.567871,1.314453,3.625,1.820312,2.007812,1.092773,1.805664,0.0,1.0,1.805664,0.0,1.0,1.805664,0.0,1.0,2.365234,1.789062,-0.016479,0.990723,0,-1.181641,1.415039,1.092773,2.292969,2.296875,-0.300293,121.5,-0.005001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2505487,7163628,dsgdb9nsd_133883,15,16,2JHH,H,1.160156,1.079102,-1.801758,1.092773,0.145996,0.0,0.082336,-0.157593,0.102356,0.020432,0.213989,0.185791,0.033844,0.219482,0.203491,H,-0.254639,0.115173,-2.320312,1,1.092773,0.145996,0.0,0.082336,-0.157593,0.102356,0.020432,0.213989,0.185791,0.033844,0.219482,0.203491,0.819336,90.0,0.000749,0.115356,0.115356,1.789062,HH,2J,1.789062,2.398438,2.337891,1.007812,1,1.007812,1,0.0,67,2.59375,1.081055,3.625,9,1,-0.086609,-0.201782,1.415039,1.299805,2.595703,0.806641,1.451172,3.574219,1.78418,1.99707,1.092773,1.789062,0.0,1.0,1.789062,0.0,1.0,1.789062,0.0,1.0,2.365234,1.789062,0.0,1.0,7,0.402344,0.359619,1.092773,2.191406,2.191406,-0.326172,121.5,-0.005001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2505521,7163642,dsgdb9nsd_133885,9,10,2JHH,H,-2.089844,1.327148,1.263672,1.091797,0.160278,0.0,0.078918,-0.380859,0.282471,0.040009,0.223999,0.202515,0.053802,0.226807,0.21875,H,-1.44043,2.287109,-0.127563,1,1.091797,0.160278,0.0,0.078918,-0.380859,0.282471,0.040009,0.223999,0.202515,0.053802,0.226807,0.21875,0.675781,0.000214,3.4e-05,0.123779,0.123779,1.811523,HH,2J,1.811523,2.779297,2.705078,1.007812,1,1.007812,1,0.0,52,2.498047,1.080078,3.423828,6,1,0.667969,-1.619141,2.287109,0.0,2.171875,0.359619,1.198242,3.142578,1.331055,1.734375,1.091797,1.811523,0.0,1.0,1.811523,0.0,1.0,1.811523,0.0,1.0,2.589844,1.811523,0.0,1.0,0,-1.410156,1.336914,1.09082,2.335938,2.335938,-0.288574,-0.005001,-0.005001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Predict test

In [150]:
y_pred_test_1JHC = xgb_model_1JHC.predict(test_1JHC_encoded)
y_pred_test_1JHN = xgb_model_1JHN.predict(test_1JHN_encoded)
y_pred_test_2JHH = xgb_model_2JHH.predict(test_2JHH_encoded)
y_pred_test_2JHN = xgb_model_2JHN.predict(test_2JHN_encoded)
y_pred_test_2JHC = xgb_model_2JHC.predict(test_2JHC_encoded)
y_pred_test_3JHH = xgb_model_3JHH.predict(test_3JHH_encoded)
y_pred_test_3JHN = xgb_model_3JHN.predict(test_3JHN_encoded)
y_pred_test_3JHC = xgb_model_3JHC.predict(test_3JHC_encoded)

In [151]:
test_1JHC['preds'] = y_pred_test_1JHC
test_1JHN['preds'] = y_pred_test_1JHN
test_2JHH['preds'] = y_pred_test_2JHH
test_2JHN['preds'] = y_pred_test_2JHN
test_2JHC['preds'] = y_pred_test_2JHC
test_3JHH['preds'] = y_pred_test_3JHH
test_3JHC['preds'] = y_pred_test_3JHC
test_3JHN['preds'] = y_pred_test_3JHN

## Concat predicitions and submit

In [152]:
test_final = pd.concat([test_1JHC, test_1JHN, test_2JHH, test_2JHN,
                        test_2JHC, test_3JHH, test_3JHC, test_3JHN])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [153]:
test_final = test_final.sort_values(by=['id'])

In [154]:
submsn = pd.DataFrame(data={
    'id': test_final['id'],
    'scalar_coupling_constant': test_final['preds']
})

In [156]:
submsn.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,10.84255
2,4658148,169.181625
4,4658149,13.152798
1,4658150,172.655869
3,4658151,11.367233


In [157]:
submsn.to_csv('submissionscalar.csv', index=False)

In [158]:
from catboost import CatBoostRegressor, Pool, cv

In [159]:
cat_model_1JHC = CatBoostRegressor(task_type="GPU", devices='0:1')
cat_model_1JHN = CatBoostRegressor(task_type="GPU", devices='0:1')
cat_model_2JHH = CatBoostRegressor(task_type="GPU", devices='0:1')
cat_model_2JHN = CatBoostRegressor(task_type="GPU", devices='0:1')
cat_model_2JHC = CatBoostRegressor(task_type="GPU", devices='0:1')
cat_model_3JHH = CatBoostRegressor(task_type="GPU", devices='0:1')
cat_model_3JHN = CatBoostRegressor(task_type="GPU", devices='0:1')
cat_model_3JHC = CatBoostRegressor(task_type="GPU", devices='0:1')

In [160]:
cat_model_1JHC.fit(X_trainval_1JHC_encoded, y_1JHC, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a88a19860>

In [161]:
cat_model_1JHN.fit(X_trainval_1JHN_encoded, y_1JHN, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a88a198d0>

In [162]:
cat_model_2JHH.fit(X_trainval_2JHH_encoded, y_2JHH, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a88a199b0>

In [164]:
cat_model_2JHN.fit(X_trainval_2JHN_encoded, y_2JHN, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a88a199e8>

In [165]:
cat_model_2JHC.fit(X_trainval_2JHC_encoded, y_2JHC, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a88a19a20>

In [166]:
cat_model_3JHH.fit(X_trainval_3JHH_encoded, y_3JHH, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a88a19a58>

In [167]:
cat_model_3JHN.fit(X_trainval_3JHN_encoded, y_3JHN, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a88a19a90>

In [168]:
cat_model_3JHC.fit(X_trainval_3JHC_encoded, y_3JHC, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a88a19ac8>

In [169]:
cat_y_pred_test_1JHC = cat_model_1JHC.predict(test_1JHC_encoded)
cat_y_pred_test_1JHN = cat_model_1JHN.predict(test_1JHN_encoded)
cat_y_pred_test_2JHH = cat_model_2JHH.predict(test_2JHH_encoded)
cat_y_pred_test_2JHN = cat_model_2JHN.predict(test_2JHN_encoded)
cat_y_pred_test_2JHC = cat_model_2JHC.predict(test_2JHC_encoded)
cat_y_pred_test_3JHH = cat_model_3JHH.predict(test_3JHH_encoded)
cat_y_pred_test_3JHN = cat_model_3JHN.predict(test_3JHN_encoded)
cat_y_pred_test_3JHC = cat_model_3JHC.predict(test_3JHC_encoded)

In [171]:
test_1JHC['cat_preds'] = cat_y_pred_test_1JHC
test_1JHN['cat_preds'] = cat_y_pred_test_1JHN
test_2JHH['cat_preds'] = cat_y_pred_test_2JHH
test_2JHN['cat_preds'] = cat_y_pred_test_2JHN
test_2JHC['cat_preds'] = cat_y_pred_test_2JHC
test_3JHH['cat_preds'] = cat_y_pred_test_3JHH
test_3JHC['cat_preds'] = cat_y_pred_test_3JHC
test_3JHN['cat_preds'] = cat_y_pred_test_3JHN

In [172]:
cat_test_final = pd.concat([test_1JHC, test_1JHN, test_2JHH, test_2JHN,
                        test_2JHC, test_3JHH, test_3JHC, test_3JHN])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [173]:
cat_test_final = cat_test_final.sort_values(by=['id'])

In [174]:
cat_submsn = pd.DataFrame(data={
    'id': test_final['id'],
    'scalar_coupling_constant': test_final['preds']
})

In [175]:
cat_submsn.to_csv('cat_submissionscalar.csv', index=False)

In [None]:
# xgb.plot_importance(xgb_model_1JHN)
# plt.show()

In [None]:
xgb_model_2JHH = XGBRegressor(nthread=7)

In [None]:
xgb_model_2JHH.fit(X_train_2JHH_encoded, y_train_2JHH)

In [None]:
xgb.plot_importance(xgb_model_1JHN)
plt.show()

In [210]:
cat_model_2JHH_trn = CatBoostRegressor(task_type="GPU", devices='0:1')

In [211]:
cat_model_2JHH_trn.fit(X_train_2JHH_encoded, y_train_2JHH, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f8a1ab675c0>

In [212]:
cat_feats_imp = cat_model_2JHH_trn.get_feature_importance()

In [214]:
len(cat_feats_imp)

148

In [215]:
len(X_train_2JHH_encoded.columns)

148

In [None]:
categorical_features_indices =[0,1,3,4,5,6,7,8]

In [None]:
fea_imp = pd.DataFrame({'imp': cat_model_2JHH.feature_importances_, 'col': X_train_2JHH_encoded.columns})
fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
# fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 7), legend=None)
fea_imp.plot(kind='barh', x='col', y='imp', legend=None)
plt.title('CatBoost - Feature Importance')
plt.ylabel('Features')
plt.xlabel('Importance');

In [None]:
# cat_feats_imp

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
xgb_2JHH_pred = xgb_model_2JHH.predict(X_val_2JHH_encoded)

In [None]:
# y_val_2JHH
meanSquaredError=mean_squared_error(y_val_2JHH, xgb_2JHH_pred)
print("MSE:", meanSquaredError)
rootMeanSquaredError = sqrt(meanSquaredError)
print("RMSE:", rootMeanSquaredError)

In [None]:
cat_2JHH_pred = cat_model_2JHH.predict(X_val_2JHH_encoded)

In [None]:
# y_val_2JHH
meanSquaredError=mean_squared_error(y_val_2JHH, cat_2JHH_pred)
print("MSE:", meanSquaredError)
rootMeanSquaredError = sqrt(meanSquaredError)
print("RMSE:", rootMeanSquaredError)

In [None]:
"""
predict test data with default params usning catboost
plot correlation plot and test models for data without correlated columns
select best features 
tune selected features (Gaussian, PCA?)
hyperparams optimization
"""

In [None]:
# feature_score = pd.DataFrame(list(zip(X_train_2JHH_encoded.dtypes.index, 
#                                       cat_model_2JHH.get_feature_importance(Pool(X_train_2JHH_encoded, 
#                                                                                  label=y_train_2JHH, 
#                                                                                  cat_features=categorical_features_indices)))),
#                              columns=['Feature','Score'])

# feature_score = feature_score.sort_values(by='Score', ascending=False, inplace=False, kind='quicksort', na_position='last')