In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max.rows', 150)
pd.set_option('display.max.columns', 150)

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial
from tqdm import tqdm_notebook as tqdm

from category_encoders import OrdinalEncoder, OneHotEncoder
# import eli5
# from eli5.sklearn import PermutationImportance

from scipy.stats import randint, uniform

In [2]:
trainval = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
structures = pd.read_csv('./data/structures.csv')
dipole = pd.read_csv('./data/dipole_moments.csv')
contrib = pd.read_csv('./data/scalar_coupling_contributions.csv')
magnetic = pd.read_csv('./data/magnetic_shielding_tensors.csv')
mulliken = pd.read_csv('./data/mulliken_charges.csv')
potential_energy = pd.read_csv('./data/potential_energy.csv')
# test_mulliken = pd.read_csv('mulliken_charges_test_set.csv')

## Reduce Memory Function

In [3]:
def reduce_mem_usage(df, verbose=True):
    """
    This function reduces the numeric to the least possible numeric type that fits the data so 
    memory usage during transforming and training will be reduced.
    Taken from: https://www.kaggle.com/todnewman/keras-neural-net-for-champs
    
    Han
    Parameters:
    ===========
    dataframe: input dataframe 
    verbose: verbose mode, default True.
    Output:
    ===========
    dataframe: dataframe with numeric columns types changed to the least possible size
    """

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Create features on structures

In [4]:
# %%time
atomic_radius = {'H':0.38, 'C':0.77, 'N':0.75, 'O':0.73, 'F':0.71} # Without fudge factor

fudge_factor = 0.05
atomic_radius = {k:v + fudge_factor for k,v in atomic_radius.items()}
print(atomic_radius)

electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}

atoms = structures['atom'].values
atoms_en = [electronegativity[x] for x in tqdm(atoms)]
atoms_rad = [atomic_radius[x] for x in tqdm(atoms)]

structures['EN'] = atoms_en
structures['rad'] = atoms_rad

i_atom = structures['atom_index'].values
p = structures[['x', 'y', 'z']].values
p_compare = p
m = structures['molecule_name'].values
m_compare = m
r = structures['rad'].values
r_compare = r

source_row = np.arange(len(structures))
max_atoms = 28

bonds = np.zeros((len(structures)+1, max_atoms+1), dtype=np.int8)
bond_dists = np.zeros((len(structures)+1, max_atoms+1), dtype=np.float32)

print('Calculating bonds')

for i in tqdm(range(max_atoms-1)):
    p_compare = np.roll(p_compare, -1, axis=0)
    m_compare = np.roll(m_compare, -1, axis=0)
    r_compare = np.roll(r_compare, -1, axis=0)
    
    mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
    dists = np.linalg.norm(p - p_compare, axis=1) * mask
    r_bond = r + r_compare
    
    bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)
    
    source_row = source_row
    target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row) #If invalid target, write to dummy row
    
    source_atom = i_atom
    target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col
    
    bonds[(source_row, target_atom)] = bond
    bonds[(target_row, source_atom)] = bond
    bond_dists[(source_row, target_atom)] = dists
    bond_dists[(target_row, source_atom)] = dists

bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col

print('Counting and condensing bonds')

bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
bond_lengths_mean = [ np.mean(x) for x in bond_lengths]
n_bonds = [len(x) for x in bonds_numeric]

bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean }
bond_df = pd.DataFrame(bond_data)
structures = structures.join(bond_df)
display(structures.head(20))

{'H': 0.43, 'C': 0.8200000000000001, 'N': 0.8, 'O': 0.78, 'F': 0.76}


HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))


Calculating bonds


HBox(children=(IntProgress(value=0, max=27), HTML(value='')))


Counting and condensing bonds


HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,n_bonds,bond_lengths_mean
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.8,3,1.017195
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.43,1,1.01719
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.43,1,1.017187
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.43,1,1.017208
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.78,2,0.962107


## Merge structures function

In [5]:
def merge_train_structures(train, structures, dip_mom, pot_energ):
  
    """This function is used to merge the structures dataset to the 
     original train dataset

     Parameters:
     ===========
       train: train dataframe
       structures: structures dataframe.

     Output:
       ===========
       dataframe: merged dataframe
     """

    structures = structures.rename({'atom_index': 'atom_index_0',
                                    'x':'x_0', 'y':'y_0', 'z':'z_0',
                                    'atom':'atom_0'}, axis=1)

    merged = pd.merge(train, structures, on=['molecule_name', 'atom_index_0'])

    structures = structures.rename({'atom_index_0': 'atom_index_1',
                                  'x_0':'x_1', 'y_0':'y_1', 'z_0':'z_1',
                                  'atom_0':'atom_1'}, axis=1)

    merged_1 = pd.merge(merged, structures, on=['molecule_name', 'atom_index_1'])

    structures = structures.rename({'atom_index_1': 'atom_index',
                                  'x_1':'x', 'y_1':'y', 'z_1':'z',
                                  'atom_1':'atom'}, axis=1)
    
    dip_mom = dip_mom.rename({'X':'X_dipm', 'Y':'Y_dipm', 'Z':'Z_dipm'}, axis=1)

    merged_2 = pd.merge(merged_1, dip_mom, on=['molecule_name',])
    
    merged_3 = pd.merge(merged_2, pot_energ, on=['molecule_name',])

    assert train.shape[0] == merged.shape[0]

    return merged_3

In [6]:
merged = merge_train_structures(trainval, structures, dipole, potential_energy)

In [7]:
assert len(merged) == len(trainval)

In [8]:
merged.head(5)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,X_dipm,Y_dipm,Z_dipm,potential_energy
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,0.0,0.0,0.0,-40.52368


In [None]:
# df['Total charge'] = df['Total day charge'] + df['Total eve charge'] + \
#                      df['Total night charge'] + df['Total intl charge']

In [9]:
merged['enrg_per_bnd'] = merged['potential_energy'] / (merged['n_bonds_x'] + merged['n_bonds_y'])

In [10]:
merged.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,X_dipm,Y_dipm,Z_dipm,potential_energy,enrg_per_bnd
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368,-8.104736
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368,-8.104736
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368,-8.104736
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368,-8.104736
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,0.0,0.0,0.0,-40.52368,-20.26184
5,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946,0.0,0.0,0.0,-40.52368,-20.26184
6,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946,0.0,0.0,0.0,-40.52368,-20.26184
7,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948,0.0,0.0,0.0,-40.52368,-20.26184
8,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948,0.0,0.0,0.0,-40.52368,-20.26184
9,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948,0.0,0.0,0.0,-40.52368,-20.26184


In [11]:
mulliken = mulliken.rename({'atom_index': 'atom_index_0',
                          'mulliken_charge': 'mulliken_charge_0'}, axis=1)
merged = merged.merge(mulliken, on=['molecule_name', 'atom_index_0'])

mulliken = mulliken.rename({'atom_index_0': 'atom_index_1',
                          'mulliken_charge_0': 'mulliken_charge_1'}, axis=1)
merged = merged.merge(mulliken, on=['molecule_name', 'atom_index_1'])
assert len(merged) == len(trainval)

## Feature Engineering

In [12]:
%%time
# This block is SPPED UP

train_p_0 = merged[['x_0', 'y_0', 'z_0']].values
train_p_1 = merged[['x_1', 'y_1', 'z_1']].values
# test_p_0 = test[['x_0', 'y_0', 'z_0']].values
# test_p_1 = test[['x_1', 'y_1', 'z_1']].values

merged['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
# test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
assert len(merged) == len(trainval)

CPU times: user 1.79 s, sys: 1.55 s, total: 3.34 s
Wall time: 1.63 s


In [13]:
merged.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,X_dipm,Y_dipm,Z_dipm,potential_energy,enrg_per_bnd,mulliken_charge_0,mulliken_charge_1,dist
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368,-8.104736,0.133921,-0.535689,1.091953
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368,-8.104736,0.133922,-0.535689,1.091952
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,H,-0.540815,1.447527,-0.876644,2.2,0.43,1,1.091946,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368,-8.104736,0.133923,-0.535689,1.091946
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,H,-0.523814,1.437933,0.906397,2.2,0.43,1,1.091948,C,-0.012698,1.085804,0.008001,2.55,0.82,4,1.09195,0.0,0.0,0.0,-40.52368,-8.104736,0.133923,-0.535689,1.091948
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,2.2,0.43,1,1.091953,H,1.011731,1.463751,0.000277,2.2,0.43,1,1.091952,0.0,0.0,0.0,-40.52368,-20.26184,0.133921,0.133922,1.78312


In [14]:
%%time

external_data_0 = pd.DataFrame(data={
    'atom_0': ['C', 'H', 'N', 'O', 'F'],
    'atom_0_en': [2.55, 2.20, 3.04, 3.44, 3.98],
    'atomic_mass_0': [12.0107, 1.00784, 14.0067, 15.999, 18.9984],
    'valence_electrons_0': [4, 1, 5, 6, 7]
})
external_data_1 = pd.DataFrame(data={
    'atom_1': ['C', 'H', 'N', 'O', 'F'],
    'atom_1_en': [2.55, 2.20, 3.04, 3.44, 3.98],
    'atomic_mass_1': [12.0107, 1.00784, 14.0067, 15.999, 18.9984],
    'valence_electrons_1': [4, 1, 5, 6, 7]
})

def create_features(df):
    df['bond'] = df['type'].str[2:]
    df['j_type'] = df['type'].str[:2]
    df['distance'] = ((df['x_1'].values - df['x_0'].values)**2 + 
                    (df['y_1'].values - df['y_0'].values)**2 + 
                    (df['z_1'].values - df['z_0'].values)**2)** 0.5
    df['mu_0'] = np.sqrt(df['x_0'].values**2 + df['y_0'].values**2 + df['z_0'].values**2)
    df['mu_1'] = np.sqrt(df['x_1'].values**2 + df['y_1'].values**2 + df['z_1'].values**2)
    df = df.merge(external_data_0, on='atom_0', how='left')
    df = df.merge(external_data_1, on='atom_1', how='left')
    df['delta_en'] = (df['atom_0_en'] - df['atom_1_en']).abs()
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    df['molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df['molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df['molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df['molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df['molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df['molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df['molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df['molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df['molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df['molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df['molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df['molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df['molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df['molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df['molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df['molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df['molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df['molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df['molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df['molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df['molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df['molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df['molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df['molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df['molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df['molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df['molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df['molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df['molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df['molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df['molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df['molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df['molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df['molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df['molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df['molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df['molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df['molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
#     df['molecule_type_0_dist_std'] = df.groupby(['molecule_name', 'type_0'])['dist'].transform('std')
#     df['molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
#     df['molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
#     df['molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
#     df['molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
#     df['molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
#     df['molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
#     df['molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
#     df['molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']
    df = reduce_mem_usage(df)
    return df

CPU times: user 1.76 ms, sys: 0 ns, total: 1.76 ms
Wall time: 1.53 ms


In [16]:
def map_atom_info(df_1,df_2, atom_idx):
    df = pd.merge(df_1, df_2, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)

    return df

def create_closest(df_train):
    #I apologize for my poor coding skill. Please make the better one.
    df_temp=df_train.loc[:,["molecule_name","atom_index_0","atom_index_1","dist","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
#     df_temp=pd.concat(objs=[df_temp,df_temp_],axis=0)

    df_temp["min_distance"]=df_temp.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df_temp= df_temp[df_temp["min_distance"]==df_temp["dist"]]

    df_temp=df_temp.drop(['x_0','y_0','z_0','min_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                     'atom_index_1': 'atom_index_closest',
                                     'distance': 'distance_closest',
                                     'x_1': 'x_closest',
                                     'y_1': 'y_closest',
                                     'z_1': 'z_closest'})

    for atom_idx in [0,1]:
        df_train = map_atom_info(df_train,df_temp, atom_idx)
        df_train = df_train.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                            'distance_closest': f'distance_closest_{atom_idx}',
                                            'x_closest': f'x_closest_{atom_idx}',
                                            'y_closest': f'y_closest_{atom_idx}',
                                            'z_closest': f'z_closest_{atom_idx}'})
    df_train = reduce_mem_usage(df_train)
    return df_train


In [17]:
%%time
merged = create_features(merged)
assert len(merged) == len(trainval)

Mem. usage decreased to 959.55 Mb (69.0% reduction)
CPU times: user 1min 27s, sys: 1min 49s, total: 3min 16s
Wall time: 1min 5s


In [18]:
%%time
dtrain = create_closest(merged)

Mem. usage decreased to 1043.95 Mb (2.5% reduction)
CPU times: user 1min 17s, sys: 1min 14s, total: 2min 31s
Wall time: 51.4 s


In [19]:
assert len(dtrain) == len(trainval)

In [20]:
def add_cos_features(df):
    df["distance_0"]=((df['x_0']-df['x_closest_0'])**2+(df['y_0']-df['y_closest_0'])**2+(df['z_0']-df['z_closest_0'])**2)**(1/2)
    df["distance_1"]=((df['x_1']-df['x_closest_1'])**2+(df['y_1']-df['y_closest_1'])**2+(df['z_1']-df['z_closest_1'])**2)**(1/2)
    df["vec_0_x"]=(df['x_0']-df['x_closest_0'])/df["distance_0"]
    df["vec_0_y"]=(df['y_0']-df['y_closest_0'])/df["distance_0"]
    df["vec_0_z"]=(df['z_0']-df['z_closest_0'])/df["distance_0"]
    df["vec_1_x"]=(df['x_1']-df['x_closest_1'])/df["distance_1"]
    df["vec_1_y"]=(df['y_1']-df['y_closest_1'])/df["distance_1"]
    df["vec_1_z"]=(df['z_1']-df['z_closest_1'])/df["distance_1"]
    df["vec_x"]=(df['x_1']-df['x_0'])/df["dist"]
    df["vec_y"]=(df['y_1']-df['y_0'])/df["dist"]
    df["vec_z"]=(df['z_1']-df['z_0'])/df["dist"]
    df["cos_0_1"]=df["vec_0_x"]*df["vec_1_x"]+df["vec_0_y"]*df["vec_1_y"]+df["vec_0_z"]*df["vec_1_z"]
    df["cos_0"]=df["vec_0_x"]*df["vec_x"]+df["vec_0_y"]*df["vec_y"]+df["vec_0_z"]*df["vec_z"]
    df["cos_1"]=df["vec_1_x"]*df["vec_x"]+df["vec_1_y"]*df["vec_y"]+df["vec_1_z"]*df["vec_z"]
    df=df.drop(['vec_0_x','vec_0_y','vec_0_z','vec_1_x','vec_1_y','vec_1_z','vec_x','vec_y','vec_z'], axis=1)
    df = reduce_mem_usage(df)
    return df
    
merged = add_cos_features(dtrain)

Mem. usage decreased to 1088.38 Mb (0.0% reduction)


In [21]:
assert len(merged) == len(trainval)
merged.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,X_dipm,Y_dipm,Z_dipm,potential_energy,enrg_per_bnd,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atom_0_en,atomic_mass_0,valence_electrons_0,atom_1_en,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,atom_index_closest_0,dist_y,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,dist,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,0.0,0.0,0.0,-40.53125,-8.101562,0.133911,-0.535645,1.091797,HC,1J,1.091797,0.006702,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,4,4,0.728027,1.358398,0.272949,1.25138,1.463867,0.37793,0.182251,0.728027,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.345703,-0.746582,0.316406,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,0.0,0.0,0.0,-40.53125,-8.101562,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.779297,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,3,4,0.300049,1.324219,0.237915,1.219147,1.447266,0.361816,0.206177,0.891602,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.39917,-0.692871,0.365479,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,0.0,0.0,0.0,-40.53125,-8.101562,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.776367,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,2,4,0.361328,1.261719,0.176025,1.162151,1.4375,0.352051,0.249023,0.635254,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.48877,-0.603027,0.44751,1.091797,3e-06,1.0,1.091797,7e-06,1.0,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,0.0,1.0,3e-06,-1.091797,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,0.0,0.0,0.0,-40.53125,-8.101562,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.77832,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,1,4,,1.085938,0.0,1.0,1.085938,0.0,,,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,0.0,1.0,,,,1.091797,2e-06,1.0,1.091797,6e-06,1.0,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,0.0,0.0,-40.53125,-20.265625,0.133911,0.133911,1.783203,HH,2J,1.783203,0.006702,1.779297,2.199219,1.007812,1,2.199219,1.007812,1,0.0,10,1.506836,1.091797,1.783203,4,1,0.728027,1.358398,-0.10498,0.928268,1.463867,0.0,0.182251,0.728027,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,0,1.091797,-0.012695,1.085938,0.008003,0.0,1.091797,-0.012695,1.085938,0.008003,1.091797,1.091797,-0.333496,-1.333008,1.333984


In [22]:
merged['dip_cop_0'] = (3*(merged['cos_0']**2)-1) / 2
merged['dip_cop_1'] = (3*(merged['cos_1']**2)-1) / 2
merged['dip_cop_0_1'] = (3*(merged['cos_0_1']**2)-1) / 2

In [23]:
merged.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,EN_x,rad_x,n_bonds_x,bond_lengths_mean_x,atom_1,x_1,y_1,z_1,EN_y,rad_y,n_bonds_y,bond_lengths_mean_y,X_dipm,Y_dipm,Z_dipm,potential_energy,enrg_per_bnd,mulliken_charge_0,mulliken_charge_1,dist_x,bond,j_type,distance,mu_0,mu_1,atom_0_en,atomic_mass_0,valence_electrons_0,atom_1_en,atomic_mass_1,valence_electrons_1,delta_en,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,atom_index_closest_0,dist_y,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,dist,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1,dip_cop_0,dip_cop_1,dip_cop_0_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,0.0,0.0,0.0,-40.53125,-8.101562,0.133911,-0.535645,1.091797,HC,1J,1.091797,0.006702,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,4,4,0.728027,1.358398,0.272949,1.25138,1.463867,0.37793,0.182251,0.728027,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.345703,-0.746582,0.316406,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,,,,
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,0.0,0.0,0.0,-40.53125,-8.101562,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.779297,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,3,4,0.300049,1.324219,0.237915,1.219147,1.447266,0.361816,0.206177,0.891602,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.39917,-0.692871,0.365479,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,,,,
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,H,-0.541016,1.447266,-0.876465,2.199219,0.429932,1,1.091797,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,0.0,0.0,0.0,-40.53125,-8.101562,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.776367,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,2,4,0.361328,1.261719,0.176025,1.162151,1.4375,0.352051,0.249023,0.635254,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,0.0,1.0,0.48877,-0.603027,0.44751,1.091797,3e-06,1.0,1.091797,7e-06,1.0,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,0.0,1.0,3e-06,-1.091797,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,,,,
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,H,-0.523926,1.4375,0.90625,2.199219,0.429932,1,1.091797,C,-0.012695,1.085938,0.008003,2.550781,0.819824,4,1.091797,0.0,0.0,0.0,-40.53125,-8.101562,0.133911,-0.535645,1.091797,HC,1J,1.091797,1.77832,1.085938,2.199219,1.007812,1,2.550781,12.007812,4,0.350098,10,1.506836,1.091797,1.783203,1,4,,1.085938,0.0,1.0,1.085938,0.0,,,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,0.0,1.0,,,,1.091797,2e-06,1.0,1.091797,6e-06,1.0,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797,0,1.091797,-0.012695,1.085938,0.008003,,,,,,1.091797,,,,,,,
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,H,0.00215,-0.006031,0.001976,2.199219,0.429932,1,1.091797,H,1.011719,1.463867,0.000277,2.199219,0.429932,1,1.091797,0.0,0.0,0.0,-40.53125,-20.265625,0.133911,0.133911,1.783203,HH,2J,1.783203,0.006702,1.779297,2.199219,1.007812,1,2.199219,1.007812,1,0.0,10,1.506836,1.091797,1.783203,4,1,0.728027,1.358398,-0.10498,0.928268,1.463867,0.0,0.182251,0.728027,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,0,1.091797,-0.012695,1.085938,0.008003,0.0,1.091797,-0.012695,1.085938,0.008003,1.091797,1.091797,-0.333496,-1.333008,1.333984,2.166016,2.167969,-0.333252


## Scoring Function

In [24]:
groups = merged['type']

def group_lmae(y_true, y_pred, groups, floor=1e-9):
  maes = (y_true - y_pred).abs().groupby(groups).mean()
  return np.log(maes.map(lambda x: max(x, floor))).mean()

## Splitting

In [25]:
trainval_1JHC = merged[merged['type'] == '1JHC']
trainval_1JHN = merged[merged['type'] == '1JHN']
trainval_2JHH = merged[merged['type'] == '2JHH']
trainval_2JHN = merged[merged['type'] == '2JHN']
trainval_2JHC = merged[merged['type'] == '2JHC']
trainval_3JHH = merged[merged['type'] == '3JHH']
trainval_3JHC = merged[merged['type'] == '3JHC']
trainval_3JHN = merged[merged['type'] == '3JHN']

In [26]:
train_1JHC, val_1JHC = train_test_split(trainval_1JHC, shuffle=False, random_state=47)

train_1JHC_molecules = train_1JHC['molecule_name'].unique()
val_1JHC_molecules = np.delete(val_1JHC['molecule_name'].unique(), 0)

train_1JHC = train_1JHC[train_1JHC['molecule_name'].isin(train_1JHC_molecules)]
val_1JHC = val_1JHC[val_1JHC['molecule_name'].isin(val_1JHC_molecules)]

In [27]:
train_1JHN, val_1JHN = train_test_split(trainval_1JHN, shuffle=False, random_state=47)

train_1JHN_molecules = train_1JHN['molecule_name'].unique()
val_1JHN_molecules = np.delete(val_1JHN['molecule_name'].unique(), 0)

train_1JHN = train_1JHN[train_1JHN['molecule_name'].isin(train_1JHN_molecules)]
val_1JHN = val_1JHN[val_1JHN['molecule_name'].isin(val_1JHN_molecules)]

In [28]:
train_2JHH, val_2JHH = train_test_split(trainval_2JHH, shuffle=False, random_state=47)

train_2JHH_molecules = train_2JHH['molecule_name'].unique()
val_2JHH_molecules = np.delete(val_2JHH['molecule_name'].unique(), 0)

train_2JHH = train_2JHH[train_2JHH['molecule_name'].isin(train_2JHH_molecules)]
val_2JHH = val_2JHH[val_2JHH['molecule_name'].isin(val_2JHH_molecules)]

In [29]:
train_2JHN, val_2JHN = train_test_split(trainval_2JHN, shuffle=False, random_state=47)

train_2JHN_molecules = train_2JHN['molecule_name'].unique()
val_2JHN_molecules = np.delete(val_2JHN['molecule_name'].unique(), 0)

train_2JHN = train_2JHN[train_2JHN['molecule_name'].isin(train_2JHN_molecules)]
val_2JHN = val_2JHN[val_2JHN['molecule_name'].isin(val_2JHN_molecules)]

In [30]:
train_2JHC, val_2JHC = train_test_split(trainval_2JHC, shuffle=False, random_state=47)

train_2JHC_molecules = train_2JHC['molecule_name'].unique()
val_2JHC_molecules = np.delete(val_2JHC['molecule_name'].unique(), 0)

train_2JHC = train_2JHC[train_2JHC['molecule_name'].isin(train_2JHC_molecules)]
val_2JHC = val_2JHC[val_2JHC['molecule_name'].isin(val_2JHC_molecules)]

In [31]:
train_3JHH, val_3JHH = train_test_split(trainval_3JHH, shuffle=False, random_state=47)

train_3JHH_molecules = train_3JHH['molecule_name'].unique()
val_3JHH_molecules = np.delete(val_3JHH['molecule_name'].unique(), 0)

train_3JHH = train_3JHH[train_3JHH['molecule_name'].isin(train_3JHH_molecules)]
val_3JHH = val_3JHH[val_3JHH['molecule_name'].isin(val_3JHH_molecules)]

In [32]:
train_3JHC, val_3JHC = train_test_split(trainval_3JHC, shuffle=False, random_state=47)

train_3JHC_molecules = train_3JHC['molecule_name'].unique()
val_3JHC_molecules = np.delete(val_3JHC['molecule_name'].unique(), 0)

train_3JHC = train_3JHC[train_3JHC['molecule_name'].isin(train_3JHC_molecules)]
val_3JHC = val_3JHC[val_3JHC['molecule_name'].isin(val_3JHC_molecules)]

In [33]:
train_3JHN, val_3JHN = train_test_split(trainval_3JHN, shuffle=False, random_state=47)

train_3JHN_molecules = train_3JHN['molecule_name'].unique()
val_3JHN_molecules = np.delete(val_3JHN['molecule_name'].unique(), 0)

train_3JHN = train_3JHN[train_3JHN['molecule_name'].isin(train_3JHN_molecules)]
val_3JHN = val_3JHN[val_3JHN['molecule_name'].isin(val_3JHN_molecules)]

## X_train, y_train, X_val, y_val