In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['champs-scalar-coupling', 'predmolprop-featureengineering-slow']


In [2]:
# Load the inputs: the raw atomic structures plus the already feature-engineered
# train/test tables produced by the "predmolprop-featureengineering-slow" dataset.
# The raw train/test CSVs are not read here (lines left commented out).
#train = pd.read_csv("../input/champs-scalar-coupling/train.csv")
#test = pd.read_csv("../input/champs-scalar-coupling/test.csv")
structures = pd.read_csv("../input/champs-scalar-coupling/structures.csv")
train_extend = pd.read_csv("../input/predmolprop-featureengineering-slow/train_extend.csv")
test_extend = pd.read_csv("../input/predmolprop-featureengineering-slow/test_extend.csv")

In [3]:
train_extend.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'num_bonds', 'atom_end_type',
       'num_mol_bonds', 'min_d', 'mean_d', 'max_d', 'space_dr', 'bond_dr',
       'bond_1', 'bond_2', 'bond_3', 'atom_0_pc', 'atom_end_pc', 'atom_0_fc',
       'atom_end_fc', 'atom_0_val', 'atom_end_val', 'atom_0_sm', 'atom_end_sm',
       'atom_0_type2', 'atom_2_type', 'atom_3_type', 'atom_end_type2',
       'atom_2_hyb', 'atom_3_hyb', 'atom_end_hyb', 'path_count', 'atom_0_min',
       'atom_0_mean', 'atom_0_max', 'atom_0_Cmin', 'atom_0_Cmean',
       'atom_0_Cmax', 'atom_0_Omin', 'atom_0_Omean', 'atom_0_Omax',
       'atom_0_Nmin', 'atom_0_Nmean', 'atom_0_Nmax', 'atom_0_Fmin',
       'atom_0_Fmean', 'atom_0_Fmax', 'atom_end_min', 'atom_end_mean',
       'atom_end_max', 'atom_end_Cmin', 'atom_end_Cmean', 'atom_end_Cmax',
       'atom_end_Omin', 'atom_end_Omean', 'atom_end_Omax', 'atom_end_Nmin',
       'atom_end_Nmean', 'atom_end_Nmax', 'atom_end_Fmin'

In [4]:
# Some 3-bond couplings have no defined dihedral angle because they lie in linear molecules.
# Replace the missing dihedral angle with the sentinel -2 and add an indicator column (is_linear).
def FindNan(x):
    """Return 1 if ``x`` is a missing value, otherwise 0.

    Missingness is decided by ``pd.isna`` instead of comparing ``str(x)``
    with ``'nan'``: the literal string ``'nan'`` is therefore not mistaken
    for a missing value, and ``None`` is recognised as missing.

    Parameters
    ----------
    x : scalar
        A single cell value, e.g. as handed over by ``Series.map``.

    Returns
    -------
    int
        1 when ``x`` is missing (NaN / None / NaT), 0 otherwise.
    """
    return 1 if pd.isna(x) else 0

def ReplaceNan(x, fill=-2):
    """Return ``fill`` when ``x`` is a missing value, otherwise ``x`` unchanged.

    Missingness is decided by ``pd.isna`` instead of comparing ``str(x)``
    with ``'nan'``, so the literal string ``'nan'`` passes through untouched
    and ``None`` is treated as missing.

    Parameters
    ----------
    x : scalar
        A single cell value, e.g. as handed over by ``Series.map``.
    fill : scalar, default -2
        Replacement for missing values. The default keeps the sentinel this
        function has always returned, so one-argument callers are unaffected.

    Returns
    -------
    scalar
        ``fill`` if ``x`` is missing, else ``x`` itself.
    """
    return fill if pd.isna(x) else x

# Same treatment for train and test: first record which rows have a missing
# dihedral angle (is_linear), then overwrite the missing angles via ReplaceNan.
for frame in (train_extend, test_extend):
    frame['is_linear'] = frame.bond3_angle.map(FindNan)
    frame['bond3_angle'] = frame.bond3_angle.map(ReplaceNan)

# Do not keep an extra reference to the (large) test frame alive.
del frame

Count the atoms of each type

In [5]:
def CleanColsAndIdx(df):
    """Strip the axis names from ``df``'s columns and index, in place.

    The column labels and row labels themselves are untouched; only the
    name attached to each axis is reset to ``None``. The same DataFrame
    object is returned so the call can be chained.
    """
    # Columns first, then index.
    for axis_attr in ('columns', 'index'):
        labels = getattr(df, axis_attr)
        setattr(df, axis_attr, labels.set_names(None))
    return df

In [6]:
# Per-molecule atom counts.
#   groupby(...).size()  -> number of atoms for each (molecule, element) pair
#   unstack('atom')      -> one column per element instead of an index level
#   fillna(0)            -> elements a molecule does not contain come out as NaN
# The element columns are then renamed to num_<element>, the axis names left
# behind by groupby/unstack are removed, and the counts are stored as int8.
AtomTypes = (
    structures
    .groupby(['molecule_name', 'atom'], sort=False)
    .size()
    .unstack('atom')
    .fillna(0)
    .rename(columns={'C': 'num_C', 'H': 'num_H', 'N': 'num_N',
                     'O': 'num_O', 'F': 'num_F'})
    .pipe(CleanColsAndIdx)
    .astype('int8')
)

Measurements from the Center of Mass (COM), approximated here by the unweighted mean of the atomic coordinates

In [7]:
# Per-molecule geometry features measured from the molecule's centre.
# NOTE: "COM" is the plain (unweighted) mean of the atomic coordinates;
# atomic masses are not used anywhere in this cell.
gb = structures.groupby('molecule_name', sort=False)
NumAtoms = gb.size().to_frame('total_atoms')

# Average only the coordinate columns. A bare gb.mean() would also try to
# average the string column 'atom' (TypeError on pandas >= 2.0) and would
# produce a meaningless mean of 'atom_index' that then has to be removed.
axis = ['x', 'y', 'z']
COM = gb[axis].mean()
COM = COM.rename(columns={name: 'COM_' + name for name in axis})
COM = CleanColsAndIdx(COM)
structures = pd.merge(structures, COM, how='left', left_on='molecule_name', right_index=True)

# Offset of every atom from its molecule's centre, per coordinate ...
for ax in axis:
    structures['d' + ax] = structures[ax] - structures['COM_' + ax]

# ... and the Euclidean length of that offset.
structures = structures.assign(COM_dr=lambda x: np.sqrt(x.dx**2 + x.dy**2 + x.dz**2))

# Min / mean / max distance from the centre per molecule, each kept as its own
# one-column frame because they are joined into MolProps individually.
gb = structures[['molecule_name', 'COM_dr']].groupby('molecule_name', sort=False)
Dmin_COM = CleanColsAndIdx(gb.min().rename(columns={'COM_dr': 'Dmin_COM'}))
Dmean_COM = CleanColsAndIdx(gb.mean().rename(columns={'COM_dr': 'Dmean_COM'}))
Dmax_COM = CleanColsAndIdx(gb.max().rename(columns={'COM_dr': 'Dmax_COM'}))

In [8]:
# Remove the intermediate columns that were only needed to derive COM_dr.
# x, y and z are kept at this point; map_atom_info drops them after its merge.
poplist = ['COM_x', 'COM_y', 'COM_z', 'dx', 'dy', 'dz']
structures = structures.drop(columns=poplist)

structures.head(20)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,COM_dr
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,1.2e-05
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,1.091945
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,1.091946
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,1.091954
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,1.091954
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,0.301261
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,0.939879
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,0.939876
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,0.939962
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,0.396141


In [9]:
# One row of molecule-level features per molecule: the atom counts joined
# (on the molecule-name index) with the atom total and the distance summaries.
MolProps = AtomTypes.copy().join([NumAtoms, Dmin_COM, Dmean_COM, Dmax_COM])

# The component tables now live inside MolProps; drop the separate names.
del AtomTypes, NumAtoms, COM, Dmin_COM, Dmean_COM, Dmax_COM

MolProps.head(20)

Unnamed: 0,num_C,num_H,num_N,num_O,num_F,total_atoms,Dmin_COM,Dmean_COM,Dmax_COM
dsgdb9nsd_000001,1,4,0,0,0,5,1.2e-05,0.873562,1.091954
dsgdb9nsd_000002,0,3,1,0,0,4,0.301261,0.780244,0.939962
dsgdb9nsd_000003,0,2,0,1,0,3,0.396141,0.653496,0.782173
dsgdb9nsd_000004,2,2,0,0,0,4,0.599539,1.130589,1.661639
dsgdb9nsd_000005,1,1,1,0,0,3,0.028383,0.74891,1.123365
dsgdb9nsd_000007,2,6,0,0,0,8,0.764813,1.352176,1.547991
dsgdb9nsd_000008,1,4,0,1,0,6,0.35495,1.136816,1.586039
dsgdb9nsd_000009,3,4,0,0,0,7,0.506195,1.571041,2.769087
dsgdb9nsd_000010,2,3,1,0,0,6,0.488208,1.268453,2.12373
dsgdb9nsd_000011,2,4,0,1,0,7,0.650007,1.337038,1.841725


In [10]:
def map_atom_info(df, atom_idx):
    """Attach the ``COM_dr`` value of one atom of each pair to ``df``.

    The module-level ``structures`` table is left-merged onto ``df`` by
    matching (``molecule_name``, ``atom_index_{atom_idx}``) against
    (``molecule_name``, ``atom_index``). The other structure columns are
    discarded and ``COM_dr`` is renamed to ``COM_dr_{atom_idx}``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``molecule_name`` and ``atom_index_{atom_idx}``.
    atom_idx : int
        Which atom of the pair to look up (used in the column names).

    Returns
    -------
    DataFrame
        A new frame with one extra column; rows without a matching atom
        get NaN there.
    """
    pair_keys = ['molecule_name', f'atom_index_{atom_idx}']
    atom_keys = ['molecule_name', 'atom_index']
    merged = df.merge(structures, how='left', left_on=pair_keys, right_on=atom_keys)

    # Everything the merge brought in except COM_dr is dropped again.
    unused = ['atom_index', 'atom', 'x', 'y', 'z']
    return merged.drop(columns=unused).rename(columns={'COM_dr': f'COM_dr_{atom_idx}'})

# Add COM_dr_0 and COM_dr_1 (one column per atom of the coupling pair) to both tables.
# Each call rebinds the name to the merged result before the next merge starts.
train_extend = map_atom_info(train_extend, 0)
train_extend = map_atom_info(train_extend, 1)

test_extend = map_atom_info(test_extend, 0)
test_extend = map_atom_info(test_extend, 1)

In [11]:
def map_mol_info(df):
    """Append the molecule-level features in ``MolProps`` to every row of ``df``.

    Left-merges the module-level ``MolProps`` table (indexed by molecule
    name) onto ``df`` via its ``molecule_name`` column and returns the
    result as a new DataFrame. Molecules missing from ``MolProps`` get NaN
    in the added columns.
    """
    return df.merge(MolProps, how='left', left_on=['molecule_name'], right_index=True)

# Append the per-molecule features (MolProps columns) to both tables, one table at a time.
train_extend = map_mol_info(train_extend)
test_extend = map_mol_info(test_extend)

In [12]:
train_extend.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'num_bonds', 'atom_end_type',
       'num_mol_bonds', 'min_d', 'mean_d', 'max_d', 'space_dr', 'bond_dr',
       'bond_1', 'bond_2', 'bond_3', 'atom_0_pc', 'atom_end_pc', 'atom_0_fc',
       'atom_end_fc', 'atom_0_val', 'atom_end_val', 'atom_0_sm', 'atom_end_sm',
       'atom_0_type2', 'atom_2_type', 'atom_3_type', 'atom_end_type2',
       'atom_2_hyb', 'atom_3_hyb', 'atom_end_hyb', 'path_count', 'atom_0_min',
       'atom_0_mean', 'atom_0_max', 'atom_0_Cmin', 'atom_0_Cmean',
       'atom_0_Cmax', 'atom_0_Omin', 'atom_0_Omean', 'atom_0_Omax',
       'atom_0_Nmin', 'atom_0_Nmean', 'atom_0_Nmax', 'atom_0_Fmin',
       'atom_0_Fmean', 'atom_0_Fmax', 'atom_end_min', 'atom_end_mean',
       'atom_end_max', 'atom_end_Cmin', 'atom_end_Cmean', 'atom_end_Cmax',
       'atom_end_Omin', 'atom_end_Omean', 'atom_end_Omax', 'atom_end_Nmin',
       'atom_end_Nmean', 'atom_end_Nmax', 'atom_end_Fmin'

In [13]:
len(train_extend.columns)

77

In [14]:
test_extend.atom_2_type.unique()

array(['C1', nan, 'C3', 'Nam', 'O3', 'C2', 'N3', 'Car', 'O2', 'Nar',
       'Npl', 'N2', 'Ng+', 'N3+'], dtype=object)

In [15]:
train_extend.atom_3_type.unique()

array([nan, 'C3', 'O3', 'C1', 'C2', 'N2', 'Car', 'Nar', 'O2', 'N3', 'Nam',
       'Npl', 'C+', 'N3+', 'Nox', 'Ng+'], dtype=object)

In [16]:
# check that the categorical features are the same
print(len(set(train_extend.atom_2_type.unique())-set(test_extend.atom_2_type.unique())))
print(len(set(test_extend.atom_3_type.unique())-set(train_extend.atom_3_type.unique())))

0
0


In [17]:
train_extend[train_extend.columns[6:26]].describe()

Unnamed: 0,num_bonds,num_mol_bonds,min_d,mean_d,max_d,space_dr,bond_dr,bond_1,bond_2,bond_3,atom_0_pc,atom_end_pc,atom_0_fc,atom_end_fc,atom_0_val,atom_end_val,atom_0_sm,atom_end_sm
count,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0
mean,2.325156,19.75888,1.029168,3.255689,6.589293,2.360922,2.987136,1.0,0.7078817,0.006478327,0.05058974,0.01225711,0.0,0.0,1.0,3.121726,0.0,0.01211061
std,0.7366406,2.875716,0.05350769,0.2638297,0.9869911,0.7367096,1.083377,0.0,0.7746804,1.053283,0.04177597,0.1057742,0.0,0.0,0.0,1.202994,0.0,0.1551868
min,1.0,2.0,0.9586066,1.145857,1.513358,1.002241,1.002241,1.0,-1.0,-1.0,-0.09007463,-0.4188789,0.0,0.0,1.0,1.0,0.0,0.0
25%,2.0,18.0,0.9646925,3.080365,5.868474,1.948526,2.36386,1.0,1.0,-1.0,0.02721544,-0.03294718,0.0,0.0,1.0,2.0,0.0,0.0
50%,2.0,20.0,1.018438,3.222618,6.450707,2.313485,2.65714,1.0,1.0,-1.0,0.03420534,0.02626475,0.0,0.0,1.0,4.0,0.0,0.0
75%,3.0,22.0,1.084557,3.390371,7.113688,2.946476,4.041835,1.0,1.0,1.0,0.0575324,0.06224376,0.0,0.0,1.0,4.0,0.0,0.0
max,3.0,28.0,1.111452,5.345653,12.04039,3.924354,4.448899,1.0,3.0,3.0,0.3640839,0.5500642,0.0,0.0,1.0,4.0,0.0,3.0


In [18]:
train_extend[train_extend.columns[26:46]].describe()

Unnamed: 0,atom_2_hyb,atom_3_hyb,atom_end_hyb,path_count,atom_0_min,atom_0_mean,atom_0_max,atom_0_Cmin,atom_0_Cmean,atom_0_Cmax,atom_0_Omin,atom_0_Omean,atom_0_Omax,atom_0_Nmin,atom_0_Nmean,atom_0_Nmax,atom_0_Fmin
count,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0
mean,2.271297,0.8566037,2.174139,1.03438,1.086151,3.448149,5.583775,1.156816,3.000913,4.603247,2.495728,2.802087,3.110704,1.207891,1.398023,1.582726,-0.9686235
std,1.46455,1.9295,1.201639,0.1895493,0.02592215,0.4241759,0.9728497,0.2367558,0.4678379,0.9935855,1.947983,2.017329,2.213152,2.218438,2.34682,2.548368,0.4059599
min,-1.0,-1.0,0.0,1.0,0.9586066,1.237732,1.513358,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,3.0,-1.0,2.0,1.0,1.08762,3.153603,4.921539,1.090556,2.674816,3.956129,2.055108,2.333356,2.457948,-1.0,-1.0,-1.0,-1.0
50%,3.0,-1.0,3.0,1.0,1.093561,3.37002,5.422875,1.094228,2.913418,4.454278,2.71827,3.26765,3.60133,1.021719,2.11415,2.132134,-1.0
75%,3.0,3.0,3.0,1.0,1.095879,3.651209,6.146281,1.096858,3.232154,5.184237,3.801027,4.079756,4.540916,3.039998,3.403922,3.815579,-1.0
max,3.0,3.0,3.0,3.0,1.247942,7.585518,12.04039,3.496963,6.523529,11.15916,10.92534,10.92534,10.92534,10.75381,10.75381,10.75381,7.805846


In [19]:
train_extend[train_extend.columns[46:]].describe()

Unnamed: 0,atom_0_Fmean,atom_0_Fmax,atom_end_min,atom_end_mean,atom_end_max,atom_end_Cmin,atom_end_Cmean,atom_end_Cmax,atom_end_Omin,atom_end_Omean,...,COM_dr_1,num_C,num_H,num_N,num_O,num_F,total_atoms,Dmin_COM,Dmean_COM,Dmax_COM
count,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,...,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0
mean,-0.9673718,-0.9660134,1.142591,2.954799,4.936891,1.432461,2.623027,3.946114,2.049699,2.336697,...,1.805007,6.719829,10.21538,0.8213094,1.276229,0.01056364,19.04331,0.7722112,2.290601,3.620758
std,0.4197352,0.43636,0.1209999,0.4779996,1.056066,0.2299194,0.5494215,1.086306,1.756172,1.822792,...,0.7688348,1.140853,2.673188,0.9516164,0.8500538,0.1513627,2.77399,0.2800536,0.1999257,0.5718548
min,-1.0,-1.0,0.9586066,1.017195,1.017208,-1.0,-1.0,-1.0,-1.0,-1.0,...,5.883444e-06,0.0,1.0,0.0,0.0,0.0,3.0,5.883444e-06,0.6534956,0.7821732
25%,-1.0,-1.0,1.089709,2.598649,4.201412,1.336561,2.215969,3.151457,1.401545,1.765791,...,1.254214,6.0,8.0,0.0,1.0,0.0,17.0,0.5825748,2.158632,3.2
50%,-1.0,-1.0,1.094119,2.853414,4.734151,1.501337,2.528806,3.803336,2.388518,2.628668,...,1.733884,7.0,10.0,1.0,1.0,0.0,19.0,0.7652321,2.266137,3.520592
75%,-1.0,-1.0,1.100299,3.218238,5.574344,1.52912,2.927891,4.601007,3.151907,3.484171,...,2.302506,7.0,12.0,1.0,2.0,0.0,21.0,0.9629849,2.391795,3.955266
max,7.805846,8.341639,1.568057,6.885714,12.04039,3.976531,6.523529,11.15916,10.47548,10.47548,...,6.389156,9.0,20.0,7.0,5.0,6.0,29.0,1.897077,3.941514,7.130415


In [20]:
from collections import namedtuple

# Nuclear spin properties per element.
# Source: https://en.wikipedia.org/wiki/Nuclear_magnetic_moment
#   mu:   magnetic dipole moment
#   spin: nuclear spin number
#   NMR:  NMR sensitivity relative to H
SpinProp = namedtuple('SpinProp', ['mu', 'spin', 'NMR'])

AtomicSpinProp = {
    'H': SpinProp(mu=2.79284734, spin=0.5, NMR=1),
    'C': SpinProp(mu=0.7024118, spin=0.5, NMR=0.016),
    'O': SpinProp(mu=-1.89379, spin=2.5, NMR=0.037),
    'N': SpinProp(mu=0.40376100, spin=1, NMR=0.001),
    'F': SpinProp(mu=2.628868, spin=0.5, NMR=0.83),
}


def AddSpinProp(df):
    """Return a copy of ``df`` with ``mu``, ``spin`` and ``NMR`` columns added.

    Each value is looked up in ``AtomicSpinProp`` using the row's
    ``atom_end_type``. An element that is not a key of ``AtomicSpinProp``
    raises ``KeyError``. The input frame is not modified.
    """
    end_elements = df.atom_end_type
    # One new column per SpinProp field, in field order (mu, spin, NMR).
    new_cols = {
        field: end_elements.map(
            lambda element, field=field: getattr(AtomicSpinProp[element], field))
        for field in SpinProp._fields
    }
    return df.assign(**new_cols)

In [21]:
#train = AddSpinProp(train)
#test=AddSpinProp(test)
#PrintDataframe(train.head())

In [22]:
# Persist the engineered features. Only the train table is written; the other
# exports below are commented out.
# NOTE(review): test_extend received the same new columns in the cells above but
# is not saved here — confirm that this is intentional.
#MolProps.to_csv('MolecularProperties.csv')
#structures.to_csv('structures_extended.csv', index=False)
train_extend.to_csv('train_extend.csv', index=False)
#test_extend.to_csv('test_extend.csv', index=False)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [23]:
# Upper bound on the number of distinct (bond_2, bond_3) combinations:
# product of the per-column distinct-value counts (NaN counted as a value).
train_extend.bond_2.nunique(dropna=False) * train_extend.bond_3.nunique(dropna=False)

16

In [24]:
# Upper bound on distinct (atom_0_type2, atom_end_type2, atom_2_type) combinations
# (NaN counted as a value in each column).
(
    train_extend.atom_0_type2.nunique(dropna=False)
    * train_extend.atom_end_type2.nunique(dropna=False)
    * train_extend.atom_2_type.nunique(dropna=False)
)

504

In [25]:
# Same upper bound with atom_3_type included as a fourth factor
# (NaN counted as a value in each column).
(
    train_extend.atom_0_type2.nunique(dropna=False)
    * train_extend.atom_end_type2.nunique(dropna=False)
    * train_extend.atom_2_type.nunique(dropna=False)
    * train_extend.atom_3_type.nunique(dropna=False)
)

8064

In [26]:
len(train_extend)

4658147