In [1]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

import warnings
warnings.filterwarnings("ignore")
import math
pd.options.display.precision = 15

In [2]:
#!conda install -y -c openbabel openbabel 
import openbabel

In [3]:
# Load the competition tables from the shared data folder.
file_folder = '../data'
train, test, structures = (
    pd.read_csv(f'{file_folder}/{table}.csv')
    for table in ('train', 'test', 'structures')
)

# Atom count per molecule = highest atom_index + 1.
x = (
    structures.groupby('molecule_name')
    .atom_index.max()
    .add(1)
    .rename('totalatoms')
    .reset_index()
)

# Attach the atom count to every coupling row (default inner merge on molecule_name).
train = train.merge(x, on='molecule_name')
test = test.merge(x, on='molecule_name')
# Debugging aid: uncomment to restrict the run to a single molecule.
#train = train[train.molecule_name=='dsgdb9nsd_000001']

In [4]:
# Parse every .xyz structure file into an Open Babel molecule.
#   mols        -- list of OBMol objects, in the order the files were read
#   mols_index  -- maps file name (e.g. 'dsgdb9nsd_000001.xyz') -> position in `mols`
obConversion = openbabel.OBConversion()
obConversion.SetInFormat("xyz")
structdir='../data/structures/'

# Only consider .xyz files so stray directory entries (hidden files, checkpoints)
# are not fed to the XYZ parser.
mols_files = [f for f in os.listdir(structdir) if f.endswith('.xyz')]
mols_index = {fname: pos for pos, fname in enumerate(mols_files)}

mols = []
for f in mols_files:
    mol = openbabel.OBMol()
    # ReadFile reports failure through its return value; without this check an
    # unreadable file would leave an empty molecule that only fails much later,
    # when atoms are looked up by id.
    if not obConversion.ReadFile(mol, os.path.join(structdir, f)):
        raise RuntimeError('Could not read structure file: ' + os.path.join(structdir, f))
    mols.append(mol)

In [5]:
# Build one feature record (dict) per coupling row of `train`.
# Each record holds the pair's own distance/type plus, for every other atom in
# the molecule ("tertiary" atoms), its type, its distance from the first atom
# and the angle returned by a.GetAngle(b, tertiary) converted to radians.
stats_train = []

for m, groupdf in tqdm(train.groupby('molecule_name')):
    mol = mols[mols_index[m + '.xyz']]
    # itertuples reads each row once instead of rebuilding a row Series with
    # groupdf.loc[i] for every field.
    for row in groupdf.itertuples(index=False):
        totalatoms = row.totalatoms
        firstatomid = int(row.atom_index_0)
        secondatomid = int(row.atom_index_1)
        a = mol.GetAtomById(firstatomid)
        b = mol.GetAtomById(secondatomid)

        # Key insertion order is kept as before: totalatoms, type, molecule_name, ...
        entrystats = {}
        entrystats['totalatoms'] = totalatoms
        #entrystats['scalar_coupling_constant'] = float(row.scalar_coupling_constant)
        entrystats['type'] = row.type
        entrystats['molecule_name'] = m
        entrystats['atom_index_0'] = firstatomid
        entrystats['atom_index_1'] = secondatomid
        entrystats['bond_distance'] = a.GetDistance(b)
        entrystats['bond_atom'] = b.GetType()

        # Collect (distance, type, angle) for every atom that is not part of the pair.
        tertiarystats = []
        for c in range(totalatoms):
            if c == firstatomid or c == secondatomid:
                continue
            tertiaryatom = mol.GetAtomById(c)
            tertiarystats.append((
                a.GetDistance(tertiaryatom),
                tertiaryatom.GetType(),
                a.GetAngle(b, tertiaryatom) * math.pi / 180,
            ))

        # Put the tertiary data in order of distance from the first atom.
        # A stable sort on the distance keeps equidistant atoms in atom-id order,
        # so no artificial "dist += 1e-15" tie-breaker is needed. That tie-breaker
        # altered the stored distances and could loop forever once the distance
        # was large enough that adding 1e-15 no longer changed the float.
        tertiarystats.sort(key=lambda entry: entry[0])

        for k, (dist, tp, ang) in enumerate(tertiarystats):
            entrystats['tertiary_atom_' + str(k)] = tp
            entrystats['tertiary_distance_' + str(k)] = dist
            entrystats['tertiary_angle_' + str(k)] = ang
        stats_train.append(entrystats)

 83%|████████▎ | 70793/85003 [1:03:02<15:46, 15.01it/s]

KeyboardInterrupt: 

In [None]:
# Assemble the training feature table from the training records (one dict per
# coupling row). This must read stats_train: building it from stats_test would
# store test features under the train name.
obtrain = pd.DataFrame(stats_train)

In [None]:
# Preview the first five training feature rows.
obtrain.head(n=5)

In [None]:
# Persist the training features in pickle form.
train_pickle_path = '../data/external_data/angles_distances_train.pkl'
obtrain.to_pickle(train_pickle_path)

In [None]:
# Also write a CSV copy of the training features (index column included, as by default).
train_csv_path = '../data/external_data/angles_distances_train.csv'
obtrain.to_csv(train_csv_path)

In [5]:
# Build one feature record (dict) per coupling row of `test`.
# Each record holds the pair's own distance/type plus, for every other atom in
# the molecule ("tertiary" atoms), its type, its distance from the first atom
# and the angle returned by a.GetAngle(b, tertiary) converted to radians.
stats_test = []
for m, groupdf in tqdm(test.groupby('molecule_name')):
    mol = mols[mols_index[m + '.xyz']]
    # itertuples reads each row once instead of rebuilding a row Series with
    # groupdf.loc[i] for every field.
    for row in groupdf.itertuples(index=False):
        totalatoms = row.totalatoms
        firstatomid = int(row.atom_index_0)
        secondatomid = int(row.atom_index_1)
        a = mol.GetAtomById(firstatomid)
        b = mol.GetAtomById(secondatomid)

        # Key insertion order is kept as before: totalatoms, type, molecule_name, ...
        entrystats = {}
        entrystats['totalatoms'] = totalatoms
        #entrystats['scalar_coupling_constant'] = float(row.scalar_coupling_constant)
        entrystats['type'] = row.type
        entrystats['molecule_name'] = m
        entrystats['atom_index_0'] = firstatomid
        entrystats['atom_index_1'] = secondatomid
        entrystats['bond_distance'] = a.GetDistance(b)
        entrystats['bond_atom'] = b.GetType()

        # Collect (distance, type, angle) for every atom that is not part of the pair.
        tertiarystats = []
        for c in range(totalatoms):
            if c == firstatomid or c == secondatomid:
                continue
            tertiaryatom = mol.GetAtomById(c)
            tertiarystats.append((
                a.GetDistance(tertiaryatom),
                tertiaryatom.GetType(),
                a.GetAngle(b, tertiaryatom) * math.pi / 180,
            ))

        # Put the tertiary data in order of distance from the first atom.
        # A stable sort on the distance keeps equidistant atoms in atom-id order,
        # so no artificial "dist += 1e-15" tie-breaker is needed. That tie-breaker
        # altered the stored distances and could loop forever once the distance
        # was large enough that adding 1e-15 no longer changed the float.
        tertiarystats.sort(key=lambda entry: entry[0])

        for k, (dist, tp, ang) in enumerate(tertiarystats):
            entrystats['tertiary_atom_' + str(k)] = tp
            entrystats['tertiary_distance_' + str(k)] = dist
            entrystats['tertiary_angle_' + str(k)] = ang
        stats_test.append(entrystats)

100%|████████████████████████████████████████████████████████████████████████████| 45772/45772 [39:44<00:00, 12.28it/s]


In [6]:
# Assemble the test feature table from the per-coupling records in stats_test.
obtest = pd.DataFrame(data=stats_test)

In [7]:
# Preview the first five test feature rows.
obtest.head(n=5)

Unnamed: 0,atom_index_0,atom_index_1,bond_atom,bond_distance,molecule_name,tertiary_angle_0,tertiary_angle_1,tertiary_angle_10,tertiary_angle_11,tertiary_angle_12,...,tertiary_distance_26,tertiary_distance_3,tertiary_distance_4,tertiary_distance_5,tertiary_distance_6,tertiary_distance_7,tertiary_distance_8,tertiary_distance_9,totalatoms,type
0,2,0,C1,2.2611780779,dsgdb9nsd_000004,0.000447213599109,3.141145439990684,,,,...,,,,,,,,,4,2JHC
1,2,1,C1,1.0620990943,dsgdb9nsd_000004,3.141145439990684,3.141145439990684,,,,...,,,,,,,,,4,1JHC
2,2,3,H,3.3232771722,dsgdb9nsd_000004,0.000447213599109,0.000447213599109,,,,...,,,,,,,,,4,3JHH
3,3,0,C1,1.0620990943,dsgdb9nsd_000004,3.141145439990684,3.141145439990684,,,,...,,,,,,,,,4,1JHC
4,3,1,C1,2.2611780779,dsgdb9nsd_000004,0.000447213599109,3.141145439990684,,,,...,,,,,,,,,4,2JHC


In [8]:
# Persist the test features in pickle form.
test_pickle_path = '../data/external_data/angles_distances_test.pkl'
obtest.to_pickle(test_pickle_path)

In [9]:
# Also write a CSV copy of the test features (index column included, as by default).
test_csv_path = '../data/external_data/angles_distances_test.csv'
obtest.to_csv(test_csv_path)