In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
stru_file_path = r'\\icnas4.cc.ic.ac.uk\fl4718\Desktop\Machine learning\Data\structures.csv'

structures = pd.read_csv(stru_file_path, dtype={'atom_index': np.int8})

In [3]:
atomic_radius = {'H':0.38, 'C':0.77, 'N':0.75, 'O':0.73, 'F':0.71} # Without fudge factor

fudge_factor = 0.05
atomic_radius = {k:v + fudge_factor for k,v in atomic_radius.items()}
print(atomic_radius)

electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}

atoms = structures['atom'].values
atoms_en = [electronegativity[x] for x in tqdm(atoms)]
atoms_rad = [atomic_radius[x] for x in tqdm(atoms)]

structures['EN'] = atoms_en
structures['rad'] = atoms_rad

display(structures.head())

{'H': 0.43, 'C': 0.8200000000000001, 'N': 0.8, 'O': 0.78, 'F': 0.76}


100%|██████████| 2358657/2358657 [00:00<00:00, 3032853.19it/s]
100%|██████████| 2358657/2358657 [00:00<00:00, 3053161.62it/s]


Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43


In [4]:
i_atom = structures['atom_index'].values
p = structures[['x', 'y', 'z']].values
p_compare = p
m = structures['molecule_name'].values
m_compare = m
r = structures['rad'].values
r_compare = r

source_row = np.arange(len(structures))
max_atoms = 28

bonds = np.zeros((len(structures)+1, max_atoms+1), dtype=np.int8)
bond_dists = np.zeros((len(structures)+1, max_atoms+1), dtype=np.float32)

print('Calculating bonds')

for i in tqdm(range(max_atoms-1)):
    p_compare = np.roll(p_compare, -1, axis=0)
    m_compare = np.roll(m_compare, -1, axis=0)
    r_compare = np.roll(r_compare, -1, axis=0)
    
    mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
    dists = np.linalg.norm(p - p_compare, axis=1) * mask
    r_bond = r + r_compare
    
    bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)
    
    source_row = source_row
    target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row) #If invalid target, write to dummy row
    
    source_atom = i_atom
    target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col
    
    bonds[(source_row, target_atom)] = bond
    bonds[(target_row, source_atom)] = bond
    bond_dists[(source_row, target_atom)] = dists
    bond_dists[(target_row, source_atom)] = dists

bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col

print('Counting and condensing bonds')

bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
n_bonds = [len(x) for x in bonds_numeric]

#bond_data = {'bond_' + str(i):col for i, col in enumerate(np.transpose(bonds))}
#bond_data.update({'bonds_numeric':bonds_numeric, 'n_bonds':n_bonds})

bond_data = {'bonds':bonds_numeric, 'n_bonds':n_bonds, 'bond_lengths':bond_lengths}
bond_df = pd.DataFrame(bond_data)
structures = structures.join(bond_df)
display(structures.head(20))

Calculating bonds


100%|██████████| 27/27 [00:12<00:00,  2.41it/s]


Counting and condensing bonds


100%|██████████| 2358657/2358657 [00:08<00:00, 266520.22it/s]
100%|██████████| 2358657/2358657 [00:13<00:00, 178940.50it/s]


Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,bonds,n_bonds,bond_lengths
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82,"[1, 2, 3, 4]",4,"[1.091953, 1.0919516, 1.0919464, 1.0919476]"
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43,[0],1,[1.091953]
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43,[0],1,[1.0919516]
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,[0],1,[1.0919464]
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43,[0],1,[1.0919476]
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.8,"[1, 2, 3]",3,"[1.01719, 1.0171872, 1.0172079]"
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.43,[0],1,[1.01719]
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.43,[0],1,[1.0171872]
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.43,[0],1,[1.0172079]
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.78,"[1, 2]",2,"[0.9621068, 0.9621068]"


In [None]:
hybri_dict = {'C': {'4': 3, '3': 2, '2': 1, '1': 0},
              'N': {'4': 0, '3': 3, '2': 2, '1': 1},
              'O': {'2': 2, '1': 1},
              'H': {'1': 0},
              'F': {'1': 0}}
              # 3 bonds- sp3, 2 - sp2, 1 - sp
    
hybri = np.zeros(len(structures))

'''
The situation that two atoms connecting to a C with one triple and one single bond is checked and there are 
no such situations in the data set, which is more straight for us to count the number of pi bonds for each molecule.
The number of pi bonds in each molecule will be directly related with the type of hybridization.
'''

for i in tqdm(range(len(structures))):
    hybri[i] = hybri_dict[structures.loc[i, 'atom']][str(structures.loc[i, 'n_bonds'])]
    if np.logical_and(structures.loc[i, 'atom'] == 'N', structures.loc[i, 'n_bonds'] == 1):
        hybri[i] = 1
        structures_sub = structures[structures['molecule_name'] == structures.loc[i, 'molecule_name']]['atom_index']
        index = structures_sub[structures_sub == structures.loc[i, 'bonds'][0]].index[0]
        hybri[index] = 1

  2%|▏         | 43738/2358657 [01:18<2:20:34, 274.46it/s]

In [19]:
structures['hybri'] = hybri

In [20]:
structures.head(20)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,bonds,n_bonds,bond_lengths,hybri
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82,"[1, 2, 3, 4]",4,"[1.091953, 1.0919516, 1.0919464, 1.0919476]",1
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43,[0],1,[1.091953],1
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43,[0],1,[1.0919516],1
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,[0],1,[1.0919464],1
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43,[0],1,[1.0919476],1
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.8,"[1, 2, 3]",3,"[1.01719, 1.0171872, 1.0172079]",1
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.43,[0],1,[1.01719],1
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.43,[0],1,[1.0171872],1
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.43,[0],1,[1.0172079],1
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.78,"[1, 2]",2,"[0.9621068, 0.9621068]",1


In [60]:
pi_bond = {'C': {'4': 0, '3': 1, '2': 2},
           'N': {'4': 0, '3': 0, '2': 1, '1': 2},
           'O': {'1': 1, '2': 0},
           'H': {'1': 0},
           'F': {'1': 0}}

pi_bond_ = []

for i in tqdm(range(len(structures))):
    pi_bond_.append(pi_bond[structures.loc[i, 'atom']][str(len(structures.loc[i, 'bonds']))])

structures['pi_bonds'] = pi_bond_



  0%|          | 0/2358657 [00:00<?, ?it/s]

  0%|          | 2611/2358657 [00:00<01:38, 23871.69it/s]

  0%|          | 5575/2358657 [00:00<01:35, 24756.42it/s]

  0%|          | 8261/2358657 [00:00<01:35, 24696.49it/s]

  0%|          | 11352/2358657 [00:00<01:31, 25667.70it/s]

  1%|          | 14430/2358657 [00:00<01:28, 26363.01it/s]

  1%|          | 17463/2358657 [00:00<01:27, 26758.84it/s]

  1%|          | 20539/2358657 [00:00<01:26, 27152.89it/s]

  1%|          | 23579/2358657 [00:00<01:25, 27342.55it/s]

  1%|          | 26513/2358657 [00:00<01:25, 27186.22it/s]

  1%|▏         | 29563/2358657 [00:01<01:25, 27392.44it/s]

  1%|▏         | 32272/2358657 [00:01<01:27, 26547.54it/s]

  1%|▏         | 35312/2358657 [00:01<01:26, 26909.23it/s]

  2%|▏         | 37977/2358657 [00:01<01:28, 26092.33it/s]

  2%|▏         | 40825/2358657 [00:01<01:29, 25984.65it/s]

  2%|▏         | 43904/2358657 [00:01<01:27, 26599.89it/s]

  2%|▏         | 46953/2358657 [00:01<01:25, 26970.50it/

 32%|███▏      | 753327/2358657 [00:28<01:02, 25504.95it/s]

 32%|███▏      | 755956/2358657 [00:28<01:02, 25729.90it/s]

 32%|███▏      | 758866/2358657 [00:28<01:01, 26104.68it/s]

 32%|███▏      | 761481/2358657 [00:28<01:01, 25800.29it/s]

 32%|███▏      | 764082/2358657 [00:29<01:02, 25583.95it/s]

 33%|███▎      | 766968/2358657 [00:29<01:01, 25819.47it/s]

 33%|███▎      | 769770/2358657 [00:29<01:01, 25758.81it/s]

 33%|███▎      | 772722/2358657 [00:29<01:00, 26116.15it/s]

 33%|███▎      | 775337/2358657 [00:29<01:00, 25977.50it/s]

 33%|███▎      | 777937/2358657 [00:29<01:01, 25670.81it/s]

 33%|███▎      | 780507/2358657 [00:29<01:01, 25597.45it/s]

 33%|███▎      | 783069/2358657 [00:29<01:01, 25446.22it/s]

 33%|███▎      | 785645/2358657 [00:29<01:01, 25533.44it/s]

 33%|███▎      | 788200/2358657 [00:30<01:02, 25231.09it/s]

 34%|███▎      | 790725/2358657 [00:30<01:05, 23938.93it/s]

 34%|███▎      | 793134/2358657 [00:30<01:06, 23487.79it/s]

 34%|███▎      | 795686/

 57%|█████▋    | 1353903/2358657 [00:57<00:40, 24744.01it/s]

 58%|█████▊    | 1356704/2358657 [00:57<00:40, 24996.30it/s]

 58%|█████▊    | 1359761/2358657 [00:57<00:38, 25814.67it/s]

 58%|█████▊    | 1362438/2358657 [00:57<00:39, 25397.72it/s]

 58%|█████▊    | 1365336/2358657 [00:57<00:38, 25717.51it/s]

 58%|█████▊    | 1368208/2358657 [00:58<00:38, 25878.77it/s]

 58%|█████▊    | 1370973/2358657 [00:58<00:38, 25696.20it/s]

 58%|█████▊    | 1373713/2358657 [00:58<00:38, 25499.33it/s]

 58%|█████▊    | 1376269/2358657 [00:58<00:39, 24820.62it/s]

 58%|█████▊    | 1379079/2358657 [00:58<00:39, 25075.63it/s]

 59%|█████▊    | 1381593/2358657 [00:58<00:40, 24408.35it/s]

 59%|█████▊    | 1384042/2358657 [00:58<00:42, 22733.02it/s]

 59%|█████▉    | 1386345/2358657 [00:58<00:43, 22202.55it/s]

 59%|█████▉    | 1389109/2358657 [00:58<00:42, 23041.89it/s]

 59%|█████▉    | 1391931/2358657 [00:59<00:40, 23805.59it/s]

 59%|█████▉    | 1394425/2358657 [00:59<00:41, 23495.59it/s]

 59%|███

 82%|████████▏ | 1931071/2358657 [01:26<00:15, 27319.45it/s]

 82%|████████▏ | 1934159/2358657 [01:26<00:15, 27428.29it/s]

 82%|████████▏ | 1937361/2358657 [01:26<00:15, 27958.77it/s]

 82%|████████▏ | 1940171/2358657 [01:26<00:14, 27986.34it/s]

 82%|████████▏ | 1942973/2358657 [01:26<00:14, 27828.23it/s]

 82%|████████▏ | 1945851/2358657 [01:26<00:14, 27748.98it/s]

 83%|████████▎ | 1948680/2358657 [01:27<00:14, 27908.62it/s]

 83%|████████▎ | 1951473/2358657 [01:27<00:15, 26468.71it/s]

 83%|████████▎ | 1954137/2358657 [01:27<00:15, 26514.38it/s]

 83%|████████▎ | 1956846/2358657 [01:27<00:15, 26646.64it/s]

 83%|████████▎ | 1959519/2358657 [01:27<00:15, 25983.46it/s]

 83%|████████▎ | 1962421/2358657 [01:27<00:15, 26349.81it/s]

 83%|████████▎ | 1965706/2358657 [01:27<00:14, 27052.43it/s]

 83%|████████▎ | 1968762/2358657 [01:27<00:14, 27312.80it/s]

 84%|████████▎ | 1971899/2358657 [01:27<00:13, 27709.52it/s]

 84%|████████▎ | 1974731/2358657 [01:27<00:13, 27833.38it/s]

 84%|███

In [61]:
structures.head(20)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,EN,rad,bonds,n_bonds,bond_lengths,hybri,pi_bond,pi_bonds
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.82,"[1, 2, 3, 4]",4,"[1.091953, 1.0919516, 1.0919464, 1.0919476]",3,0,0
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.43,[0],1,[1.091953],0,0,0
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.43,[0],1,[1.0919516],0,0,0
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.43,[0],1,[1.0919464],0,0,0
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.43,[0],1,[1.0919476],0,0,0
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.8,"[1, 2, 3]",3,"[1.01719, 1.0171872, 1.0172079]",3,0,0
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.43,[0],1,[1.01719],0,0,0
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.43,[0],1,[1.0171872],0,0,0
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.43,[0],1,[1.0172079],0,0,0
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.78,"[1, 2]",2,"[0.9621068, 0.9621068]",2,1,0


In [63]:
a = pd.DataFrame({'A': [1,2,3]})
a.to_csv(r'\\icnas4.cc.ic.ac.uk\fl4718\Desktop\Machine learning\Data\aaaaa.csv', index=False)

numpy.int8

In [10]:
atom = structures['n_bonds'].values