In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.AtomPairs import Pairs

In [4]:
# Load the extended QM9 table, discard the serialized index column that the
# CSV carries as "Unnamed: 0", and keep only molecules whose `num_atoms`
# value is greater than 1.
qm9 = (
    pd.read_csv("qm_ext_plus.csv")
    .drop(columns=["Unnamed: 0"])
    .loc[lambda frame: frame["num_atoms"] > 1]
)

In [5]:
# Renumber the rows 0..n-1 after the filtering step (the old index is dropped).
qm9 = qm9.reset_index(drop=True)
# Parse every SMILES string into an RDKit molecule object.
# NOTE(review): no check is made here for SMILES that RDKit fails to parse.
qm9["mol"] = qm9["smiles"].map(Chem.MolFromSmiles)

In [6]:
# Inspect the table after the `mol` column has been added.
qm9

Unnamed: 0,smiles,mu,alpha,homo,lumo,gap,zpve,cv,u0,u298,h298,g298,num_atoms,mol
0,C#C,0.000000,16.280000,-0.284500,0.050600,0.335100,0.026841,8.574,-77.308427,-77.305527,-77.304583,-77.327429,2,<rdkit.Chem.rdchem.Mol object at 0x7fbf0ab6d120>
1,C#N,2.893700,12.990000,-0.360400,0.019100,0.379600,0.016601,6.278,-93.411888,-93.409370,-93.408425,-93.431246,2,<rdkit.Chem.rdchem.Mol object at 0x7fbf0ab6d240>
2,C=O,2.108900,14.180000,-0.267000,-0.040600,0.226300,0.026603,6.413,-114.483613,-114.480746,-114.479802,-114.505268,2,<rdkit.Chem.rdchem.Mol object at 0x7fbf0ab6cdc0>
3,CC,0.000000,23.950000,-0.338500,0.104100,0.442600,0.074542,10.098,-79.764152,-79.760666,-79.759722,-79.787269,2,<rdkit.Chem.rdchem.Mol object at 0x7fbf0ab6d6c0>
4,CO,1.525800,16.970000,-0.265300,0.078400,0.343700,0.051208,8.751,-115.679136,-115.675816,-115.674872,-115.701876,2,<rdkit.Chem.rdchem.Mol object at 0x7fbf0ab6cb80>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157480,C(CO)NCC(Br)(Br)Br,2.441919,100.240063,-0.380273,0.084513,0.464786,0.123667,39.094,-8003.248302,-8008.439552,-8003.111700,-8003.166923,9,<rdkit.Chem.rdchem.Mol object at 0x7fbf098124a0>
157481,C(COCC(Br)(Br)Br)O,2.372214,96.252555,-0.402736,0.078920,0.481656,0.111772,37.424,-8023.067844,-8028.305366,-8022.943597,-8022.997937,9,<rdkit.Chem.rdchem.Mol object at 0x7fbf09812500>
157482,CCCCCC(Br)(Br)Br,1.803498,111.998541,-0.396852,0.082008,0.478860,0.158363,41.049,-7951.461993,-7956.529100,-7951.290358,-7951.345676,9,<rdkit.Chem.rdchem.Mol object at 0x7fbf09812560>
157483,CCCOCC(Br)(Br)Br,1.786861,104.298872,-0.399003,0.081802,0.480805,0.134611,39.290,-7987.256407,-7992.411611,-7987.108848,-7987.164149,9,<rdkit.Chem.rdchem.Mol object at 0x7fbf098125c0>


In [7]:
# Summary statistics for the numeric columns of `qm9`.
qm9.describe()

Unnamed: 0,mu,alpha,homo,lumo,gap,zpve,cv,u0,u298,h298,g298,num_atoms
count,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0
mean,2.366532,76.54909,-0.24156,0.0093,0.250859,0.144141,31.63767,-513.489239,-513.540183,-513.477594,-513.520611,8.802241
std,1.980167,9.04045,0.029902,0.047783,0.056878,0.034169,4.041923,493.826785,494.198114,493.817005,493.817479,0.495855
min,-9.0951,12.99,-0.474639,-0.175,0.0246,0.00682,6.278,-15549.220985,-15558.550559,-15549.164761,-15549.222211,2.0
25%,1.3944,71.08,-0.254,-0.0258,0.2108,0.121945,28.987,-457.734806,-457.727369,-457.726425,-457.766684,9.0
50%,2.3329,76.44,-0.241,0.00854,0.2447,0.145376,31.583,-422.939218,-422.930687,-422.929742,-422.971671,9.0
75%,3.5435,82.0,-0.2268,0.046,0.2879,0.168641,34.319,-388.298857,-388.289547,-388.288603,-388.332809,9.0
max,29.5564,196.62,-0.1017,0.1935,0.6221,0.273944,46.969,-77.308427,-77.305527,-77.304583,-77.327429,9.0


In [8]:
# Atom-pair fingerprint generator; the two settings are the minimum and
# maximum distance between the paired atoms.
fpgen = rdFingerprintGenerator.GetAtomPairGenerator(minDistance=1, maxDistance=4)


def _atom_pair_counts(mol):
    """Return the non-zero entries of the sparse count fingerprint of `mol`.

    The result is a dict mapping an atom-pair code to its count.
    """
    sparse_fp = fpgen.GetSparseCountFingerprint(mol)
    return sparse_fp.GetNonzeroElements()


# One {atom-pair code: count} dict per molecule.
qm9["apairs"] = qm9["mol"].apply(_atom_pair_counts)

In [9]:
# Shorter handle on the per-molecule {atom-pair code: count} dicts.
pairFps = qm9["apairs"]

In [10]:
def _reciprocal_counts(counts):
    """Map every atom-pair code to the reciprocal of its count (1 / count)."""
    return {code: 1 / count for code, count in counts.items()}


# Per molecule: {atom-pair code: 1 / count}.
pairFps_rev = pairFps.apply(_reciprocal_counts)

In [11]:
# Inspect the reciprocal-count dicts (one dict per molecule).
pairFps_rev

0                                             {804385: 1.0}
1                                            {1328673: 1.0}
2                                            {1721633: 1.0}
3                                             {541729: 1.0}
4                                            {1590305: 1.0}
                                ...                        
157480    {558145: 1.0, 558146: 1.0, 558147: 1.0, 590913...
157481    {558145: 1.0, 558146: 1.0, 558147: 1.0, 590913...
157482    {558113: 1.0, 558114: 1.0, 558115: 1.0, 558116...
157483    {558113: 1.0, 558114: 1.0, 558116: 1.0, 558145...
157484    {558114: 1.0, 558115: 1.0, 558116: 1.0, 558145...
Name: apairs, Length: 157485, dtype: object

In [12]:
# Collect every atom-pair code that occurs in at least one molecule.
# Codes are added one by one, molecule by molecule, so duplicates collapse
# into a single set entry.
unique_apairs = set()
for mol_pairs in pairFps_rev:
    for pair_code in mol_pairs.keys():
        unique_apairs.add(pair_code)

In [13]:
# Number of distinct atom-pair codes found across all molecules.
len(unique_apairs)

1089

In [14]:
# Print a human-readable description of every atom-pair code.
# NOTE(review): assumes codes produced by the fingerprint generator can be
# decoded by the legacy `Pairs` module — confirm.
for pair_code in unique_apairs:
    explanation = Pairs.ExplainPairScore(pair_code)
    print(explanation)

(('C', 3, 0), 1, ('O', 1, 0))
(('C', 3, 0), 1, ('N', 1, 0))
(('N', 1, 1), 4, ('F', 1, 0))
(('C', 3, 0), 1, ('Cl', 1, 0))
(('N', 1, 1), 4, ('Cl', 1, 0))
(('C', 3, 0), 2, ('C', 1, 2))
(('C', 3, 0), 2, ('N', 1, 2))
(('C', 3, 0), 2, ('O', 1, 1))
(('C', 3, 0), 2, ('O', 1, 0))
(('C', 3, 0), 2, ('N', 1, 1))
(('C', 3, 0), 3, ('O', 1, 0))
(('C', 3, 0), 3, ('N', 1, 1))
(('C', 3, 0), 3, ('N', 1, 0))
(('C', 3, 0), 2, ('F', 1, 0))
(('C', 3, 0), 3, ('F', 1, 0))
(('C', 3, 0), 4, ('C', 1, 2))
(('C', 3, 0), 4, ('N', 1, 2))
(('C', 3, 0), 4, ('O', 1, 1))
(('C', 3, 0), 4, ('N', 1, 1))
(('C', 3, 0), 4, ('N', 1, 0))
(('C', 3, 0), 4, ('F', 1, 0))
(('C', 3, 0), 4, ('S', 1, 1))
(('C', 3, 0), 4, ('C', 1, 1))
(('C', 3, 0), 4, ('Cl', 1, 0))
(('N', 1, 0), 1, ('S', 4, 0))
(('N', 1, 0), 2, ('N', 3, 0))
(('N', 1, 0), 3, ('N', 1, 0))
(('N', 1, 0), 3, ('O', 2, 0))
(('N', 1, 0), 2, ('O', 1, 1))
(('N', 1, 0), 2, ('N', 1, 0))
(('N', 1, 0), 2, ('N', 1, 1))
(('N', 1, 0), 2, ('N', 2, 0))
(('N', 1, 0), 2, ('O', 2, 0))
(('N', 

In [15]:
# Build one column per atom-pair code: entry i holds the molecule's
# reciprocal count for that code, or 0 when molecule i lacks the pair.
#
# Every column starts as all zeros and only the entries that actually exist
# in each molecule's dict are written. This visits each stored
# (molecule, code) entry once instead of testing every code against every
# molecule, while producing the same dict of lists in the same column order.
n_molecules = len(pairFps_rev)
mask = {pair_code: [0] * n_molecules for pair_code in unique_apairs}
for row, apairs in enumerate(pairFps_rev):
    for pair_code, weight in apairs.items():
        # Codes outside `unique_apairs` have no column and are ignored,
        # exactly as in a per-code lookup.
        if pair_code in mask:
            mask[pair_code][row] = weight

In [16]:
# Turn the {atom-pair code: column values} mapping into a DataFrame with one
# row per molecule and one column per atom-pair code.
mask = pd.DataFrame(data=mask)

In [17]:
# Inspect the molecule-by-code weight table.
mask

Unnamed: 0,1590369,1066081,2115876,4211809,4213028,803938,1328226,1721442,1590370,1197154,...,1099043,1230115,705828,1099044,1230116,5785667,2115875,4213027,1198371,1198372
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
157481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
157482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
157483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [18]:
# Scale each molecule's row of pair weights by that molecule's `mu` value
# (rows of `mask` and `qm9` are matched on the index).
mu_values = qm9["mu"]
multiplied = mask.multiply(mu_values, axis="index")
multiplied

Unnamed: 0,1590369,1066081,2115876,4211809,4213028,803938,1328226,1721442,1590370,1197154,...,1099043,1230115,705828,1099044,1230116,5785667,2115875,4213027,1198371,1198372
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.813973,0.0,0.0,0.0,0.0
157481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
157482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
157483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [19]:
# Column-wise mean: one value per atom-pair code.
# NOTE(review): the mean runs over all rows, including molecules where the
# pair is absent (weight 0), so the value also reflects how common the pair
# is — confirm this is intended.
multiplied.mean()

1590369    0.272155
1066081    0.035778
2115876    0.005052
4211809    0.000015
4213028    0.005437
             ...   
5785667    0.000371
2115875    0.004135
4213027    0.004134
1198371    0.002166
1198372    0.003623
Length: 1089, dtype: float64

In [20]:
# Summary statistics of the per-code mean values.
multiplied.mean().describe()

count    1089.000000
mean        0.049365
std         0.096551
min        -0.004489
25%         0.000035
50%         0.005772
75%         0.052531
max         0.803480
dtype: float64

In [21]:
# Positional slice selecting the property columns: it skips the first column
# (`smiles`) and the last three (`num_atoms`, `mol`, `apairs`), so it depends
# on the column order built up by the earlier cells.
qm9.columns[1:-3]

Index(['mu', 'alpha', 'homo', 'lumo', 'gap', 'zpve', 'cv', 'u0', 'u298',
       'h298', 'g298'],
      dtype='object')

In [22]:
# For every property column, weight the pair table by that property and take
# the column-wise mean, giving one value per atom-pair code and property.
property_cols = qm9.columns[1:-3]
per_property_means = []
for col in property_cols:
    # The weighted frame is deliberately not bound to a name, so only one
    # large temporary exists at a time.
    per_property_means.append(mask.mul(qm9[col], axis=0).mean())

# Rows: atom-pair codes; columns: properties.
apair_desc = pd.concat(per_property_means, axis=1)
apair_desc.columns = property_cols
apair_desc

Unnamed: 0,mu,alpha,homo,lumo,gap,zpve,cv,u0,u298,h298,g298
1590369,0.272155,8.379943,-0.027654,0.002599,0.030254,0.017538,3.739596,-50.916594,-50.917141,-50.915448,-50.920346
1066081,0.035778,0.981727,-0.003184,0.000053,0.003237,0.001788,0.429354,-7.689053,-7.689531,-7.688904,-7.689478
2115876,0.005052,0.110241,-0.000388,-0.000079,0.000308,0.000149,0.045673,-0.817382,-0.817369,-0.817368,-0.817436
4211809,0.000015,0.010043,-0.000025,-0.000008,0.000017,0.000012,0.003408,-0.100048,-0.100048,-0.100047,-0.100052
4213028,0.005437,0.126272,-0.000570,0.000135,0.000705,0.000149,0.048249,-1.543282,-1.545939,-1.543122,-1.543194
...,...,...,...,...,...,...,...,...,...,...,...
5785667,0.000371,0.014477,-0.000053,0.000013,0.000067,0.000016,0.005150,-0.941465,-0.942098,-0.941446,-0.941454
2115875,0.004135,0.080581,-0.000271,-0.000058,0.000213,0.000109,0.033079,-0.582637,-0.582628,-0.582627,-0.582675
4213027,0.004134,0.089085,-0.000394,0.000095,0.000489,0.000107,0.034217,-1.096081,-1.097996,-1.095964,-1.096014
1198371,0.002166,0.047202,-0.000170,-0.000029,0.000140,0.000067,0.018491,-0.388271,-0.388397,-0.388260,-0.388288


In [23]:
# Summary statistics of the atom-pair descriptors, per property.
apair_desc.describe()

Unnamed: 0,mu,alpha,homo,lumo,gap,zpve,cv,u0,u298,h298,g298
count,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0
mean,0.049365,1.537361,-0.004834157,7.3e-05,0.004907469,0.002842747,0.637196,-10.336125,-10.337082,-10.33589,-10.336757
std,0.096551,2.952972,0.009277796,0.000991,0.009742666,0.005819546,1.233757,18.52854,18.529899,18.528116,18.529745
min,-0.004489,5.6e-05,-0.08067852,-0.004333,3.840048e-07,3.299997e-08,3.5e-05,-149.260188,-149.262576,-149.256884,-149.271158
25%,3.5e-05,0.056796,-0.004728144,-0.000109,0.0001592632,7.533504e-05,0.021883,-11.353234,-11.36,-11.352747,-11.353286
50%,0.005772,0.317764,-0.0009662986,-6e-06,0.0009031061,0.0004721299,0.124388,-2.628432,-2.628403,-2.628399,-2.62855
75%,0.052531,1.513653,-0.0001725855,3.1e-05,0.00461402,0.002543698,0.606518,-0.532606,-0.532598,-0.532597,-0.53264
max,0.80348,25.397385,-6.098359e-07,0.006493,0.08717192,0.05156667,10.664775,-0.000491,-0.000491,-0.000491,-0.000491


In [24]:
# Final check of the descriptor table before it is written to disk.
apair_desc

Unnamed: 0,mu,alpha,homo,lumo,gap,zpve,cv,u0,u298,h298,g298
1590369,0.272155,8.379943,-0.027654,0.002599,0.030254,0.017538,3.739596,-50.916594,-50.917141,-50.915448,-50.920346
1066081,0.035778,0.981727,-0.003184,0.000053,0.003237,0.001788,0.429354,-7.689053,-7.689531,-7.688904,-7.689478
2115876,0.005052,0.110241,-0.000388,-0.000079,0.000308,0.000149,0.045673,-0.817382,-0.817369,-0.817368,-0.817436
4211809,0.000015,0.010043,-0.000025,-0.000008,0.000017,0.000012,0.003408,-0.100048,-0.100048,-0.100047,-0.100052
4213028,0.005437,0.126272,-0.000570,0.000135,0.000705,0.000149,0.048249,-1.543282,-1.545939,-1.543122,-1.543194
...,...,...,...,...,...,...,...,...,...,...,...
5785667,0.000371,0.014477,-0.000053,0.000013,0.000067,0.000016,0.005150,-0.941465,-0.942098,-0.941446,-0.941454
2115875,0.004135,0.080581,-0.000271,-0.000058,0.000213,0.000109,0.033079,-0.582637,-0.582628,-0.582627,-0.582675
4213027,0.004134,0.089085,-0.000394,0.000095,0.000489,0.000107,0.034217,-1.096081,-1.097996,-1.095964,-1.096014
1198371,0.002166,0.047202,-0.000170,-0.000029,0.000140,0.000067,0.018491,-0.388271,-0.388397,-0.388260,-0.388288


In [25]:
# Persist the descriptor table; the index (atom-pair codes) is written
# together with the property columns.
apair_desc.to_csv("apair_db.csv")