In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.AtomPairs import Pairs

In [2]:
qm9 = pd.read_csv("qm_ext_plus.csv").drop(columns=["Unnamed: 0"])
qm9 = qm9[qm9["num_atoms"]>1]

In [3]:
qm9.reset_index(drop=True, inplace=True)
qm9["mol"] = qm9["smiles"].apply(Chem.MolFromSmiles)

In [4]:
qm9

Unnamed: 0,smiles,mu,alpha,homo,lumo,gap,zpve,cv,u0,u298,h298,g298,num_atoms,mol
0,C#C,0.000000,16.280000,-0.284500,0.050600,0.335100,0.026841,8.574,-77.308427,-77.305527,-77.304583,-77.327429,2,<rdkit.Chem.rdchem.Mol object at 0x7f06ace6da80>
1,C#N,2.893700,12.990000,-0.360400,0.019100,0.379600,0.016601,6.278,-93.411888,-93.409370,-93.408425,-93.431246,2,<rdkit.Chem.rdchem.Mol object at 0x7f066ab21af0>
2,C=O,2.108900,14.180000,-0.267000,-0.040600,0.226300,0.026603,6.413,-114.483613,-114.480746,-114.479802,-114.505268,2,<rdkit.Chem.rdchem.Mol object at 0x7f066ab21a80>
3,CC,0.000000,23.950000,-0.338500,0.104100,0.442600,0.074542,10.098,-79.764152,-79.760666,-79.759722,-79.787269,2,<rdkit.Chem.rdchem.Mol object at 0x7f066ab21d20>
4,CO,1.525800,16.970000,-0.265300,0.078400,0.343700,0.051208,8.751,-115.679136,-115.675816,-115.674872,-115.701876,2,<rdkit.Chem.rdchem.Mol object at 0x7f066ab21d90>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157480,C(CO)NCC(Br)(Br)Br,2.441919,100.240063,-0.380273,0.084513,0.464786,0.123667,39.094,-8003.248302,-8008.439552,-8003.111700,-8003.166923,9,<rdkit.Chem.rdchem.Mol object at 0x7f0667c61f50>
157481,C(COCC(Br)(Br)Br)O,2.372214,96.252555,-0.402736,0.078920,0.481656,0.111772,37.424,-8023.067844,-8028.305366,-8022.943597,-8022.997937,9,<rdkit.Chem.rdchem.Mol object at 0x7f0667c61fc0>
157482,CCCCCC(Br)(Br)Br,1.803498,111.998541,-0.396852,0.082008,0.478860,0.158363,41.049,-7951.461993,-7956.529100,-7951.290358,-7951.345676,9,<rdkit.Chem.rdchem.Mol object at 0x7f0667c62030>
157483,CCCOCC(Br)(Br)Br,1.786861,104.298872,-0.399003,0.081802,0.480805,0.134611,39.290,-7987.256407,-7992.411611,-7987.108848,-7987.164149,9,<rdkit.Chem.rdchem.Mol object at 0x7f0667c620a0>


In [5]:
qm9.describe()

Unnamed: 0,mu,alpha,homo,lumo,gap,zpve,cv,u0,u298,h298,g298,num_atoms
count,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0,157485.0
mean,2.366532,76.54909,-0.24156,0.0093,0.250859,0.144141,31.63767,-513.489239,-513.540183,-513.477594,-513.520611,8.802241
std,1.980167,9.04045,0.029902,0.047783,0.056878,0.034169,4.041923,493.826785,494.198114,493.817005,493.817479,0.495855
min,-9.0951,12.99,-0.474639,-0.175,0.0246,0.00682,6.278,-15549.220985,-15558.550559,-15549.164761,-15549.222211,2.0
25%,1.3944,71.08,-0.254,-0.0258,0.2108,0.121945,28.987,-457.734806,-457.727369,-457.726425,-457.766684,9.0
50%,2.3329,76.44,-0.241,0.00854,0.2447,0.145376,31.583,-422.939218,-422.930687,-422.929742,-422.971671,9.0
75%,3.5435,82.0,-0.2268,0.046,0.2879,0.168641,34.319,-388.298857,-388.289547,-388.288603,-388.332809,9.0
max,29.5564,196.62,-0.1017,0.1935,0.6221,0.273944,46.969,-77.308427,-77.305527,-77.304583,-77.327429,9.0


In [6]:
fpgen = rdFingerprintGenerator.GetAtomPairGenerator(1,4)
qm9["apairs"] = qm9["mol"].apply(lambda x: fpgen.GetSparseCountFingerprint(x).GetNonzeroElements())

In [7]:
pairFps = qm9["apairs"]

In [8]:
pairFps_rev = pairFps.apply(lambda x:[v for v in x.keys()])

In [9]:
pairs = []
for x in pairFps_rev:
    pairs.extend(x)
pairs = pd.Series(pairs)

In [10]:
(pairs.value_counts()>5).value_counts()

True     1024
False      65
dtype: int64

In [11]:
unique_apairs = [list(y.keys()) for y in pairFps]
unique_apairs = {x for y in unique_apairs for x in y} # flattening

In [12]:
len(unique_apairs)

1089

In [13]:
mask = {k:[] for k in unique_apairs}
for k, v in mask.items():
    for apairs in pairFps:
        if k in apairs.keys():
            mask[k].append(1/apairs[k])
        else:
            mask[k].append(0)

In [14]:
mask = pd.DataFrame(mask)

In [15]:
mask

Unnamed: 0,1590369,1066081,2115876,4211809,4213028,803938,1328226,1721442,1590370,1197154,...,1099043,1230115,705828,1099044,1230116,5785667,2115875,4213027,1198371,1198372
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
157481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
157482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
157483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [16]:
def generate_apairs_histograms(qmcol):
    multiplied = mask.mul(qm9[qmcol], axis=0)
    mus = []
    for col in multiplied.columns:
        idx = np.nonzero(multiplied[col].values)
        bins, edges = np.histogram(multiplied[col][idx[0]])
        mus.append(bins * np.diff(edges))
    mus = pd.DataFrame(mus)
    mus.columns = [f"{qmcol}_{x}" for x in range(10)]
    mus.index = multiplied.columns
    return mus

In [17]:
apairs = []
for qmcol in qm9.columns[1:-3]:
    apairs.append(generate_apairs_histograms(qmcol))
apairs = pd.concat(apairs, axis=1)

In [18]:
apairs

Unnamed: 0,mu_0,mu_1,mu_2,mu_3,mu_4,mu_5,mu_6,mu_7,mu_8,mu_9,...,g298_0,g298_1,g298_2,g298_3,g298_4,g298_5,g298_6,g298_7,g298_8,g298_9
1590369,209.440700,484.705620,24992.259530,27864.589130,1801.190020,17.952060,14.960050,14.960050,2.992010,5.984020,...,23770.113988,0.000000,0.000000,0.000000,0.000000,1584.674266,0.000000,792.337133,28524.136786,1.461862e+07
1066081,127.695840,383.087520,460.706560,1402.150400,2085.698720,628.463840,5.007680,7.511520,42.565280,30.046080,...,9429.184027,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,14929.541376,1.599032e+06
2115876,9.810360,30.832560,35.037000,25.927380,25.226640,28.029600,15.416280,12.613320,5.605920,2.802960,...,1221.632917,2982.221532,4060.132929,1006.050637,0.000000,0.000000,0.000000,215.582279,251.512659,7.186076e+01
4211809,0.492410,1.477230,0.492410,0.492410,1.969640,0.984820,0.000000,1.477230,0.984820,0.984820,...,54.994968,0.000000,0.000000,0.000000,0.000000,384.964777,549.949681,0.000000,0.000000,5.499497e+01
4213028,0.959340,13.430760,51.804360,58.519740,55.641720,36.454920,23.983500,15.349440,6.715380,2.878020,...,2512.960392,143.597737,0.000000,0.000000,0.000000,5743.909467,10410.835908,0.000000,430.793210,6.461898e+02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5785667,6.763762,9.018350,1.878823,0.000000,0.751529,0.375765,0.375765,0.375765,0.000000,1.127294,...,210.077949,30.011136,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,630.233847,7.802895e+02
2115875,8.600150,12.569450,13.892550,17.861850,18.523400,16.538750,15.215650,11.907900,9.923250,5.292400,...,1253.503061,2859.553859,2624.522035,509.235619,0.000000,0.000000,0.000000,156.687883,0.000000,3.133758e+02
4213027,12.288960,15.903360,17.349120,20.963520,23.132160,18.072000,10.120320,11.566080,7.228800,5.060160,...,2325.895560,0.000000,0.000000,0.000000,75.028889,6827.628902,4576.762231,0.000000,300.115556,6.002311e+02
1198371,4.123518,7.559784,9.621543,9.621543,15.119568,13.745061,9.621543,0.687253,0.687253,0.687253,...,1312.177884,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1312.177884,2.466894e+04


In [19]:
apairs.describe()

Unnamed: 0,mu_0,mu_1,mu_2,mu_3,mu_4,mu_5,mu_6,mu_7,mu_8,mu_9,...,g298_0,g298_1,g298_2,g298_3,g298_4,g298_5,g298_6,g298_7,g298_8,g298_9
count,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,...,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0
mean,46.204452,230.32089,2638.344072,3675.211642,1673.940764,548.46838,193.497913,87.191173,30.474611,8.337463,...,5320.873781,1010.639343,531.001167,3071.636569,5679.162015,11037.14,25172.26,22344.75,59226.4,2184044.0
std,113.554377,723.841119,12086.205386,10103.964807,4139.608748,1345.846486,481.39132,236.637149,68.585016,10.829747,...,11508.931386,3636.425396,2617.436971,11900.653275,22654.178121,64428.54,88444.41,84820.0,218887.9,5889792.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.40746,3.7551,5.57754,7.79772,10.8657,9.0208,5.33505,2.21423,0.98185,1.88699,...,154.10489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146.8219
50%,8.7909,21.80464,43.63788,76.71565,106.134665,64.60197,28.5234,15.15358,6.60716,4.3868,...,1125.647406,0.0,0.0,0.0,0.0,0.0,0.0,67.60044,1468.005,2002.672
75%,35.0252,134.22959,378.6596,1423.03464,986.70264,375.88911,129.28808,55.00308,24.89388,10.75182,...,5495.461044,34.772683,0.0,34.044136,71.657928,1587.929,3942.992,5555.785,19417.29,786002.4
max,1379.44875,13087.05416,179240.832,95631.5413,39446.11545,15984.2272,6472.03056,2476.92256,846.70312,111.76002,...,121696.300354,40431.148963,29160.024455,135225.45697,247803.326509,1406628.0,1019163.0,1332251.0,4632937.0,60805120.0


In [21]:
apairs.iloc[:, 0:10].value_counts()

mu_0         mu_1         mu_2          mu_3          mu_4        mu_5        mu_6      mu_7       mu_8       mu_9    
0.000000     0.000000     0.000000      0.000000      0.000000    0.100000    0.00000   0.000000   0.000000   0.000000    35
                                                                  0.000000    0.00000   0.000000   0.000000   0.000000     3
0.555280     0.000000     0.000000      0.000000      0.000000    0.000000    0.00000   1.110560   1.110560   0.555280     3
0.884820     0.000000     1.769640      0.884820      0.000000    0.884820    1.76964   6.193740   2.654460   1.769640     2
0.000000     0.000000     0.000000      0.000000      0.100000    0.000000    0.00000   0.000000   0.000000   0.000000     2
                                                                                                                          ..
4.695920     25.827560    42.263280     81.004620     127.963820  102.136260  77.48268  34.045420  11.739800  7.043880     1
4.7194

In [22]:
apairs.to_csv("apair_hist_db.csv")