In [53]:
!pip install rdkit_pypi



In [54]:
import numpy as np
import pandas as pd

from rdkit.Avalon import pyAvalonTools
from rdkit import Chem
from rdkit.Chem import Crippen, Descriptors, GraphDescriptors, Lipinski, QED, rdMolDescriptors, Fragments, FragmentMatcher
from rdkit.Chem.EState.EState_VSA import VSA_EState_
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from xgboost import XGBRegressor, XGBClassifier

In [55]:
classification_df = pd.read_csv('classification_pretrained_dataset.csv')
classification_df.head()

Unnamed: 0,Activity,MW,AlogP,PSA,#ROTB,#ALERTS,MlogP,#HeavyAtoms,#NHOH,#AromaticHeterocycles,...,A_502,A_503,A_504,A_505,A_506,A_507,A_508,A_509,A_510,A_511
0,0,398.4,3.7,141.31,6.0,1.0,3.7,28.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,520.48,-1.13,206.3,9.0,4.0,-1.13,36.0,4.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
2,0,305.5,3.2,33.12,0.0,1.0,3.2,13.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,342.27,2.11,73.1,6.0,5.0,2.11,19.0,2.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,461.47,-1.24,149.15,3.0,1.0,-1.24,33.0,5.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [56]:
X_class = classification_df.iloc[:, 1:].values

In [57]:
classification_scaler = StandardScaler().fit(X_class)

In [58]:
regression_df = pd.read_csv('regression_pretrained_dataset.csv')
regression_df.head()

Unnamed: 0,Class,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,c[X],E_1,E_2,E_3,E_4,E_5,E_6,E_7,E_8,Activity
0,0,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,...,0,0.0,0.0,52.176914,0.0,0.0,0.0,-6.391944,-1.451636,-1.6
1,0,275.34,1.67,4.0,3.0,106.39,3.0,2.0,1.67,77.32,...,0,0.0,19.201681,5.07883,12.969815,-0.134704,7.406556,0.0,1.464642,-1.57
2,0,415.29,4.98,5.0,2.0,75.11,4.0,1.0,4.98,90.27,...,0,76.40847,19.134407,11.887089,-2.200566,-1.765334,6.596278,-9.113484,-0.113526,-1.57
3,0,543.48,-5.6,17.0,8.0,285.14,9.0,1.0,-5.6,101.05,...,0,84.727417,11.389837,53.05147,0.0,-0.728697,-1.556118,-16.981647,-1.05756,-1.54
4,0,314.38,1.18,6.0,4.0,137.5,3.0,4.0,1.18,88.75,...,0,0.0,12.180389,16.443945,13.096557,0.335015,7.534264,1.814401,0.0,-1.54


In [59]:
X_reg = regression_df.iloc[:, :-1].values

In [60]:
regression_scaler = StandardScaler().fit(X_reg)

### **Data loading**

In [61]:
df = pd.read_csv('SMILES.txt', header=None, names=['SMILES'])
df.head()

Unnamed: 0,SMILES
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...
2,C#CCO
3,CCO
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12


### **RDKit descriptors calculation**

In [62]:
for i, row in df.iterrows():
    mol = Chem.MolFromSmiles(row.SMILES)
    qed_vector = QED.properties(mol)
    df.at[i, 'MW'] = round(qed_vector[0], 2)
    df.at[i, 'AlogP'] = round(qed_vector[1], 2)
    df.at[i, '#HBA'] = qed_vector[2]
    df.at[i, '#HBD'] = qed_vector[3]
    df.at[i, 'PSA'] = qed_vector[4]
    df.at[i, '#ROTB'] = qed_vector[5]
    df.at[i, '#ALERTS'] = qed_vector[7]

    df.at[i, 'MlogP'] = round(Crippen.MolLogP(mol), 2)
    df.at[i, '#MR'] = round(Crippen.MolMR(mol), 2)

df.head()

Unnamed: 0,SMILES,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...,399.44,2.87,6.0,1.0,83.09,5.0,0.0,2.87,109.23
2,C#CCO,56.06,-0.39,1.0,1.0,20.23,0.0,1.0,-0.39,15.92
3,CCO,46.07,-0.0,1.0,1.0,20.23,0.0,0.0,-0.0,12.76
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12,254.24,2.87,4.0,2.0,70.67,1.0,0.0,2.87,71.25


In [63]:
for i, row in df.iterrows():
    mol = Chem.MolFromSmiles(row.SMILES)
    df.at[i, '#HeavyAtoms'] = Lipinski.HeavyAtomCount(mol)
    df.at[i, '#NHOH'] = Lipinski.NHOHCount(mol)
    df.at[i, '#NO'] = Lipinski.NOCount(mol)
    df.at[i, '#AromaticCarbocycles'] = Lipinski.NumAromaticCarbocycles(mol)
    df.at[i, '#AromaticHeterocycles'] = Lipinski.NumAromaticHeterocycles(mol)
    df.at[i, '#Heteroatoms'] = Lipinski.NumHeteroatoms(mol)

df.head()

Unnamed: 0,SMILES,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,#HeavyAtoms,#NHOH,#NO,#AromaticCarbocycles,#AromaticHeterocycles,#Heteroatoms
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,12.0,6.0,6.0,0.0,0.0,6.0
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...,399.44,2.87,6.0,1.0,83.09,5.0,0.0,2.87,109.23,29.0,1.0,7.0,2.0,0.0,7.0
2,C#CCO,56.06,-0.39,1.0,1.0,20.23,0.0,1.0,-0.39,15.92,4.0,1.0,1.0,0.0,0.0,1.0
3,CCO,46.07,-0.0,1.0,1.0,20.23,0.0,0.0,-0.0,12.76,3.0,1.0,1.0,0.0,0.0,1.0
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12,254.24,2.87,4.0,2.0,70.67,1.0,0.0,2.87,71.25,19.0,2.0,4.0,2.0,1.0,4.0


In [64]:
for i, row in df.iterrows():
    mol = Chem.MolFromSmiles(row.SMILES)
    df.at[i, 'Morgan2'] =  round(Descriptors.FpDensityMorgan2(mol), 2)
    df.at[i, 'Morgan3'] =  round(Descriptors.FpDensityMorgan3(mol), 2)
    df.at[i, 'HeavyAtomMW'] =  round(Descriptors.HeavyAtomMolWt(mol), 2)
    df.at[i, 'MaxPartialCharge'] = Descriptors.MaxPartialCharge(mol)
    df.at[i, 'MinPartialCharge'] = Descriptors.MinPartialCharge(mol)
    df.at[i, '#ValenceElectrons'] = Descriptors.NumValenceElectrons(mol)

df.head()

Unnamed: 0,SMILES,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,#NO,#AromaticCarbocycles,#AromaticHeterocycles,#Heteroatoms,Morgan2,Morgan3,HeavyAtomMW,MaxPartialCharge,MinPartialCharge,#ValenceElectrons
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,...,6.0,0.0,0.0,6.0,0.92,1.08,168.06,0.110579,-0.393579,74.0
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...,399.44,2.87,6.0,1.0,83.09,5.0,0.0,2.87,109.23,...,7.0,2.0,0.0,7.0,1.66,2.24,374.24,0.220216,-0.492704,154.0
2,C#CCO,56.06,-0.39,1.0,1.0,20.23,0.0,1.0,-0.39,15.92,...,1.0,0.0,0.0,1.0,2.25,2.25,52.03,0.103468,-0.383666,22.0
3,CCO,46.07,-0.0,1.0,1.0,20.23,0.0,0.0,-0.0,12.76,...,1.0,0.0,0.0,1.0,2.0,2.0,40.02,0.040221,-0.396664,20.0
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12,254.24,2.87,4.0,2.0,70.67,1.0,0.0,2.87,71.25,...,4.0,2.0,1.0,4.0,1.58,2.32,244.16,0.199993,-0.507966,94.0


In [65]:
for i, row in df.iterrows():
    mol = Chem.MolFromSmiles(row.SMILES)
    df.at[i, 'BertzCT'] = round(GraphDescriptors.BertzCT(mol), 2)
    df.at[i, 'Kappa1'] = round(GraphDescriptors.Kappa1(mol), 2)
df.head()

Unnamed: 0,SMILES,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,#AromaticHeterocycles,#Heteroatoms,Morgan2,Morgan3,HeavyAtomMW,MaxPartialCharge,MinPartialCharge,#ValenceElectrons,BertzCT,Kappa1
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,...,0.0,6.0,0.92,1.08,168.06,0.110579,-0.393579,74.0,104.61,11.76
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...,399.44,2.87,6.0,1.0,83.09,5.0,0.0,2.87,109.23,...,0.0,7.0,1.66,2.24,374.24,0.220216,-0.492704,154.0,1001.39,20.51
2,C#CCO,56.06,-0.39,1.0,1.0,20.23,0.0,1.0,-0.39,15.92,...,0.0,1.0,2.25,2.25,52.03,0.103468,-0.383666,22.0,33.04,3.52
3,CCO,46.07,-0.0,1.0,1.0,20.23,0.0,0.0,-0.0,12.76,...,0.0,1.0,2.0,2.0,40.02,0.040221,-0.396664,20.0,2.75,2.96
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12,254.24,2.87,4.0,2.0,70.67,1.0,0.0,2.87,71.25,...,1.0,4.0,1.58,2.32,244.16,0.199993,-0.507966,94.0,800.48,11.35


In [66]:
df_to_class_label_pred = df.copy()

### **Class label prediction**

In [67]:
def generate_AVfpts(data):
    Avalon_fpts = []
    mols = [Chem.MolFromSmiles(x) for x in data if x is not None]
    for mol in tqdm(mols):
        avfpts = pyAvalonTools.GetAvalonFP(mol, nBits=512)
        Avalon_fpts.append(avfpts)
    return np.array(Avalon_fpts)

In [68]:
Avalon_fpts = generate_AVfpts(df_to_class_label_pred['SMILES'])

100%|██████████| 50/50 [00:00<00:00, 732.71it/s]


In [69]:
for i, row in df_to_class_label_pred.iterrows():
    for j in range(Avalon_fpts.shape[1]):
        df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]

df_to_class_label_pred.head()

  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpts[i][j]
  df_to_class_label_pred.at[i, f'A_{j+1}'] = Avalon_fpt

Unnamed: 0,SMILES,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,A_503,A_504,A_505,A_506,A_507,A_508,A_509,A_510,A_511,A_512
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...,399.44,2.87,6.0,1.0,83.09,5.0,0.0,2.87,109.23,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
2,C#CCO,56.06,-0.39,1.0,1.0,20.23,0.0,1.0,-0.39,15.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CCO,46.07,-0.0,1.0,1.0,20.23,0.0,0.0,-0.0,12.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12,254.24,2.87,4.0,2.0,70.67,1.0,0.0,2.87,71.25,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [70]:
df_to_class_label_pred = df_to_class_label_pred[['MW', 'AlogP', 'PSA', '#ROTB', '#ALERTS', 'MlogP', '#HeavyAtoms',
                                                 '#NHOH', '#AromaticHeterocycles', '#Heteroatoms', 'Morgan3', 'HeavyAtomMW',
                                                 'MaxPartialCharge', 'MinPartialCharge', 'Kappa1', 'A_1', 'A_3', 'A_4',
                                                 'A_5', 'A_6', 'A_7', 'A_11', 'A_13', 'A_16', 'A_19', 'A_20', 'A_21', 'A_24', 'A_25',
                                                 'A_26', 'A_27', 'A_28', 'A_29', 'A_30', 'A_32', 'A_33', 'A_35', 'A_36', 'A_38', 'A_39',
                                                 'A_42', 'A_43', 'A_44', 'A_45', 'A_46', 'A_49', 'A_50', 'A_51', 'A_52', 'A_53', 'A_56',
                                                 'A_57', 'A_58', 'A_59', 'A_60', 'A_61', 'A_62', 'A_63', 'A_64', 'A_66', 'A_68', 'A_70',
                                                 'A_71', 'A_72', 'A_73', 'A_74', 'A_76', 'A_77', 'A_78', 'A_79', 'A_80', 'A_83', 'A_84',
                                                 'A_85', 'A_86', 'A_87', 'A_88', 'A_90', 'A_91', 'A_92', 'A_93', 'A_94', 'A_96', 'A_97',
                                                 'A_98', 'A_99', 'A_100', 'A_101', 'A_103', 'A_105', 'A_106', 'A_108', 'A_111', 'A_112',
                                                 'A_113', 'A_114', 'A_115', 'A_116', 'A_120', 'A_121', 'A_122', 'A_123', 'A_124', 'A_125',
                                                 'A_126', 'A_127', 'A_128', 'A_130', 'A_133', 'A_134', 'A_135', 'A_136', 'A_137', 'A_140',
                                                 'A_141', 'A_142', 'A_143', 'A_144', 'A_147', 'A_148', 'A_149', 'A_150', 'A_151', 'A_152',
                                                 'A_153', 'A_154', 'A_156', 'A_157', 'A_158', 'A_159', 'A_160', 'A_161', 'A_163', 'A_164',
                                                 'A_165', 'A_166', 'A_167', 'A_168', 'A_169', 'A_171', 'A_172', 'A_173', 'A_174', 'A_175',
                                                 'A_177', 'A_178', 'A_179', 'A_181', 'A_183', 'A_185', 'A_188', 'A_190', 'A_191', 'A_192',
                                                 'A_194', 'A_196', 'A_197', 'A_198', 'A_200', 'A_201', 'A_204', 'A_205', 'A_207', 'A_208',
                                                 'A_209', 'A_210', 'A_212', 'A_213', 'A_214', 'A_215', 'A_216', 'A_217', 'A_218', 'A_219',
                                                 'A_220', 'A_221', 'A_223', 'A_225', 'A_226', 'A_227', 'A_228', 'A_229', 'A_230', 'A_234',
                                                 'A_237', 'A_238', 'A_239', 'A_241', 'A_242', 'A_244', 'A_245', 'A_247', 'A_248', 'A_249',
                                                 'A_250', 'A_251', 'A_252', 'A_253', 'A_254', 'A_257', 'A_259', 'A_260', 'A_262', 'A_264',
                                                 'A_265', 'A_267', 'A_268', 'A_270', 'A_271', 'A_273', 'A_274', 'A_275', 'A_276', 'A_277',
                                                 'A_278', 'A_281', 'A_283', 'A_284', 'A_285', 'A_287', 'A_288', 'A_290', 'A_291', 'A_292',
                                                 'A_293', 'A_294', 'A_295', 'A_296', 'A_297', 'A_299', 'A_300', 'A_303', 'A_304', 'A_305',
                                                 'A_307', 'A_308', 'A_309', 'A_311', 'A_312', 'A_313', 'A_314', 'A_315', 'A_316', 'A_317',
                                                 'A_320', 'A_322', 'A_324', 'A_325', 'A_326', 'A_329', 'A_330', 'A_333', 'A_334', 'A_336',
                                                 'A_338', 'A_340', 'A_342', 'A_343', 'A_344', 'A_345', 'A_347', 'A_348', 'A_349', 'A_350',
                                                 'A_351', 'A_352', 'A_353', 'A_356', 'A_357', 'A_358', 'A_359', 'A_362', 'A_363', 'A_365',
                                                 'A_366', 'A_369', 'A_371', 'A_372', 'A_373', 'A_374', 'A_375', 'A_376', 'A_377', 'A_379',
                                                 'A_382', 'A_385', 'A_386', 'A_390', 'A_391', 'A_393', 'A_394', 'A_396', 'A_397', 'A_398',
                                                 'A_399', 'A_400', 'A_402', 'A_403', 'A_404', 'A_405', 'A_406', 'A_407', 'A_408', 'A_410',
                                                 'A_411', 'A_412', 'A_414', 'A_415', 'A_417', 'A_418', 'A_419', 'A_420', 'A_421', 'A_422',
                                                 'A_423', 'A_424', 'A_425', 'A_427', 'A_428', 'A_429', 'A_430', 'A_432', 'A_433', 'A_434',
                                                 'A_435', 'A_437', 'A_438', 'A_440', 'A_441', 'A_442', 'A_443', 'A_444', 'A_445', 'A_447',
                                                 'A_448', 'A_450', 'A_452', 'A_453', 'A_454', 'A_455', 'A_456', 'A_457', 'A_458', 'A_459',
                                                 'A_460', 'A_461', 'A_462', 'A_463', 'A_464', 'A_465', 'A_466', 'A_468', 'A_469', 'A_470',
                                                 'A_471', 'A_473', 'A_474', 'A_475', 'A_476', 'A_477', 'A_478', 'A_479', 'A_480', 'A_481',
                                                 'A_482', 'A_483', 'A_484', 'A_485', 'A_486', 'A_487', 'A_488', 'A_489', 'A_490', 'A_491',
                                                 'A_492', 'A_493', 'A_494', 'A_495', 'A_496', 'A_497', 'A_498', 'A_499', 'A_500', 'A_501',
                                                 'A_502', 'A_503', 'A_504', 'A_505', 'A_506', 'A_507', 'A_508', 'A_509', 'A_510', 'A_511'
 ]
]

In [71]:
df_to_class_label_pred.head()

Unnamed: 0,MW,AlogP,PSA,#ROTB,#ALERTS,MlogP,#HeavyAtoms,#NHOH,#AromaticHeterocycles,#Heteroatoms,...,A_502,A_503,A_504,A_505,A_506,A_507,A_508,A_509,A_510,A_511
0,182.17,-3.59,121.38,5.0,0.0,-3.59,12.0,6.0,0.0,6.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,399.44,2.87,83.09,5.0,0.0,2.87,29.0,1.0,0.0,7.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,56.06,-0.39,20.23,0.0,1.0,-0.39,4.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,46.07,-0.0,20.23,0.0,0.0,-0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,254.24,2.87,70.67,1.0,0.0,2.87,19.0,2.0,1.0,4.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [72]:
classification_model = XGBClassifier()
classification_model.load_model('best_classification_model.json')

In [73]:
X = df_to_class_label_pred.iloc[:, :].values

In [74]:
X = classification_scaler.transform(X)

In [75]:
y_pred = classification_model.predict(X)

In [76]:
df['Class'] = y_pred

In [77]:
df[['SMILES', 'Class']].to_csv('Class_labels.csv', index=False)

### **LogBB prediction**

In [78]:
uniq_el = ['N', 'P', 'F', 'I', 'S', 'Br', 'Cl', 'B', 'O', 'C']
uniq_bond = ['DOUBLE', 'SINGLE', 'TRIPLE', 'AROMATIC']


In [79]:
df_sm = df['SMILES']

In [80]:
list_dop_descript = []
for mol_i in tqdm(df_sm):
    mol_sm = Chem.MolFromSmiles(mol_i)

    num_atom = mol_sm.GetNumAtoms()

    # the amount of each element in the molecule
    list_atomic_num = [x.GetSymbol() for x in mol_sm.GetAtoms()]
    list_count_atom = [list_atomic_num.count(x) for x in uniq_el]

    # number of each bond type and bond analysis
    list_bond = list(np.array([x.GetBondType() for x in mol_sm.GetBonds()], dtype=str))
    list_count_bond = [list_bond.count(x) for x in uniq_bond]
    RotB = Descriptors.NumRotatableBonds(mol_sm)
    ValEl = Descriptors.NumValenceElectrons(mol_sm)

    # analysis of rings and cycles
    RingCount = Descriptors.RingCount(mol_sm)
    NAr = Descriptors.NumAromaticRings(mol_sm)
    NSat = Descriptors.NumSaturatedRings(mol_sm)
    NAl = Descriptors.NumAliphaticRings(mol_sm)
    NArCH = Descriptors.NumAromaticHeterocycles(mol_sm)
    NArCC = Descriptors.NumAromaticCarbocycles(mol_sm)
    NSatCH = Descriptors.NumSaturatedHeterocycles(mol_sm)
    NSatCC = Descriptors.NumSaturatedCarbocycles(mol_sm)
    NAlCH = Descriptors.NumAliphaticHeterocycles(mol_sm)
    NAlCC = Descriptors.NumAliphaticCarbocycles(mol_sm)

    #analysis of some functional groups
    NHOH = Descriptors.NHOHCount(mol_sm)
    HAccept = Descriptors.NumHAcceptors(mol_sm)
    HDon = Descriptors.NumHDonors(mol_sm)
    SP3 = Descriptors.FractionCSP3(mol_sm)

    AlCOO = Fragments.fr_Al_COO(mol_sm)
    AlOH = Fragments.fr_Al_OH(mol_sm)
    AlOHnt = Fragments.fr_Al_OH_noTert(mol_sm)
    ArN = Fragments.fr_ArN(mol_sm)
    ArCOO = Fragments.fr_Ar_COO(mol_sm)
    Ar_N = Fragments.fr_Ar_N(mol_sm)
    ArNH = Fragments.fr_Ar_NH(mol_sm)
    ArOH = Fragments.fr_Ar_OH(mol_sm)
    COO = Fragments.fr_COO(mol_sm)
    CO = Fragments.fr_C_O(mol_sm)
    CO_clean = Fragments.fr_C_O_noCOO(mol_sm)
    CS = Fragments.fr_C_S(mol_sm)
    HOCCN = Fragments.fr_HOCCN(mol_sm)
    Imine = Fragments.fr_Imine(mol_sm)
    NHtert = Fragments.fr_NH0(mol_sm)
    NHsec = Fragments.fr_NH1(mol_sm)
    NHpri = Fragments.fr_NH2(mol_sm)
    NO = Fragments.fr_N_O(mol_sm)
    XCCNR = Fragments.fr_Ndealkylation1(mol_sm)
    TACA = Fragments.fr_Ndealkylation2(mol_sm)
    Npyrrl = Fragments.fr_Nhpyrrole(mol_sm)
    SH = Fragments.fr_SH(mol_sm)
    Ald = Fragments.fr_aldehyde(mol_sm)
    Alk_carb = Fragments.fr_alkyl_carbamate(mol_sm)
    Alk_hal = Fragments.fr_alkyl_halide(mol_sm)
    Alk_oxid = Fragments.fr_allylic_oxid(mol_sm)
    Amide = Fragments.fr_amide(mol_sm)
    Amidine = Fragments.fr_amidine(mol_sm)
    Aniline = Fragments.fr_aniline(mol_sm)
    Ar_methyl = Fragments.fr_aryl_methyl(mol_sm)
    Azide = Fragments.fr_azide(mol_sm)
    Azo = Fragments.fr_azo(mol_sm)
    Barb = Fragments.fr_barbitur(mol_sm)
    Benz = Fragments.fr_benzene(mol_sm)
    BenzD = Fragments.fr_benzodiazepine(mol_sm)
    Bicycl = Fragments.fr_bicyclic(mol_sm)
    Diazo = Fragments.fr_diazo(mol_sm)
    DHpyridine = Fragments.fr_dihydropyridine(mol_sm)
    Epoxide = Fragments.fr_epoxide(mol_sm)
    Ester = Fragments.fr_ester(mol_sm)
    EtherO = Fragments.fr_ether(mol_sm)
    Furan = Fragments.fr_furan(mol_sm)
    Guanid = Fragments.fr_guanido(mol_sm)
    Hal = Fragments.fr_halogen(mol_sm)
    Hdrzine = Fragments.fr_hdrzine(mol_sm)
    Hdrzone = Fragments.fr_hdrzone(mol_sm)
    Imidazole = Fragments.fr_imidazole(mol_sm)
    Imide = Fragments.fr_imide(mol_sm)
    RNCO = Fragments.fr_isocyan(mol_sm)
    RNCS = Fragments.fr_isothiocyan(mol_sm)
    Ket = Fragments.fr_ketone(mol_sm)
    Ket2 = Fragments.fr_ketone_Topliss(mol_sm)
    Lactam = Fragments.fr_lactam(mol_sm)
    Lactone = Fragments.fr_lactone(mol_sm)
    OMe = Fragments.fr_methoxy(mol_sm)
    Morphlne = Fragments.fr_morpholine(mol_sm)
    RCN = Fragments.fr_nitrile(mol_sm)
    Nitro = Fragments.fr_nitro(mol_sm)
    NitroAr = Fragments.fr_nitro_arom(mol_sm)
    NitroArno = Fragments.fr_nitro_arom_nonortho(mol_sm)
    RNO = Fragments.fr_nitroso(mol_sm)
    Oxaz = Fragments.fr_oxazole(mol_sm)
    Oxime = Fragments.fr_oxime(mol_sm)
    paraHX = Fragments.fr_para_hydroxylation(mol_sm)
    PhOH = Fragments.fr_phenol(mol_sm)
    PhOHno = Fragments.fr_phenol_noOrthoHbond(mol_sm)
    PAcide = Fragments.fr_phos_acid(mol_sm)
    PEster = Fragments.fr_phos_ester(mol_sm)
    Piprdn = Fragments.fr_piperdine(mol_sm)
    Piprzn = Fragments.fr_piperzine(mol_sm)
    Priamide = Fragments.fr_priamide(mol_sm)
    Prisulfamd = Fragments.fr_prisulfonamd(mol_sm)
    PyrRing = Fragments.fr_pyridine(mol_sm)
    Nquat = Fragments.fr_quatN(mol_sm)
    Sether = Fragments.fr_sulfide(mol_sm)
    Sulfamd = Fragments.fr_sulfonamd(mol_sm)
    Sulfone = Fragments.fr_sulfone(mol_sm)
    TermCHCH = Fragments.fr_term_acetylene(mol_sm)
    Tetrazol = Fragments.fr_tetrazole(mol_sm)
    Thiazole = Fragments.fr_thiazole(mol_sm)
    RSCN = Fragments.fr_thiocyan(mol_sm)
    Thioph = Fragments.fr_thiophene(mol_sm)
    UNb_alk = Fragments.fr_unbrch_alkane(mol_sm)
    Urea = Fragments.fr_urea(mol_sm)

    #analysis of the molecule as a whole
    TPSA = Descriptors.TPSA(mol_sm)
    LogP = Descriptors.MolLogP(mol_sm)

    # LabuteASA = Descriptors.LabuteASA(mol_sm)
    # SlogP_VSA0 = Descriptors.SlogP_VSA0(mol_sm)
    # print(type(SlogP_VSA1),SlogP_VSA1, LogP)

    list_desc = list_count_atom + list_count_bond + [RotB, ValEl, num_atom, RingCount, NAr, NSat, NAl,
               NArCH, NArCC, NSatCH, NSatCC, NAlCH, NAlCC, NHOH, HAccept, HDon, SP3,
               TPSA, LogP,
               AlCOO, AlOH, AlOHnt, ArN, ArCOO, Ar_N, ArNH, ArOH, COO, CO, CO_clean, CS, HOCCN,
               Imine, NHtert, NHsec, NHpri, NO, XCCNR, TACA, Npyrrl, SH, Ald, Alk_carb, Alk_hal,
               Alk_oxid, Amide, Amidine, Aniline, Ar_methyl, Azide, Azo, Barb, Benz, BenzD,
               Bicycl, Diazo, DHpyridine, Epoxide, Ester, EtherO, Furan, Guanid, Hal, Hdrzine,
               Hdrzone, Imidazole, Imide, RNCO, RNCS, Ket, Ket2, Lactam, Lactone, OMe, Morphlne,
               RCN, Nitro, NitroAr, NitroArno, RNO, Oxaz, Oxime, paraHX, PhOH, PhOHno, PAcide,
               PEster, Piprdn, Piprzn, Priamide, Prisulfamd, PyrRing, Nquat, Sether, Sulfamd,
               Sulfone, TermCHCH, Tetrazol, Thiazole, RSCN, Thioph, UNb_alk, Urea]
    list_dop_descript.append(list_desc)

list_dop_descript = np.array(list_dop_descript)

100%|██████████| 50/50 [00:00<00:00, 201.28it/s]


In [81]:
dop_desc_name = uniq_el + uniq_bond + ['RotB', 'ValEl', 'num_atom', 'RingCount', 'NAr', 'NSat', 'NAl',
            'NArCH', 'NArCC', 'NSatCH', 'NSatCC', 'NAlCH', 'NAlCC', 'NHOH', 'HAccept', 'HDon', 'SP3',
            'TPSA', 'LogP',
            'AlCOO', 'AlOH', 'AlOHnt', 'ArN', 'ArCOO', 'Ar_N', 'ArNH', 'ArOH', 'COO',
            'CO', 'CO_clean', 'CS', 'HOCCN',
            'Imine', 'NHtert', 'NHsec', 'NHpri', 'NO', 'XCCNR', 'TACA', 'Npyrrl', 'SH',
            'Ald', 'Alk_carb', 'Alk_hal',
            'Alk_oxid', 'Amide', 'Amidine', 'Aniline', 'Ar_methyl', 'Azide', 'Azo', 'Barb', 'Benz', 'BenzD',
            'Bicycl', 'Diazo', 'DHpyridine', 'Epoxide', 'Ester',
            'EtherO', 'Furan', 'Guanid', 'Hal', 'Hdrzine',
            'Hdrzone', 'Imidazole', 'Imide', 'RNCO', 'RNCS', 'Ket',
            'Ket2', 'Lactam', 'Lactone', 'OMe', 'Morphlne',
            'RCN', 'Nitro', 'NitroAr', 'NitroArno', 'RNO', 'Oxaz', 'Oxime', 'paraHX', 'PhOH', 'PhOHno', 'PAcide',
            'PEster', 'Piprdn', 'Piprzn', 'Priamide', 'Prisulfamd', 'PyrRing', 'Nquat', 'Sether', 'Sulfamd',
            'Sulfone', 'TermCHCH', 'Tetrazol', 'Thiazole', 'RSCN', 'Thioph', 'UNb_alk', 'Urea']

dop_desc_name_new = [str(x) + '_dop' for x in dop_desc_name]
print(dop_desc_name_new)


['N_dop', 'P_dop', 'F_dop', 'I_dop', 'S_dop', 'Br_dop', 'Cl_dop', 'B_dop', 'O_dop', 'C_dop', 'DOUBLE_dop', 'SINGLE_dop', 'TRIPLE_dop', 'AROMATIC_dop', 'RotB_dop', 'ValEl_dop', 'num_atom_dop', 'RingCount_dop', 'NAr_dop', 'NSat_dop', 'NAl_dop', 'NArCH_dop', 'NArCC_dop', 'NSatCH_dop', 'NSatCC_dop', 'NAlCH_dop', 'NAlCC_dop', 'NHOH_dop', 'HAccept_dop', 'HDon_dop', 'SP3_dop', 'TPSA_dop', 'LogP_dop', 'AlCOO_dop', 'AlOH_dop', 'AlOHnt_dop', 'ArN_dop', 'ArCOO_dop', 'Ar_N_dop', 'ArNH_dop', 'ArOH_dop', 'COO_dop', 'CO_dop', 'CO_clean_dop', 'CS_dop', 'HOCCN_dop', 'Imine_dop', 'NHtert_dop', 'NHsec_dop', 'NHpri_dop', 'NO_dop', 'XCCNR_dop', 'TACA_dop', 'Npyrrl_dop', 'SH_dop', 'Ald_dop', 'Alk_carb_dop', 'Alk_hal_dop', 'Alk_oxid_dop', 'Amide_dop', 'Amidine_dop', 'Aniline_dop', 'Ar_methyl_dop', 'Azide_dop', 'Azo_dop', 'Barb_dop', 'Benz_dop', 'BenzD_dop', 'Bicycl_dop', 'Diazo_dop', 'DHpyridine_dop', 'Epoxide_dop', 'Ester_dop', 'EtherO_dop', 'Furan_dop', 'Guanid_dop', 'Hal_dop', 'Hdrzine_dop', 'Hdrzone_dop'

In [82]:
df_dop_desc = pd.DataFrame(list_dop_descript, columns=np.array(dop_desc_name_new, dtype=str))

In [83]:
columns = list(df_dop_desc.columns.values)
columns = [columns[-1]] + columns[:-1]
df_dop_desc = df_dop_desc.loc[:, columns]
df_dop_desc.head(5)

Unnamed: 0,Urea_dop,N_dop,P_dop,F_dop,I_dop,S_dop,Br_dop,Cl_dop,B_dop,O_dop,...,Nquat_dop,Sether_dop,Sulfamd_dop,Sulfone_dop,TermCHCH_dop,Tetrazol_dop,Thiazole_dop,RSCN_dop,Thioph_dop,UNb_alk_dop
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
fragments_list = np.load('BBB_features.npy')
fgraments_list = fragments_list[-474:]
fgraments_list

array(['cc-c(c)ccc', 'ccccs', 'ccccN', 'cccnc', 'cccc(c)-cn', 'csccn',
       'c1cscn1', 'ccsc(n)N', 'cncN', 'cnc(c)-c', 'cccc-c(c)n', 'ccncN',
       'ccc(c)-c(c)n', 'ccc-c(c)nc', 'cccncN', 'ccc(c)-cnc',
       'cnc(c)-c(c)c', 'ccccNC', 'cc(c)CN', 'cnccn', 'ccccn', 'cccc(c)nc',
       'cncNCC', 'cccc(c)CN', 'cnc(c)cn', 'ccccnc', 'ccc(cc)CN',
       'cnc(c)c(c)n', 'cccncNC', 'ccccncN', 'CCO', 'CCOC', 'COC(C)O',
       'CCCO', 'OCCO', 'CCC(C)O', 'CC[C@@H](C)O', 'CCCOC', 'CO[C@H](C)C',
       'CCCOCC', 'cccc(c)N', 'ccccc(c)N', 'cc(-c)ccN', 'ccccc(c)O',
       'cccc(c)CC', 'cccc(c)OC', 'CCC(C)C', 'ccc(c)CCC', 'ccc(c)OCC',
       'cc(c)CCCC', 'CCC(C)CC', 'cc(c)OCCC', 'CCCCO', 'CCC(C)COc',
       'cc(O)cccC', 'cccco', 'cccc(c)cc', 'ccc(cc)OC', 'ccccNCC',
       'cccc(c)NC', 'ccc(N)nc', 'ccccN(C)C', 'CNCCN', 'CCN(C)C',
       'ccc(c)CNC', 'ccc(c)NCC', 'cccncc', 'ccc(n)NCC', 'cc(n)N(C)CC',
       'CNCCN(c)C', 'cccN(C)CC', 'ccc(c)N(C)C', 'CCNCCN', 'CCNCC',
       'CCN(C)CC', 'cc(c)CNCC', 'cc(c

In [85]:
list_main_descript = []
for mol_i in tqdm(df_sm):
    mol_sm = Chem.MolFromSmiles(mol_i)
    mol_frags_list = []
    for ii in range(len(fgraments_list)):
        p = FragmentMatcher.FragmentMatcher()
        p.Init(fgraments_list[ii])
        mol_frags_list.append(len(p.GetMatches(mol_sm)))
    list_main_descript.append(mol_frags_list)

list_main_descript = np.array(list_main_descript)
print(list_main_descript.shape)

100%|██████████| 50/50 [00:00<00:00, 73.10it/s]

(50, 474)





In [86]:
df_main_desc = pd.DataFrame(list_main_descript, columns=fgraments_list)
df_main_desc.describe()

Unnamed: 0,cc-c(c)ccc,ccccs,ccccN,cccnc,cccc(c)-cn,csccn,c1cscn1,ccsc(n)N,cncN,cnc(c)-c,...,CCCCC(c)N,ccC(N)CCC,CNC(C)C(c)N,ccc(S)c(c)N,cc(S)c(c)NC,CCC(c)[C@@H](C)C,CCCCNCN,CC(C)CNCN,C[X],c[X]
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,...,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.42,0.04,0.74,1.18,0.18,0.02,0.02,0.02,0.22,0.12,...,0.0,0.0,0.0,0.04,0.04,0.02,0.04,0.02,0.22,0.1
std,1.61738,0.282843,1.536096,2.421755,0.896478,0.141421,0.141421,0.141421,0.648074,0.435187,...,0.0,0.0,0.0,0.282843,0.282843,0.141421,0.282843,0.141421,0.763718,0.364216
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8.0,2.0,6.0,12.0,6.0,1.0,1.0,1.0,3.0,2.0,...,0.0,0.0,0.0,2.0,2.0,1.0,2.0,1.0,3.0,2.0


In [87]:
df_full = pd.concat([df_dop_desc, df_main_desc], axis=1)
df_full.shape

(50, 591)

In [88]:
df = pd.concat([df, df_full], axis=1)
df.head()

Unnamed: 0,SMILES,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,CCCCC(c)N,ccC(N)CCC,CNC(C)C(c)N,ccc(S)c(c)N,cc(S)c(c)NC,CCC(c)[C@@H](C)C,CCCCNCN,CC(C)CNCN,C[X],c[X]
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,...,0,0,0,0,0,0,0,0,0,0
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...,399.44,2.87,6.0,1.0,83.09,5.0,0.0,2.87,109.23,...,0,0,0,0,0,0,0,0,0,0
2,C#CCO,56.06,-0.39,1.0,1.0,20.23,0.0,1.0,-0.39,15.92,...,0,0,0,0,0,0,0,0,0,0
3,CCO,46.07,-0.0,1.0,1.0,20.23,0.0,0.0,-0.0,12.76,...,0,0,0,0,0,0,0,0,0,0
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12,254.24,2.87,4.0,2.0,70.67,1.0,0.0,2.87,71.25,...,0,0,0,0,0,0,0,0,0,0


In [89]:
def generate_estate_fpts(data):
    estate_fpts = []
    mols = [Chem.MolFromSmiles(x) for x in data if x is not None]
    for mol in tqdm(mols):
        estatefpts = VSA_EState_(mol)
        estate_fpts.append(estatefpts)
    return np.array(estate_fpts)

In [90]:
estate_fpts = generate_estate_fpts(df['SMILES'])

100%|██████████| 50/50 [00:00<00:00, 734.63it/s]


In [91]:
for i, row in df.iterrows():
    for j in range(estate_fpts.shape[1]):
        df.at[i, f'E_{j+1}'] = estate_fpts[i][j]

df.head()

Unnamed: 0,SMILES,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,E_1,E_2,E_3,E_4,E_5,E_6,E_7,E_8,E_9,E_10
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,...,0.0,0.0,52.176914,0.0,0.0,0.0,-6.391944,-1.451636,0.0,0.0
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...,399.44,2.87,6.0,1.0,83.09,5.0,0.0,2.87,109.23,...,21.995594,24.480585,2.964085,3.01027,1.604915,6.569938,1.268552,1.464264,6.141799,0.0
2,C#CCO,56.06,-0.39,1.0,1.0,20.23,0.0,1.0,-0.39,15.92,...,0.0,0.0,7.638889,0.0,1.986111,0.0,4.527778,-0.152778,0.0,0.0
3,CCO,46.07,-0.0,1.0,1.0,20.23,0.0,0.0,-0.0,12.76,...,0.0,0.0,7.569444,0.0,0.0,0.0,0.0,1.930556,0.0,0.0
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12,254.24,2.87,4.0,2.0,70.67,1.0,0.0,2.87,71.25,...,5.366593,12.320988,18.99992,1.249243,0.186928,10.685367,1.357627,0.0,0.0,0.0


In [92]:
df = df[['SMILES', 'Class', 'MW', 'AlogP', '#HBA', '#HBD', 'PSA', '#ROTB', '#ALERTS', 'MlogP', '#MR', '#HeavyAtoms', '#NHOH', '#NO',
         '#AromaticCarbocycles', '#AromaticHeterocycles', '#Heteroatoms', 'Morgan2', 'HeavyAtomMW', 'MaxPartialCharge', 'MinPartialCharge',
         '#ValenceElectrons', 'BertzCT', 'Kappa1', 'N_dop', 'Br_dop', 'C_dop', 'O_dop', 'S_dop', 'F_dop', 'Cl_dop', 'AROMATIC_dop',
         'SINGLE_dop', 'TRIPLE_dop', 'num_atom_dop', 'AlCOO_dop', 'AlOH_dop', 'AlOHnt_dop', 'ArN_dop', 'Ar_N_dop', 'ArNH_dop', 'ArOH_dop',
         'COO_dop', 'CO_dop', 'CO_clean_dop', 'Imine_dop', 'NHtert_dop', 'NHsec_dop', 'NHpri_dop', 'XCCNR_dop', 'TACA_dop', 'Npyrrl_dop',
         'Alk_hal_dop', 'Alk_oxid_dop', 'Amide_dop', 'Aniline_dop', 'Ar_methyl_dop', 'Bicycl_dop', 'Ester_dop', 'EtherO_dop', 'Furan_dop',
         'Imidazole_dop', 'Imide_dop', 'Ket_dop', 'OMe_dop', 'RCN_dop', 'Nitro_dop', 'paraHX_dop', 'PhOH_dop', 'PhOHno_dop', 'Piprzn_dop',
         'Priamide_dop', 'Sulfamd_dop', 'Urea_dop', 'cc-c(c)ccc', 'ccccs', 'csccn', 'c1cscn1', 'ccsc(n)N', 'cncN', 'cnc(c)-c', 'cccc-c(c)n',
         'ccncN', 'ccc(c)-c(c)n', 'ccc(c)-cnc', 'cnc(c)-c(c)c', 'ccccNC', 'cc(c)CN', 'cnccn', 'ccccn', 'cccc(c)nc', 'cncNCC', 'cccc(c)CN',
         'ccccnc', 'ccc(cc)CN', 'cccncNC', 'ccccncN', 'CCOC', 'COC(C)O', 'CCCO', 'OCCO', 'CCC(C)O', 'CC[C@@H](C)O', 'CCCOC', 'CO[C@H](C)C',
         'CCCOCC', 'cccc(c)N', 'ccccc(c)N', 'cc(-c)ccN', 'ccccc(c)O', 'cccc(c)CC', 'cccc(c)OC', 'CCC(C)C', 'ccc(c)CCC', 'ccc(c)OCC',
         'cc(c)CCCC', 'CCC(C)CC', 'cc(c)OCCC', 'CCCCO', 'CCC(C)COc', 'cc(O)cccC', 'cccco', 'cccc(c)cc', 'ccc(cc)OC', 'ccccNCC', 'cccc(c)NC',
         'ccc(N)nc', 'ccccN(C)C', 'CNCCN', 'CCN(C)C', 'ccc(c)CNC', 'ccc(c)NCC', 'ccc(n)NCC', 'cc(n)N(C)CC', 'CNCCN(c)C', 'cccN(C)CC',
         'ccc(c)N(C)C', 'CCNCCN', 'CCNCC', 'cc(c)CN(C)C', 'ccc(c)c(c)[nH]', 'cc(n)N(C)C', 'CN(cn)CCN', 'ccN(C)CCN', 'ccN(CC)CC',
         'cc(c)N(C)CC', 'CCN(C)CCN', 'ccCN(C)CC', 'c1cc[nH]c1', 'cccc(n)NC', 'C1CNCCN1', 'cccc([nH])cc', 'ccncC', 'cccncn', 'cncCN',
         'cnc(c)C', 'cc(C)cn', 'cnc(c)n', 'ccnc(c)C', 'cnc(C)cn', 'cnc(n)cn', 'Cc1nccn1', 'ccnccC', 'Cccncn', 'nccncn', 'C=CC', 'C=CCC',
         'ccc(C)c(c)O', 'cc(C)c(c)N', 'CC=C(C)CCC', 'COc(cC)cc', 'cccNCCC', 'cc(C)c(c)NC', 'CCCCNC', 'C=C(C)CCCC', 'CCC[C@H](C)C',
         'cc(c)NCCC', 'CCCCCC', 'CCCC(C)CC', 'CCC[C@@H](C)CC', 'CCCCCCC', 'cc(c)Cc(c)c', 'cccc(C)cC', 'ccc[C@H](C)O', 'cccCOC', 'ccc(cC)CC',
         'cc(C)cc(c)O', 'cc(c)C[C@@H](C)C', 'ccCC[C@@H](C)C', 'c1cCCCC1', 'cc(cC)CCC', 'cc(C)c(c)CC', 'cc(cCC)CC', 'cc(C)ccOC', 'cc(ccO)CC',
         'cccCOCC', 'CCCCOC', 'CCOC(C)C', 'CccCCCC', 'CCcccOC', 'cnc(n)N', 'ccc(N)c(c)C', 'cc-ccnc', 'cc-c(cc)nc', 'ccc-c(c)cc', 'cccnC',
         'cnCOCC', 'ccn(c)CC', 'ccn(C)cn', 'cccn(c)C', 'cn(c)CCC', 'ccn(c)C', 'cccn(C)cn', 'cncn(c)CC', 'cccn(c)CC', 'ccnCCCC', 'C1CCOC1',
         'CCC[C@@H](C)O', 'CCCCCO', 'cc(c)Oc(c)c', 'ccc(c)CCN', 'CNC(C)C', 'CNCCO', 'ccc-ccC', 'cc(c)CCNC', 'CN[C@@H](C)C', 'CNC[C@@H](C)NC',
         'CCCNC', 'cc(c)OCCN', 'CCOCCN', 'CNCCCN', 'CCNC(C)C', 'CCCNCC', 'CCCCN(C)C', 'CCN[C@H](C)CN', 'CCCNCCN', 'CCCCNCC', 'CN(C)CCCN',
         'cc(c)-n(c)cn', 'Ccncn', 'ncncn', 'ccc(c)CCO', 'ccc(c)c(c)C', 'cc(C)cc(c)[nH]', 'cc(C)cccC', 'ccc(c)ccC', 'ccc[nH]cC', 'cc(CC)c(c)c',
         'cc(cC)CCO', 'Cccccc[nH]', 'cc(C)cc(c)c', 'CCS', 'CCSC', 'CC=N', 'C=NC', 'N=C(N)N', 'CCN=CN', 'C=NCC', 'ccCSCCN', 'CC[C@H](C)NC',
         'c1c[nH]cn1', 'CCC(C)(C)CC', 'C=CC(C)CCC', 'C=CCC(C)CC', 'CCC(C)C(C)C', 'C=C(C)C(C)C', 'C=C(C)C(C)CC', 'C=CC(C)(C)CC', 'C=C(CC)C(C)C',
         'CCC(CC)CC', 'C=CC(C)C(C)C', 'C1CCCCC1', 'C=CC(CC)CC', 'ccccc(c)S', 'NS', 'ccc[nH]c', 'CNS', 'ccc[nH]cn', 'cc-cnn', 'ccnc(n)CC',
         'c1cnc[nH]c1', 'CCN(CC)CC', 'ccc(c)C(C)C', 'CC(C)cn', 'cc(c)C(C)CC', 'Cncccn', 'cc(n)cn(c)C', 'ccc(c)c(c)c', 'CCNCN', 'CCNC=Nc',
         'CNC(=N)N', 'cN=CNCCN', 'CCNC(=N)N', 'ccc(C)c(c)N', 'CCcnC', 'CcnCC', 'ccnc(c)-c', 'cccc(c)nC', 'ccc-c(n)cc', 'cc-c(c)ncc',
         'ccc(c)-c(c)c', 'CccccnC', 'ccccnCC', 'CNCCCO', 'CCNCCCO', 'CCC[C@H](C)NC', 'CCN(C)[C@@H](C)C', 'CC[C@@H](O)CCN', 'CC[C@@H](CC)NC',
         'CN1CCCC1', 'CCCC[C@@H](C)O', 'C1CCNCC1', 'CCCC[C@H](C)N', 'CCN[C@@H](C)CC', 'CCCN[C@@H](C)C', 'CCCNCCC', 'ccCNCcc', 'cnccCCC',
         'c1cnnc1', 'cc(o)CN(C)C', 'cccCS', 'ccNCCCC', 'cncNCCC', 'cc(C)cc(c)C', 'cncccN', 'cNCCN(C)C', 'cCC(CC)CC', 'ccCCNc', 'cc(N)c(c)CC',
         'cc(c)CC(C)N', 'cccNC(C)C', 'cc(cN)CCC', 'cc(C)cNCC', 'C[C@H](C)C(C)CO', 'cc(c)CCCN', 'ccc(c)n(c)C', 'cnc(C)nC', 'CCC(C)CNC',
         'CC(c)C(C)CN', 'cCCCN(C)C', 'CC(C)CN(C)C', 'CccCCCN', 'ccCCCNC', 'CCNCC(C)C', 'ccc(n)c(c)n', 'cc(C)cccn', 'ccc(C)ccn', 'ccCCN(C)C',
         'ccc(c)/C=C\\C', 'cc(C)cCCN', 'ccCNCCC', 'ccCNC(C)C', 'C=CCNC', 'C=CCN(C)C', 'C=CCCN(C)C', 'CC(N)CN(C)C', 'C=CCCNCC', 'cCC(C)N(C)C',
         'C=CN', 'CNC(C)C(C)C', 'CC(C)CCCN', 'CCCC(C)CN', 'CC=CC(C)CC', 'CCC(C)C(C)N', 'CCC(CC)CN', 'ccC(C)CCC', 'ccC(C)CNC', 'CCNCC(c)C',
         'CCN(C)CCO', 'CC(C)C(C)CN', 'cc(c)cccn', 'cccnc(c)c', 'ccNCC(C)C', 'CCC(N)CCN', 'CNCCCCN', 'ccCC(C)NC', 'ccC(C)CCN', 'CCC(C)CNc',
         'CC(N)CCN', 'CC(CN)CCN', 'ccC(C)(C)CC', 'ccCC(N)CC', 'cc(N)cccO', 'ccOCCCN', 'cc(c)NCCN', 'ccc(c)C(C)N', 'cCNC(C)CN', 'cCCCCCN',
         'cCCCC(C)C', 'CC(C)CC(C)N', 'ccc(c)Cnc', 'cc(C)c(C)cO', 'Ccc(C)cOC', 'CCCNCCO', 'ccNC(C)CC', 'CNCCCCO', 'CC(CO)CCN', 'cc(c)C(C)CO',
         'CCC(C)C(C)c', 'cc1cOCC1', 'cc(O)c(c)CC', 'cc(C)cC(C)C', 'CccC(C)CO', 'CCc(cC)cO', 'cc(CC)cOC', 'cCC(N)C(C)C', 'cc(C)cCCO',
         'cC(CO)CCN', 'CCC(C)ccO', 'CCC(CN)CN', 'cc(c)N(c)CC', 'ccc(S)c(c)N', 'cc(S)c(c)NC', 'CCCCNCN', 'CC(C)CNCN', 'C[X]', 'c[X]',
         'E_1', 'E_2', 'E_3', 'E_4', 'E_5', 'E_6', 'E_7', 'E_8']]

In [93]:
df.head()

Unnamed: 0,SMILES,Class,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,...,C[X],c[X],E_1,E_2,E_3,E_4,E_5,E_6,E_7,E_8
0,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,0,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,...,0,0,0.0,0.0,52.176914,0.0,0.0,0.0,-6.391944,-1.451636
1,COC1=CC2=C(C3=CC=C(OC)C(=O)C=C3C(NC(C)=O)CC2)C...,0,399.44,2.87,6.0,1.0,83.09,5.0,0.0,2.87,...,0,0,21.995594,24.480585,2.964085,3.01027,1.604915,6.569938,1.268552,1.464264
2,C#CCO,1,56.06,-0.39,1.0,1.0,20.23,0.0,1.0,-0.39,...,0,0,0.0,0.0,7.638889,0.0,1.986111,0.0,4.527778,-0.152778
3,CCO,1,46.07,-0.0,1.0,1.0,20.23,0.0,0.0,-0.0,...,0,0,0.0,0.0,7.569444,0.0,0.0,0.0,0.0,1.930556
4,O=C1C(C2=CC=C(O)C=C2)=COC2=CC(O)=CC=C12,1,254.24,2.87,4.0,2.0,70.67,1.0,0.0,2.87,...,0,0,5.366593,12.320988,18.99992,1.249243,0.186928,10.685367,1.357627,0.0


In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Columns: 425 entries, SMILES to E_8
dtypes: float64(80), int64(344), object(1)
memory usage: 166.1+ KB


In [95]:
regression_model = XGBRegressor()
regression_model.load_model('best_regression_model.json')

In [96]:
X = df.iloc[:, 1:].values

In [97]:
X = regression_scaler.transform(X)

In [98]:
y_pred = regression_model.predict(X)

In [99]:
y_pred = [round(y, 2) for y in y_pred]


In [100]:
df['LogBB_value'] = y_pred

In [101]:
df[['SMILES', 'LogBB_value']].to_csv('LogBB_prediction.csv', index=False)