In [1]:
#insert all packages needed
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#read the raw lines of the csv file into a list (text up to the first newline of each line)
#NOTE: every line is kept as-is, so the header row and the ';label' part are still included
file_name = r"C:\Users\20202254\Desktop\Vakken\Advanced programming\tested_molecules-1.csv"
with open(file_name, "r") as ins:
    smiles = [line.partition('\n')[0] for line in ins]
print('# of SMILES:', len(smiles))

# of SMILES: 1001


In [3]:
#load the csv; with the default separator every row lands in one 'SMILES;ALDH1_inhibition' column
df_molecules = pd.read_csv(file_name)

#split that single column on ';' into separate columns labelled 0 and 1
new_columns = df_molecules['SMILES;ALDH1_inhibition'].str.split(';', expand=True)

#attach the split columns, discard the combined column and give the new columns proper names
df_molecules = (
    pd.concat([df_molecules, new_columns], axis=1)
    .drop(columns='SMILES;ALDH1_inhibition')
    .rename(columns={0: 'SMILES', 1: 'ALDH1_inhibition'})
)

df_molecules

Unnamed: 0,SMILES,ALDH1_inhibition
0,COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...,1
1,O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1,1
2,Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...,1
3,CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1,1
4,CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21,1
...,...,...
995,COc1ccc(N2C(=O)CC([NH2+]C3CC3)C2=O)cc1,0
996,CCNc1oc(COc2cccc(C)c2)nc1C#N,0
997,NC(=O)Cn1cnc(-c2ccccc2)c1,0
998,Cc1cc(NC(=O)CSc2nc3c(c(=O)n(C)c(=O)n3C)n2C(C)C...,0


In [4]:
#keep the SMILES column available under both names
#note: no subset is taken here, smiles_tryout refers to the complete SMILES column
smiles_tryout = only_smiles_list = df_molecules['SMILES']
smiles_tryout

0      COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...
1                 O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1
2      Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...
3                      CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1
4      CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21
                             ...                        
995               COc1ccc(N2C(=O)CC([NH2+]C3CC3)C2=O)cc1
996                         CCNc1oc(COc2cccc(C)c2)nc1C#N
997                            NC(=O)Cn1cnc(-c2ccccc2)c1
998    Cc1cc(NC(=O)CSc2nc3c(c(=O)n(C)c(=O)n3C)n2C(C)C...
999            O=C(Cn1nnc2c(cnn2-c2ccccc2)c1=O)NCc1cccs1
Name: SMILES, Length: 1000, dtype: object

In [5]:
#turn every SMILES string in smiles_tryout into an RDKit molecule object
mols = list(map(Chem.MolFromSmiles, smiles_tryout))
#optional: preview the molecules as a grid image (disabled)
#Draw.MolsToGridImage(mols, molsPerRow=2, subImgSize=(200, 200))

In [6]:
#collect the name of every descriptor registered in RDKit's descriptor list
desc_list = [name for name, _ in Descriptors._descList]
#print the number of descriptors followed by their names
print(len(desc_list), desc_list, sep='\n')

209
['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'Slog

In [7]:
#set up a calculator for all descriptor names and apply it to every molecule
calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_list)
rdkit_desc = list(map(calc.CalcDescriptors, mols))

#show the descriptor values of the first molecule as a quick check
print(rdkit_desc[0])


(13.083531447323905, 13.083531447323905, 0.001173180692030762, -0.6831399723499987, 0.5203647862499531, 463.54200000000026, 434.3100000000002, 463.2331877880001, 178, 0, 0.2498683330982345, -0.4964765338733181, 0.4964765338733181, 0.2498683330982345, 1.088235294117647, 1.7941176470588236, 2.5, 16.465857064612035, 10.012387123815586, 2.277377408380586, -2.329164203915786, 2.2133733533282376, -2.5243684910679804, 5.869761700770313, -0.12818123363075157, 3.3456496356368177, 1.3746473471677294, 1110.519071976258, 23.915638315627202, 19.34719971591, 19.34719971591, 16.546045193307766, 11.32986098667514, 11.32986098667514, 8.265228811036, 8.265228811036, 5.888539678818338, 5.888539678818338, 4.187622471031995, 4.187622471031995, -3.95, 69581108.14936109, 23.21634357240783, 10.778357860560856, 5.506758970272485, 197.8337076591357, 20.270349892663187, 11.791352662431866, 0.0, 17.762698739689505, 0.0, 0.0, 9.589074368143644, 0.0, 4.681802935145185, 0.0, 41.4968842190707, 47.030966134243585, 32.

In [8]:
#add the descriptor values to the dataframe
#rdkit_desc holds one tuple per molecule, in the same order as the names in desc_list.
#All descriptor columns are built in a single frame: inserting them one at a time
#fragments the dataframe and makes pandas emit a PerformanceWarning.
df_descriptors = pd.DataFrame(rdkit_desc, columns=desc_list, index=df_molecules.index)

#leave out every descriptor whose name starts with 'fr_'; selecting them by prefix
#avoids a KeyError when the installed RDKit version offers a different set of them
fragment_columns = [name for name in desc_list if name.startswith('fr_')]
df_descriptors = df_descriptors.drop(columns=fragment_columns)

#descriptor columns left over from an earlier run of this cell are dropped first,
#so re-running the cell gives the same dataframe instead of duplicated columns
df_molecules = pd.concat(
    [df_molecules.drop(columns=desc_list, errors='ignore'), df_descriptors],
    axis=1,
)

#show at most 10 rows but (practically) all columns when displaying dataframes
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 1000)
df_molecules

Unnamed: 0,SMILES,ALDH1_inhibition,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,AvgIpc,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA6,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR
0,COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...,1,13.083531,13.083531,0.001173,-0.683140,0.520365,463.542,434.310,463.233188,178,0,0.249868,-0.496477,0.496477,0.249868,1.088235,1.794118,2.500000,16.465857,10.012387,2.277377,-2.329164,2.213373,-2.524368,5.869762,-0.128181,3.345650,1.374647,1110.519072,23.915638,19.347200,19.347200,16.546045,11.329861,11.329861,8.265229,8.265229,5.888540,5.888540,4.187622,4.187622,-3.95,6.958111e+07,23.216344,10.778358,5.506759,197.833708,20.270350,11.791353,0.000000,17.762699,0.000000,0.000000,9.589074,0.000000,4.681803,0.000000,41.496884,47.030966,32.475912,12.797184,14.325937,17.762699,0.000000,30.840832,0.000000,38.269884,25.099220,60.160755,0.0,11.436898,20.270350,5.948339,5.749512,0.000000,64.304606,16.009896,0.000000,25.328832,54.597304,0.000000,5.687386,0.0,114.27,6.041841,9.589074,0.0,17.856200,31.208186,24.092481,6.923737,11.791600,54.597304,31.058939,4.736863,7.122812,0.000000,26.972964,18.075462,1.758225,0.920982,16.565369,1.837103,2.822263,1.591488,0.375000,34,2,10,0,1,1,2,1,3,8,2,10,8,0,1,1,4,1.50330,126.8344
1,O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1,1,12.170097,12.170097,0.066966,-0.066966,0.498564,378.457,360.313,378.115047,136,0,0.230353,-0.467476,0.467476,0.230353,1.111111,2.000000,2.814815,32.166556,10.202245,2.140429,-2.083839,2.235385,-2.240774,7.993662,-0.118316,3.338816,1.485090,1028.775024,18.476481,14.538559,15.355056,13.292826,8.564791,9.550390,5.957024,6.899523,4.024513,5.000911,2.736403,3.568412,-3.06,3.137659e+06,17.358644,8.092317,4.199706,160.174276,9.733940,11.276948,10.803614,5.907180,0.000000,0.000000,9.361637,9.967957,0.000000,0.000000,42.093720,29.828920,6.196844,25.105529,9.211688,28.832943,0.000000,19.851845,0.000000,18.245949,5.752854,78.381009,0.0,0.000000,5.316789,0.000000,0.000000,11.761885,26.195090,17.884050,0.000000,11.323699,76.630898,0.000000,11.163878,0.0,72.95,0.000000,4.794537,0.0,11.660033,13.089513,27.644013,11.761885,18.526374,36.398202,31.984579,4.417151,7.276561,1.406535,21.295170,3.630481,2.804024,0.940073,17.587352,3.352383,1.040755,0.000000,0.150000,27,1,6,0,0,0,1,3,4,6,1,7,7,0,0,0,4,3.48110,104.3507
2,Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...,1,10.905837,10.905837,0.016881,-0.016881,0.382043,477.589,444.325,477.260865,184,0,0.219930,-0.492903,0.492903,0.219930,1.228571,2.114286,2.857143,16.492058,9.989823,2.359046,-2.317898,2.246545,-3.124535,5.824876,-0.951912,3.598320,1.509009,1263.581781,24.363232,20.515922,20.515922,16.991204,12.321269,12.321269,9.787693,9.787693,6.864949,6.864949,5.211578,5.211578,-3.30,1.436707e+08,23.442266,10.131417,5.058984,204.265757,19.160451,31.498483,6.041841,11.704393,0.000000,0.000000,0.000000,9.665781,0.000000,5.098682,32.046576,53.954703,17.911012,17.343315,19.160451,10.902925,0.000000,25.191233,5.917906,58.848175,13.151638,65.372920,0.0,5.879988,4.899910,0.000000,5.879988,0.000000,49.553366,17.826376,12.841643,55.442513,47.078516,0.000000,10.902925,0.0,103.53,0.000000,5.106527,0.0,23.943702,13.089513,59.607761,4.899910,6.263163,41.937375,40.423272,9.154014,13.395484,0.000000,5.789600,24.674971,2.688982,1.918367,11.911982,3.929994,9.023954,0.000000,0.461538,35,2,9,0,1,1,1,3,4,8,2,9,9,0,1,1,5,2.83782,129.8585
3,CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1,1,11.562446,11.562446,0.270607,-0.454447,0.795948,330.609,317.505,328.981818,96,0,0.351723,-0.421732,0.421732,0.351723,1.333333,2.000000,2.611111,79.918731,10.173315,2.116608,-2.205938,2.302152,-2.211289,9.103314,0.556316,2.372515,2.658955,634.659228,13.284093,10.165903,12.507829,8.578917,5.748647,6.919609,3.870236,5.041199,2.904829,4.245926,1.914759,2.442554,-1.00,1.207136e+04,13.432099,5.325444,2.268519,118.469823,9.317061,10.055740,0.000000,0.000000,0.000000,5.625586,0.000000,4.794537,0.000000,0.000000,11.600940,41.910152,30.229490,5.022633,4.417151,44.187514,0.000000,0.000000,0.000000,13.847474,17.989423,38.114578,0.0,0.000000,10.525496,5.687386,0.000000,27.530884,13.089513,0.000000,0.000000,13.847474,31.883509,5.022633,10.969244,0.0,33.45,5.625586,4.794537,0.0,4.472720,10.605653,24.163123,0.000000,0.000000,18.199101,34.677328,16.018091,5.510142,9.236893,13.738424,1.130921,1.079478,0.000000,5.703098,0.000000,5.962156,0.000000,0.307692,18,0,3,0,0,0,1,1,2,3,0,5,3,0,0,0,2,4.05510,78.7550
4,CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21,1,12.108866,12.108866,0.086947,-3.251317,0.687618,419.553,402.417,419.043204,140,0,0.231765,-0.301646,0.301646,0.231765,1.296296,2.148148,2.888889,32.233291,10.071048,2.259832,-2.158781,2.341424,-2.269383,7.916459,-0.115075,3.329276,1.497501,1088.262215,19.018297,14.506689,16.956179,12.897998,8.287692,11.895309,6.270834,10.067995,4.293941,7.107982,3.073912,5.471606,-2.23,1.948182e+06,18.148160,7.063603,3.896422,165.096839,5.316789,0.000000,5.131558,15.930471,0.000000,0.000000,9.099753,13.401776,0.000000,22.673572,12.132734,35.563437,22.365418,24.057905,13.212334,49.422987,0.000000,4.983979,0.000000,12.841643,22.422530,51.532560,0.0,11.257379,9.622005,10.818945,0.000000,22.673572,32.109481,27.659472,0.000000,10.440599,41.091961,0.000000,11.257379,0.0,79.37,10.023291,13.212334,0.0,5.907180,24.517958,27.385364,21.897771,11.336786,41.091961,10.300767,0.000000,25.127230,2.929890,17.622350,7.239143,3.417866,-0.086947,9.531703,2.249137,0.470945,-3.251317,0.222222,27,1,6,0,1,1,1,2,3,6,1,9,5,0,0,0,4,3.37490,110.0965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,COc1ccc(N2C(=O)CC([NH2+]C3CC3)C2=O)cc1,0,12.276695,12.276695,0.095308,-0.234165,0.779148,261.301,244.165,261.123369,100,0,0.292020,-0.496766,0.496766,0.292020,1.210526,1.894737,2.473684,16.465312,10.223467,2.444682,-2.140576,2.245313,-2.926238,6.213560,-0.687092,3.177923,1.780900,507.927291,13.405413,10.834731,10.834731,9.185872,6.546448,6.546448,5.105909,5.105909,3.638622,3.638622,2.425542,2.425542,-1.88,4.581294e+04,12.169089,4.660559,2.332483,111.085563,10.053652,5.749512,6.041841,5.907180,5.907180,0.000000,9.589074,4.899910,0.000000,0.000000,0.000000,24.265468,12.841643,25.259846,19.642726,17.501746,0.000000,0.000000,0.000000,31.346147,12.009707,24.265468,0.0,5.749512,14.953561,5.687386,5.749512,0.000000,31.007839,9.589074,0.000000,19.262465,24.265468,0.000000,0.000000,0.0,63.22,0.000000,9.589074,0.0,17.856200,23.899561,12.841643,4.899910,31.375266,5.316789,0.000000,4.736863,5.072969,0.000000,25.577092,2.045394,0.627312,0.502844,7.309142,2.613467,0.000000,1.585114,0.428571,19,2,5,1,1,2,1,0,1,3,1,5,4,1,1,2,3,0.05290,68.8394
996,CCNc1oc(COc2cccc(C)c2)nc1C#N,0,8.926724,8.926724,0.197944,0.197944,0.891297,257.293,242.173,257.116427,98,0,0.235725,-0.483868,0.483868,0.235725,1.473684,2.315789,3.000000,16.491677,10.220307,2.119409,-2.033672,2.185394,-1.960389,5.445859,0.264431,2.792946,2.055194,598.978812,13.664926,10.934538,10.934538,9.223877,6.057449,6.057449,4.032650,4.032650,2.466539,2.466539,1.632519,1.632519,-2.48,2.727871e+04,12.963590,5.914940,3.255460,111.383172,14.470802,11.818733,6.606882,17.468834,0.000000,0.000000,0.000000,0.000000,10.245870,0.000000,12.132734,31.543660,6.544756,0.000000,9.154014,5.884182,5.261892,4.983979,0.000000,20.454356,11.861545,41.413572,0.0,11.818733,10.053652,5.884182,5.749512,0.000000,11.528735,6.606882,18.254850,24.071841,28.682619,0.000000,0.000000,0.0,71.08,0.000000,0.000000,0.0,12.300810,18.319663,11.312963,0.000000,0.000000,44.182164,10.300767,14.415905,10.998737,0.000000,4.069517,11.874243,1.375601,1.531071,9.699211,0.000000,4.784953,0.000000,0.285714,19,1,5,0,0,0,1,1,2,5,1,5,5,0,0,0,2,2.86550,70.8927
997,NC(=O)Cn1cnc(-c2ccccc2)c1,0,10.688087,10.688087,0.166502,-0.368508,0.805927,201.229,190.141,201.090212,76,0,0.236896,-0.368122,0.368122,0.236896,1.333333,2.066667,2.733333,16.147009,10.162385,2.020008,-1.955677,2.149651,-2.169324,5.732946,-0.118077,2.665697,2.242556,461.695090,10.673362,8.128584,8.128584,7.254020,4.610143,4.610143,3.263581,3.263581,2.041210,2.041210,1.384493,1.384493,-2.10,4.126498e+03,9.454837,3.958005,2.224059,87.358574,10.300767,6.544756,0.000000,5.907180,0.000000,0.000000,4.794537,4.983979,0.000000,0.000000,30.331835,0.000000,11.760295,12.021248,4.794537,5.907180,0.000000,9.551078,5.733667,6.544756,0.000000,42.855999,0.0,11.257379,5.733667,0.000000,0.000000,0.000000,15.458258,11.339294,0.000000,0.000000,42.855999,0.000000,11.257379,0.0,60.91,0.000000,4.794537,0.0,12.451936,0.000000,11.257379,0.000000,17.091263,30.331835,4.983979,5.733667,1.671296,0.000000,14.882763,0.000000,6.957623,-0.368508,9.780417,3.409907,0.166502,0.000000,0.090909,15,2,4,0,0,0,1,1,2,3,1,4,3,0,0,0,2,1.03540,56.8574
998,Cc1cc(NC(=O)CSc2nc3c(c(=O)n(C)c(=O)n3C)n2C(C)C...,0,12.603109,12.603109,0.065686,-0.447592,0.644831,392.441,372.281,392.126674,144,0,0.331704,-0.359541,0.359541,0.331704,1.296296,2.000000,2.592593,32.166556,10.351430,2.213046,-2.120548,2.251856,-2.372511,7.993867,-0.113401,3.194580,1.943427,1136.418381,19.877951,15.639117,16.455613,12.701285,8.242562,9.228160,6.395756,7.338255,4.003328,4.956037,2.709880,3.634452,-2.82,1.115768e+06,18.955877,7.063446,3.415088,157.710590,14.406983,5.760247,22.138177,5.907180,5.559267,5.689743,18.723274,9.778516,0.000000,0.000000,16.918548,20.771212,26.203552,5.752854,9.317632,34.650805,0.000000,23.841941,14.095344,31.969489,11.069642,32.664699,0.0,0.000000,16.565799,5.817863,0.000000,11.761885,35.501974,18.889881,6.923737,25.649563,25.334973,0.000000,11.163878,0.0,116.95,11.249010,14.383612,0.0,17.701874,27.898425,4.567100,23.376657,24.604876,13.847474,15.457430,4.523095,9.039125,1.182292,41.360610,6.838974,-0.225238,0.718925,1.536806,0.000000,5.550718,2.997788,0.437500,27,1,10,0,0,0,0,3,3,10,1,11,5,0,0,0,3,1.04182,101.1937


In [9]:
#list with the names of all columns currently in the dataframe
columns = list(df_molecules.columns)


In [10]:
# #check the correlations between all columns and put the highest correlations in a list
# highcorr=[]
# allcorr = []
# for column1 in range(len(columns)):
#     for column2 in range(len(columns)):
#         if column1 != column2 and column1>1 and column2>1 and column2>=column1:
#             corr = df_molecules[columns[column1]].corr(df_molecules[columns[column2]])
#             #print("Correlation between ", columns[column1], " and ", columns[column2], "is: ", round(corr, 2))
#             allcorr.append(corr)
#             if corr >= 0.80 or corr <= -0.80:
#                 #all correlations of 0.80 or higher (in absolute value) are put in a list
#                 highcorr.append([columns[column1],columns[column2],round(corr,2)])
# #print(highcorr)
# #print(len(highcorr))

In [11]:
# #put all the columns with high correlation in a list (except the first ones to have the high correlation like MolWt)
# dupe_col = []
# for i in range(len(highcorr)):
#     if highcorr[i][1] not in dupe_col:
#         dupe_col.append(highcorr[i][1])


In [12]:
# for i in dupe_col:    
#     del df_molecules[i]
    

In [13]:
# df_molecules_1 = df_molecules[df_molecules['ALDH1_inhibition']==1]
# df_molecules_0 = df_molecules[df_molecules['ALDH1_inhibition']==0]
# df_molecules_1

In [14]:
# def find_outliers(df, column):
#     df_column = df.filter([column], axis=1)
#     q1=df_column.quantile(0.25)
#     q3=df_column.quantile(0.75)
# #     q1=df[column].quantile(0.25)
# #     q3=df[column].quantile(0.75)
    
#     IQR=q3-q1
#     outliers = df[column][((df[column]<(q1-1.5*IQR)) | (df[column]>(q3+1.5*IQR)))]
# #     lower_outliers = df_column[((df[column]<(q1-1.5*IQR))]
# #     upper_outliers = df_column[(df[column]>(q3+1.5*IQR))]
# #     outliers = lower_outliers + upper_outliers                             
#     return outliers

# outliers = find_outliers(df_molecules, 'MaxAbsEStateIndex')  
# outliers

def find_outliers(df, column):
    """Return the values of ``df[column]`` that lie outside the 1.5*IQR fences.

    Parameters:
        df: dataframe that contains ``column``.
        column: name of the column to inspect.

    Returns:
        A Series (original index labels kept) with every value that is strictly
        below ``Q1 - 1.5*IQR`` or strictly above ``Q3 + 1.5*IQR``.
    """
    values = df[column]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    too_low = values < (lower_quartile - 1.5 * spread)
    too_high = values > (upper_quartile + 1.5 * spread)
    return values[too_low | too_high]

#example: collect the outliers of a single descriptor column
outliers = find_outliers(df_molecules, 'MaxAbsEStateIndex')  

#display the flagged values together with their row labels
outliers

13     6.011299
52     9.679179
85     5.728289
98     5.396740
112    9.651783
         ...   
961    5.545360
973    9.224908
977    5.240207
983    5.522082
996    8.926724
Name: MaxAbsEStateIndex, Length: 110, dtype: float64

In [15]:
# df_outliers = pd.DataFrame.from_dict(outliers)
# list_indexes_outliers = list(df_outliers.index.values)
#limit how many rows pandas prints before it truncates the displayed output
pd.set_option('display.max_rows', 10)
# display(df_outliers)
# list_indexes_outliers

In [16]:
def remove_outliers(df, column):
    """Return a one-column dataframe of ``df[column]`` with its outliers set to NaN.

    The outliers are the values reported by ``find_outliers(df, column)``.
    ``df`` itself is not modified.

    Parameters:
        df: dataframe that contains ``column``.
        column: name of the column to clean.

    Returns:
        A dataframe with the single column ``column`` and the same index as
        ``df``; every outlier is replaced by a real (float) NaN.
    """
    outliers = find_outliers(df, column)
    #mark the rows whose index label belongs to an outlier
    is_outlier = df.index.isin(outliers.index)
    #mask() builds a new Series instead of writing through a chained index, and it
    #inserts a real NaN (upcasting integer columns to float) rather than the text 'NaN'
    return df[column].mask(is_outlier).to_frame()

#example: blank out the outliers of one descriptor column and display the result
remove_outliers(df_molecules, 'MaxAbsEStateIndex')



Unnamed: 0,MaxAbsEStateIndex
0,13.083531
1,12.170097
2,10.905837
3,11.562446
4,12.108866
...,...
995,12.276695
996,
997,10.688087
998,12.603109


In [20]:
def calc_average(df, column):
    """Return the mean of ``df[column]`` after its outliers have been blanked out.

    The column is first passed through ``remove_outliers``; missing values in the
    result are ignored when averaging.

    Parameters:
        df: dataframe that contains ``column``.
        column: name of the column to average.

    Returns:
        The mean of the remaining values as a float, or NaN when no value is left.
    """
    cleaned = remove_outliers(df, column)[column]
    #coerce to numbers so that missing entries are recognised whether they are stored
    #as a float NaN or as the text 'NaN'
    values = pd.to_numeric(cleaned, errors="coerce")
    #mean() skips NaN and yields NaN for an empty selection instead of dividing by zero
    return float(values.mean())
    
#example: average of one descriptor column with its outliers left out
calc_average(df_molecules, 'MaxAbsEStateIndex')

12.385985038530151

In [24]:
#column names that are left out of the per-column averages: the two non-descriptor
#columns and a set of integer-valued count descriptors
#NOTE(review): presumably the counts are skipped because the outlier handling is meant
#for continuous values - confirm
excluded_columns = {
    'SMILES',
    'ALDH1_inhibition',
    'NumValenceElectrons',
    'HeavyAtomCount',
    'NHOHCount',
    'NOCount',
    'NumAliphaticCarbocycles',
    'NumAliphaticHeterocycles',
    'NumAliphaticRings',
    'NumAromaticCarbocycles',
    'NumAromaticHeterocycles',
    'NumAromaticRings',
    'NumHAcceptors',
    'NumHDonors',
    'NumHeteroatoms',
    'NumRotatableBonds',
    'NumSaturatedCarbocycles',
    'NumSaturatedHeterocycles',
    'NumSaturatedRings',
    'RingCount',
}

#filter into a new list instead of calling columns.remove() for every name, so that
#running this cell a second time no longer raises ValueError for names already removed
columns = [name for name in columns if name not in excluded_columns]


In [25]:
def run_through_all_columns(df):
    """Compute the outlier-free average for every name in the global ``columns`` list.

    Parameters:
        df: dataframe that contains all columns named in ``columns``.

    Returns:
        A list of ``[column name, average]`` pairs, in the order of ``columns``.
    """
    return [[str(name), calc_average(df, name)] for name in columns]
    
#averages for every column name that is still present in the columns list
run_through_all_columns(df_molecules)

[['MaxAbsEStateIndex', 12.385985038530151],
 ['MaxEStateIndex', 12.385985038530151],
 ['MinAbsEStateIndex', 0.11496654139584912],
 ['MinEStateIndex', -0.33777943675950783],
 ['qed', 0.6483454997227138],
 ['MolWt', 344.95552557672994],
 ['HeavyAtomMolWt', 325.90462888666],
 ['ExactMolWt', 344.5527256151722],
 ['NumRadicalElectrons', 0.0],
 ['MaxPartialCharge', 0.276813400210516],
 ['MinPartialCharge', -0.423354690395052],
 ['MaxAbsPartialCharge', 0.4263459249767922],
 ['MinAbsPartialCharge', 0.2745797933770294],
 ['FpDensityMorgan1', 1.1641843072927318],
 ['FpDensityMorgan2', 1.8917042065109677],
 ['FpDensityMorgan3', 2.564918484472145],
 ['BCUT2D_MWHI', 25.649814460369296],
 ['BCUT2D_MWLOW', 10.109033757413584],
 ['BCUT2D_CHGHI', 2.2258335227715667],
 ['BCUT2D_CHGLO', -2.1832723918873245],
 ['BCUT2D_LOGPHI', 2.263770468206087],
 ['BCUT2D_LOGPLOW', -2.30780176799729],
 ['BCUT2D_MRHI', 6.8757795574796665],
 ['BCUT2D_MRLOW', -0.1631441420560416],
 ['AvgIpc', 2.835966279013269],
 ['Balaban