In [1]:
#insert all packages needed
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [2]:
#import the smiles list
file_name = 'tested_molecules-1.csv'


In [3]:
#split the csv file into multiple columns
#read the dataframe
df = pd.read_csv(file_name)

#split the dataframe
new_columns = df['SMILES;ALDH1_inhibition'].str.split(';', expand=True)
df_molecules = pd.concat([df, new_columns], axis=1)

#drop the original column
df_molecules = df_molecules.drop('SMILES;ALDH1_inhibition', axis=1)

#rename the column names 
df_molecules = df_molecules.rename(columns={0:'SMILES',1:'ALDH1-inhibitor'})


df_molecules

Unnamed: 0,SMILES,ALDH1-inhibitor
0,COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...,1
1,O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1,1
2,Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...,1
3,CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1,1
4,CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21,1
...,...,...
995,COc1ccc(N2C(=O)CC([NH2+]C3CC3)C2=O)cc1,0
996,CCNc1oc(COc2cccc(C)c2)nc1C#N,0
997,NC(=O)Cn1cnc(-c2ccccc2)c1,0
998,Cc1cc(NC(=O)CSc2nc3c(c(=O)n(C)c(=O)n3C)n2C(C)C...,0


In [4]:
#since the list is very large, to try stuff out, we look at the first 4 smiles
only_smiles_list = df_molecules['SMILES']

smiles_tryout = only_smiles_list
smiles_tryout

0      COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...
1                 O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1
2      Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...
3                      CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1
4      CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21
                             ...                        
995               COc1ccc(N2C(=O)CC([NH2+]C3CC3)C2=O)cc1
996                         CCNc1oc(COc2cccc(C)c2)nc1C#N
997                            NC(=O)Cn1cnc(-c2ccccc2)c1
998    Cc1cc(NC(=O)CSc2nc3c(c(=O)n(C)c(=O)n3C)n2C(C)C...
999            O=C(Cn1nnc2c(cnn2-c2ccccc2)c1=O)NCc1cccs1
Name: SMILES, Length: 1000, dtype: object

In [5]:
#draw the four mole images
mols = [Chem.MolFromSmiles(smi) for smi in smiles_tryout]
#Draw.MolsToGridImage(mols, molsPerRow=2, subImgSize=(200, 200))

In [6]:
#calculate descriptor list
desc_list = [n[0] for n in Descriptors._descList]

phc_desc = [i for i in desc_list if not i.startswith('fr_')]
print(len(phc_desc))
print(phc_desc)        

124
['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'Slog

In [7]:
#calculate molecular descriptors
calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_list)

rdkit_desc1 = [calc.CalcDescriptors(m) for m in mols]
rdkit_desc_data = [calc.CalcDescriptors(m) for m in mols]

rdkit_desc = []
for element in rdkit_desc_data:   
    tuple_floats = tuple(float(x) for x in element)
    rdkit_desc.append(tuple_floats)

#print(len(rdkit_desc[0]))
print(rdkit_desc1[0])


(13.083531447323905, 13.083531447323905, 0.001173180692030762, -0.6831399723499987, 0.5203647862499531, 463.54200000000026, 434.3100000000002, 463.2331877880001, 178, 0, 0.2498683330982345, -0.4964765338733181, 0.4964765338733181, 0.2498683330982345, 1.088235294117647, 1.7941176470588236, 2.5, 16.465857064612035, 10.012387123815586, 2.277377408380586, -2.329164203915786, 2.2133733533282376, -2.5243684910679804, 5.869761700770313, -0.12818123363075157, 3.3456496356368177, 1.3746473471677294, 1110.519071976258, 23.915638315627202, 19.34719971591, 19.34719971591, 16.546045193307766, 11.32986098667514, 11.32986098667514, 8.265228811036, 8.265228811036, 5.888539678818338, 5.888539678818338, 4.187622471031995, 4.187622471031995, -3.95, 69581108.14936109, 23.21634357240783, 10.778357860560856, 5.506758970272485, 197.8337076591357, 20.270349892663187, 11.791352662431866, 0.0, 17.762698739689505, 0.0, 0.0, 9.589074368143644, 0.0, 4.681802935145185, 0.0, 41.4968842190707, 47.030966134243585, 32.

In [8]:
#add columns to dataframe
#add columns to dataframe
for col in range(len(desc_list)):
    column = []
    for row in range(len(rdkit_desc)):
        #the row iteration is for the molecule and the col iteration for the descriptor, this makes a list which will be
        #added to the dataframe
        descriptor = rdkit_desc[row][col]
        column.append(descriptor)
    df_molecules[desc_list[col]] = column
df_molecules = df_molecules.drop(columns=['fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea'])

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
df_molecules
 

  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column
  df_molecules[desc_list[col]] = column


Unnamed: 0,SMILES,ALDH1-inhibitor,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,AvgIpc,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,...,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA6,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR
0,COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...,1,13.083531,13.083531,0.001173,-0.683140,0.520365,463.542,434.310,463.233188,178.0,0.0,0.249868,-0.496477,0.496477,0.249868,1.088235,1.794118,2.500000,16.465857,10.012387,2.277377,-2.329164,2.213373,-2.524368,5.869762,-0.128181,3.345650,1.374647,1110.519072,23.915638,19.347200,19.347200,16.546045,11.329861,11.329861,8.265229,8.265229,5.888540,5.888540,4.187622,4.187622,-3.95,6.958111e+07,23.216344,10.778358,5.506759,197.833708,20.270350,11.791353,...,64.304606,16.009896,0.000000,25.328832,54.597304,0.000000,5.687386,0.0,114.27,6.041841,9.589074,0.0,17.856200,31.208186,24.092481,6.923737,11.791600,54.597304,31.058939,4.736863,7.122812,0.000000,26.972964,18.075462,1.758225,0.920982,16.565369,1.837103,2.822263,1.591488,0.375000,34.0,2.0,10.0,0.0,1.0,1.0,2.0,1.0,3.0,8.0,2.0,10.0,8.0,0.0,1.0,1.0,4.0,1.50330,126.8344
1,O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1,1,12.170097,12.170097,0.066966,-0.066966,0.498564,378.457,360.313,378.115047,136.0,0.0,0.230353,-0.467476,0.467476,0.230353,1.111111,2.000000,2.814815,32.166556,10.202245,2.140429,-2.083839,2.235385,-2.240774,7.993662,-0.118316,3.338816,1.485090,1028.775024,18.476481,14.538559,15.355056,13.292826,8.564791,9.550390,5.957024,6.899523,4.024513,5.000911,2.736403,3.568412,-3.06,3.137659e+06,17.358644,8.092317,4.199706,160.174276,9.733940,11.276948,...,26.195090,17.884050,0.000000,11.323699,76.630898,0.000000,11.163878,0.0,72.95,0.000000,4.794537,0.0,11.660033,13.089513,27.644013,11.761885,18.526374,36.398202,31.984579,4.417151,7.276561,1.406535,21.295170,3.630481,2.804024,0.940073,17.587352,3.352383,1.040755,0.000000,0.150000,27.0,1.0,6.0,0.0,0.0,0.0,1.0,3.0,4.0,6.0,1.0,7.0,7.0,0.0,0.0,0.0,4.0,3.48110,104.3507
2,Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...,1,10.905837,10.905837,0.016881,-0.016881,0.382043,477.589,444.325,477.260865,184.0,0.0,0.219930,-0.492903,0.492903,0.219930,1.228571,2.114286,2.857143,16.492058,9.989823,2.359046,-2.317898,2.246545,-3.124535,5.824876,-0.951912,3.598320,1.509009,1263.581781,24.363232,20.515922,20.515922,16.991204,12.321269,12.321269,9.787693,9.787693,6.864949,6.864949,5.211578,5.211578,-3.30,1.436707e+08,23.442266,10.131417,5.058984,204.265757,19.160451,31.498483,...,49.553366,17.826376,12.841643,55.442513,47.078516,0.000000,10.902925,0.0,103.53,0.000000,5.106527,0.0,23.943702,13.089513,59.607761,4.899910,6.263163,41.937375,40.423272,9.154014,13.395484,0.000000,5.789600,24.674971,2.688982,1.918367,11.911982,3.929994,9.023954,0.000000,0.461538,35.0,2.0,9.0,0.0,1.0,1.0,1.0,3.0,4.0,8.0,2.0,9.0,9.0,0.0,1.0,1.0,5.0,2.83782,129.8585
3,CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1,1,11.562446,11.562446,0.270607,-0.454447,0.795948,330.609,317.505,328.981818,96.0,0.0,0.351723,-0.421732,0.421732,0.351723,1.333333,2.000000,2.611111,79.918731,10.173315,2.116608,-2.205938,2.302152,-2.211289,9.103314,0.556316,2.372515,2.658955,634.659228,13.284093,10.165903,12.507829,8.578917,5.748647,6.919609,3.870236,5.041199,2.904829,4.245926,1.914759,2.442554,-1.00,1.207136e+04,13.432099,5.325444,2.268519,118.469823,9.317061,10.055740,...,13.089513,0.000000,0.000000,13.847474,31.883509,5.022633,10.969244,0.0,33.45,5.625586,4.794537,0.0,4.472720,10.605653,24.163123,0.000000,0.000000,18.199101,34.677328,16.018091,5.510142,9.236893,13.738424,1.130921,1.079478,0.000000,5.703098,0.000000,5.962156,0.000000,0.307692,18.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,2.0,3.0,0.0,5.0,3.0,0.0,0.0,0.0,2.0,4.05510,78.7550
4,CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21,1,12.108866,12.108866,0.086947,-3.251317,0.687618,419.553,402.417,419.043204,140.0,0.0,0.231765,-0.301646,0.301646,0.231765,1.296296,2.148148,2.888889,32.233291,10.071048,2.259832,-2.158781,2.341424,-2.269383,7.916459,-0.115075,3.329276,1.497501,1088.262215,19.018297,14.506689,16.956179,12.897998,8.287692,11.895309,6.270834,10.067995,4.293941,7.107982,3.073912,5.471606,-2.23,1.948182e+06,18.148160,7.063603,3.896422,165.096839,5.316789,0.000000,...,32.109481,27.659472,0.000000,10.440599,41.091961,0.000000,11.257379,0.0,79.37,10.023291,13.212334,0.0,5.907180,24.517958,27.385364,21.897771,11.336786,41.091961,10.300767,0.000000,25.127230,2.929890,17.622350,7.239143,3.417866,-0.086947,9.531703,2.249137,0.470945,-3.251317,0.222222,27.0,1.0,6.0,0.0,1.0,1.0,1.0,2.0,3.0,6.0,1.0,9.0,5.0,0.0,0.0,0.0,4.0,3.37490,110.0965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,COc1ccc(N2C(=O)CC([NH2+]C3CC3)C2=O)cc1,0,12.276695,12.276695,0.095308,-0.234165,0.779148,261.301,244.165,261.123369,100.0,0.0,0.292020,-0.496766,0.496766,0.292020,1.210526,1.894737,2.473684,16.465312,10.223467,2.444682,-2.140576,2.245313,-2.926238,6.213560,-0.687092,3.177923,1.780900,507.927291,13.405413,10.834731,10.834731,9.185872,6.546448,6.546448,5.105909,5.105909,3.638622,3.638622,2.425542,2.425542,-1.88,4.581294e+04,12.169089,4.660559,2.332483,111.085563,10.053652,5.749512,...,31.007839,9.589074,0.000000,19.262465,24.265468,0.000000,0.000000,0.0,63.22,0.000000,9.589074,0.0,17.856200,23.899561,12.841643,4.899910,31.375266,5.316789,0.000000,4.736863,5.072969,0.000000,25.577092,2.045394,0.627312,0.502844,7.309142,2.613467,0.000000,1.585114,0.428571,19.0,2.0,5.0,1.0,1.0,2.0,1.0,0.0,1.0,3.0,1.0,5.0,4.0,1.0,1.0,2.0,3.0,0.05290,68.8394
996,CCNc1oc(COc2cccc(C)c2)nc1C#N,0,8.926724,8.926724,0.197944,0.197944,0.891297,257.293,242.173,257.116427,98.0,0.0,0.235725,-0.483868,0.483868,0.235725,1.473684,2.315789,3.000000,16.491677,10.220307,2.119409,-2.033672,2.185394,-1.960389,5.445859,0.264431,2.792946,2.055194,598.978812,13.664926,10.934538,10.934538,9.223877,6.057449,6.057449,4.032650,4.032650,2.466539,2.466539,1.632519,1.632519,-2.48,2.727871e+04,12.963590,5.914940,3.255460,111.383172,14.470802,11.818733,...,11.528735,6.606882,18.254850,24.071841,28.682619,0.000000,0.000000,0.0,71.08,0.000000,0.000000,0.0,12.300810,18.319663,11.312963,0.000000,0.000000,44.182164,10.300767,14.415905,10.998737,0.000000,4.069517,11.874243,1.375601,1.531071,9.699211,0.000000,4.784953,0.000000,0.285714,19.0,1.0,5.0,0.0,0.0,0.0,1.0,1.0,2.0,5.0,1.0,5.0,5.0,0.0,0.0,0.0,2.0,2.86550,70.8927
997,NC(=O)Cn1cnc(-c2ccccc2)c1,0,10.688087,10.688087,0.166502,-0.368508,0.805927,201.229,190.141,201.090212,76.0,0.0,0.236896,-0.368122,0.368122,0.236896,1.333333,2.066667,2.733333,16.147009,10.162385,2.020008,-1.955677,2.149651,-2.169324,5.732946,-0.118077,2.665697,2.242556,461.695090,10.673362,8.128584,8.128584,7.254020,4.610143,4.610143,3.263581,3.263581,2.041210,2.041210,1.384493,1.384493,-2.10,4.126498e+03,9.454837,3.958005,2.224059,87.358574,10.300767,6.544756,...,15.458258,11.339294,0.000000,0.000000,42.855999,0.000000,11.257379,0.0,60.91,0.000000,4.794537,0.0,12.451936,0.000000,11.257379,0.000000,17.091263,30.331835,4.983979,5.733667,1.671296,0.000000,14.882763,0.000000,6.957623,-0.368508,9.780417,3.409907,0.166502,0.000000,0.090909,15.0,2.0,4.0,0.0,0.0,0.0,1.0,1.0,2.0,3.0,1.0,4.0,3.0,0.0,0.0,0.0,2.0,1.03540,56.8574
998,Cc1cc(NC(=O)CSc2nc3c(c(=O)n(C)c(=O)n3C)n2C(C)C...,0,12.603109,12.603109,0.065686,-0.447592,0.644831,392.441,372.281,392.126674,144.0,0.0,0.331704,-0.359541,0.359541,0.331704,1.296296,2.000000,2.592593,32.166556,10.351430,2.213046,-2.120548,2.251856,-2.372511,7.993867,-0.113401,3.194580,1.943427,1136.418381,19.877951,15.639117,16.455613,12.701285,8.242562,9.228160,6.395756,7.338255,4.003328,4.956037,2.709880,3.634452,-2.82,1.115768e+06,18.955877,7.063446,3.415088,157.710590,14.406983,5.760247,...,35.501974,18.889881,6.923737,25.649563,25.334973,0.000000,11.163878,0.0,116.95,11.249010,14.383612,0.0,17.701874,27.898425,4.567100,23.376657,24.604876,13.847474,15.457430,4.523095,9.039125,1.182292,41.360610,6.838974,-0.225238,0.718925,1.536806,0.000000,5.550718,2.997788,0.437500,27.0,1.0,10.0,0.0,0.0,0.0,0.0,3.0,3.0,10.0,1.0,11.0,5.0,0.0,0.0,0.0,3.0,1.04182,101.1937


In [9]:
#put all the column names of the dataframe in a list
columns = []
for column in df_molecules:
    columns.append(column)

In [10]:
#check the correlations between all columns and put the highest correlations in a list
highcorr=[]
allcorr = []
for column1 in range(len(columns)):
    for column2 in range(len(columns)):
        if column1 != column2 and column1>1 and column2>1 and column2>=column1:
            corr = df_molecules[columns[column1]].corr(df_molecules[columns[column2]])
            #print("Correlation between ", columns[column1], " and ", columns[column2], "is: ", round(corr, 2))
            allcorr.append(corr)
            if corr >= 0.80 or corr <= -0.80:
                #all correlations of 0,9 or higher are put in a list
                highcorr.append([columns[column1],columns[column2],round(corr,2)])
print(highcorr)
print(len(highcorr))

[['MaxAbsEStateIndex', 'MaxEStateIndex', 1.0], ['MinEStateIndex', 'VSA_EState1', -0.83], ['MolWt', 'HeavyAtomMolWt', 1.0], ['MolWt', 'ExactMolWt', 1.0], ['MolWt', 'NumValenceElectrons', 0.96], ['MolWt', 'BertzCT', 0.84], ['MolWt', 'Chi0', 0.97], ['MolWt', 'Chi0n', 0.92], ['MolWt', 'Chi0v', 0.97], ['MolWt', 'Chi1', 0.95], ['MolWt', 'Chi1n', 0.89], ['MolWt', 'Chi1v', 0.93], ['MolWt', 'Chi2n', 0.81], ['MolWt', 'Chi2v', 0.86], ['MolWt', 'Chi3v', 0.8], ['MolWt', 'Kappa1', 0.95], ['MolWt', 'Kappa2', 0.83], ['MolWt', 'LabuteASA', 0.98], ['MolWt', 'HeavyAtomCount', 0.96], ['MolWt', 'MolMR', 0.95], ['HeavyAtomMolWt', 'ExactMolWt', 1.0], ['HeavyAtomMolWt', 'NumValenceElectrons', 0.95], ['HeavyAtomMolWt', 'BertzCT', 0.85], ['HeavyAtomMolWt', 'Chi0', 0.96], ['HeavyAtomMolWt', 'Chi0n', 0.9], ['HeavyAtomMolWt', 'Chi0v', 0.95], ['HeavyAtomMolWt', 'Chi1', 0.94], ['HeavyAtomMolWt', 'Chi1n', 0.86], ['HeavyAtomMolWt', 'Chi1v', 0.92], ['HeavyAtomMolWt', 'Chi2v', 0.84], ['HeavyAtomMolWt', 'Kappa1', 0.94], 

In [11]:
#put all the columns with high correlation in a list (except the first ones to have the high correlation like MolWt)
dupe_col = []
for i in range(len(highcorr)):
    if highcorr[i][1] not in dupe_col:
        dupe_col.append(highcorr[i][1])
print(len(dupe_col))

40


In [12]:
descriptors = ['MaxAbsEStateIndex', 'MinAbsEStateIndex',
       'MinEStateIndex', 'qed', 'MolWt', 'NumRadicalElectrons',
       'MaxPartialCharge', 'MinPartialCharge', 'FpDensityMorgan1',
       'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
       'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
       'AvgIpc', 'BalabanJ', 'HallKierAlpha', 'Ipc', 'PEOE_VSA1', 'PEOE_VSA10',
       'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2',
       'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2',
       'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8',
       'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12',
       'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA7', 'SlogP_VSA8',
       'SlogP_VSA9', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11',
       'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5',
       'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9',
       'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5',
       'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'FractionCSP3',
       'NHOHCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
       'NumAliphaticRings', 'NumAromaticHeterocycles', 'NumAromaticRings',
       'NumSaturatedHeterocycles', 'RingCount','MolLogP']
for i in dupe_col:    
    df_molecules = df_molecules.drop(columns = [i])
print(df_molecules)

columns_new = []
for descriptor in df_molecules:
    columns_new.append(descriptor)
print(columns_new)

                                                SMILES ALDH1-inhibitor  \
0    COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...               1   
1               O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1               1   
2    Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...               1   
3                    CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1               1   
4    CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21               1   
..                                                 ...             ...   
995             COc1ccc(N2C(=O)CC([NH2+]C3CC3)C2=O)cc1               0   
996                       CCNc1oc(COc2cccc(C)c2)nc1C#N               0   
997                          NC(=O)Cn1cnc(-c2ccccc2)c1               0   
998  Cc1cc(NC(=O)CSc2nc3c(c(=O)n(C)c(=O)n3C)n2C(C)C...               0   
999          O=C(Cn1nnc2c(cnn2-c2ccccc2)c1=O)NCc1cccs1               0   

     MaxAbsEStateIndex  MinAbsEStateIndex  MinEStateIndex       qed    MolWt  \
0            13.083531         

# Remove Outliers

In [13]:
def find_outliers(df, column):
    q1=df[column].quantile(0.25)
    q3=df[column].quantile(0.75)
    IQR=q3-q1
    lower_outliers = df[column][(df[column]<(q1-IQR))]
    upper_outliers = df[column][(df[column]>(q3+IQR))]
    frames = [lower_outliers, upper_outliers]
    outliers = pd.concat(frames)
    print(outliers)
    return outliers

#outliers = find_outliers(df_molecules_1, 'MaxAbsEStateIndex')  
#outliers

In [14]:
descriptors = ['MaxAbsEStateIndex', 'MinAbsEStateIndex',
       'MinEStateIndex', 'qed', 'MolWt', 'NumRadicalElectrons',
       'MaxPartialCharge', 'MinPartialCharge', 'FpDensityMorgan1',
       'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
       'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
       'AvgIpc', 'BalabanJ', 'HallKierAlpha', 'Ipc', 'PEOE_VSA1', 'PEOE_VSA10',
       'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2',
       'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2',
       'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8',
       'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12',
       'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA7', 'SlogP_VSA8',
       'SlogP_VSA9', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11',
       'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5',
       'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9',
       'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5',
       'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'FractionCSP3',
       'NHOHCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
       'NumAliphaticRings', 'NumAromaticHeterocycles', 'NumAromaticRings',
       'NumSaturatedHeterocycles', 'RingCount','MolLogP']
def remove_outliers(df, column):
    outliers = find_outliers(df, column)
    df_outliers = pd.DataFrame.from_dict(outliers)
    list_indexes_outliers = list(df_outliers.index.values)
    df_column = df.filter([column], axis=1)
    for i in range(len(list_indexes_outliers)):
        index_to_drop = list_indexes_outliers[i]
#         df_column.drop(index_to_drop)
        df_column[column][index_to_drop] = 'NaN'
    return df_column

pd.set_option('display.max_rows', 1000)
for descriptor in descriptors:
    df_molecules[descriptor]=remove_outliers(df_molecules, descriptor)
    

13      6.011299
52      9.679179
85      5.728289
93     10.383623
98      5.396740
112     9.651783
128     9.345715
129     4.523711
133     4.354542
135     5.091186
142    10.372195
144    10.082781
160     5.338834
198     5.359724
202     5.419591
216     5.807513
220     5.322822
252     4.541634
268     5.558917
277     9.389074
288     5.268563
289     5.116451
290     8.400719
297    10.428982
301     5.363287
302     4.118715
303    10.466205
305    10.401183
306    10.336111
311     9.548955
312    10.159860
316     2.502315
317     9.415264
320     9.487593
323     8.391575
326    10.255484
328     8.706713
329     5.229316
332     5.510613
334     5.510603
337     5.281053
341     6.051410
347     5.933225
357     5.964657
362     6.059950
378     5.837037
404     9.347802
419    10.202779
434     5.557464
438     5.777637
439     6.142864
445     5.537205
459     9.394551
461     6.147929
464     5.441153
465     9.679852
469     5.604206
478    10.434480
481     5.4783

6      12.083478
26     12.124355
251    23.877221
294    12.062545
299    12.062545
319    24.011646
321    12.062545
346    12.186480
379    17.981224
380    12.000420
381    18.611925
386    12.580810
521    12.000420
533    12.062545
538    16.910451
574    12.186480
676    12.000420
691    17.251130
716    12.347974
729    12.124355
734    17.808464
772    17.069230
831    12.124355
843    23.427429
859    12.186480
865    12.062545
938    12.103705
953    12.103705
964    18.031534
988    12.103705
993    24.373527
Name: PEOE_VSA14, dtype: float64
28     23.517811
41     18.950711
75     19.283521
89     18.950711
99     19.475955
105    25.760438
121    25.757446
141    39.913620
143    29.397840
145    19.283521
154    19.808765
156    19.283521
158    20.685574
161    25.023174
177    23.201880
187    19.703393
207    18.950711
208    21.950130
211    19.170787
212    20.007537
231    20.228637
232    29.292467
238    19.703393
241    30.336970
251    23.972686
259    25.02317

9       5.261892
12      5.261892
36      5.261892
45     10.523783
52     10.523783
107     5.261892
109     5.261892
128     5.261892
131     5.261892
140     5.261892
149     5.261892
205     5.261892
224     5.261892
225     5.261892
290     5.261892
323     5.261892
328     5.261892
339     5.261892
342     5.261892
343     5.261892
352     5.409284
353     5.261892
390     5.261892
410     5.261892
522     5.261892
523     5.261892
565    10.523783
581     5.261892
602     5.261892
614     5.261892
618    10.523783
647     5.261892
653     5.261892
700     5.261892
736     5.261892
749     5.261892
772    10.523783
775     5.261892
863     5.261892
888     5.261892
899     5.261892
907     5.261892
913     5.261892
921     5.261892
925     5.261892
943    15.785675
948     5.409284
954     5.261892
973     5.261892
996     5.261892
Name: SMR_VSA2, dtype: float64
0      30.840832
2      25.191233
5      24.512471
14     29.931018
16     29.931018
35     29.716399
62     25.107165


39     32.675606
46     27.619031
77     23.684315
88     25.597761
90     23.611718
105    35.419753
157    25.874685
167    28.555676
198    31.165325
200    27.938743
220    26.428462
222    26.428462
276    32.675606
289    29.172610
291    38.198564
325    23.785040
334    24.425661
338    32.675606
345    23.818813
358    27.938743
367    27.938743
388    25.811529
494    44.893479
497    27.938743
530    24.053979
556    51.140623
570    24.229729
631    27.938743
639    29.552481
643    25.811529
673    34.802820
694    28.185858
818    23.364603
831    27.938743
880    27.055449
896    26.428462
953    32.675606
965    27.724975
988    37.513194
Name: EState_VSA9, dtype: float64
26     46.555306
28     51.422721
75     41.415712
87     45.081141
89     42.932622
141    55.126673
143    51.474491
156    54.081729
173    47.250321
192    48.909029
208    63.636560
214    43.388932
232    57.172417
241    44.931670
251    58.793126
259    46.009793
264    43.703637
282    59.7404

4     -3.251317
5     -3.540253
15    -3.594898
17    -4.006749
25    -2.190141
27    -3.664604
32    -3.740406
38    -3.836039
39    -3.747172
40    -1.785816
50    -3.784519
53    -3.696685
54    -3.509259
56    -2.343597
62    -2.419766
69    -3.472980
71    -3.628322
79    -2.150868
92    -5.818713
100   -2.685423
101   -3.458795
116   -3.577749
117   -3.535194
137   -4.112639
138   -3.769743
157   -3.126599
168   -1.783759
175   -7.679243
176   -3.891146
183   -3.450530
184   -7.786946
188   -3.999725
193   -1.700100
210   -7.847588
217   -7.650251
226   -4.027726
240   -3.839071
243   -3.684578
263   -2.233541
266   -2.260282
269   -3.559521
270   -3.685942
273   -7.835871
274   -3.619548
275   -2.756573
292   -3.742701
313   -1.703093
330   -3.126008
349   -3.486096
350   -3.860803
352   -3.551819
354   -2.768836
361   -3.814401
374   -3.085919
377   -3.052072
392   -3.949884
403   -7.766624
416   -2.609666
422   -3.824526
427   -3.616296
447   -3.665936
448   -3.160253
477   -3

In [15]:
df_molecules


Unnamed: 0,SMILES,ALDH1-inhibitor,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,AvgIpc,BalabanJ,HallKierAlpha,Ipc,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,NHOHCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticHeterocycles,NumAromaticRings,NumSaturatedHeterocycles,RingCount,MolLogP
0,COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...,1,13.083531,0.001173,-0.68314,0.520365,463.542,0.0,0.249868,-0.496477,1.088235,16.465857,10.012387,2.277377,-2.329164,2.213373,-2.524368,5.869762,-0.128181,3.34565,1.374647,-3.95,,20.27035,11.791353,0.0,,0.0,0.0,9.589074,0.0,4.681803,0.0,41.496884,47.030966,32.475912,12.797184,14.325937,17.762699,0.0,,0.0,38.269884,25.09922,60.160755,0.0,11.436898,20.27035,5.948339,5.749512,0.0,,16.009896,0.0,0.0,5.687386,0.0,114.27,6.041841,9.589074,0.0,17.8562,31.208186,24.092481,6.923737,11.7916,54.597304,31.058939,4.736863,26.972964,18.075462,1.758225,0.920982,1.837103,2.822263,1.591488,0.375,2.0,0.0,1.0,1.0,1.0,3.0,1.0,4.0,1.5033
1,O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1,1,12.170097,0.066966,-0.066966,0.498564,378.457,0.0,0.230353,-0.467476,1.111111,32.166556,10.202245,2.140429,-2.083839,2.235385,-2.240774,7.993662,-0.118316,3.338816,1.48509,-3.06,3137659.249845,9.73394,11.276948,10.803614,5.90718,0.0,0.0,9.361637,9.967957,0.0,0.0,42.09372,29.82892,6.196844,25.105529,9.211688,28.832943,0.0,19.851845,0.0,18.245949,5.752854,78.381009,0.0,0.0,5.316789,0.0,0.0,11.761885,26.19509,17.88405,0.0,0.0,11.163878,0.0,72.95,0.0,4.794537,0.0,11.660033,13.089513,27.644013,11.761885,18.526374,36.398202,31.984579,4.417151,21.29517,3.630481,2.804024,0.940073,3.352383,1.040755,0.0,0.15,1.0,0.0,0.0,0.0,,4.0,0.0,4.0,3.4811
2,Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...,1,10.905837,0.016881,-0.016881,0.382043,477.589,0.0,0.21993,-0.492903,1.228571,16.492058,9.989823,2.359046,-2.317898,2.246545,,5.824876,,,1.509009,-3.3,,19.160451,,6.041841,11.704393,0.0,0.0,0.0,9.665781,0.0,5.098682,32.046576,53.954703,17.911012,17.343315,19.160451,10.902925,0.0,,5.917906,,13.151638,65.37292,0.0,5.879988,4.89991,0.0,5.879988,0.0,49.553366,17.826376,12.841643,0.0,10.902925,0.0,103.53,0.0,5.106527,0.0,23.943702,13.089513,,4.89991,6.263163,41.937375,,9.154014,5.7896,,2.688982,1.918367,3.929994,,0.0,0.461538,2.0,0.0,1.0,1.0,,4.0,1.0,5.0,2.83782
3,CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1,1,11.562446,0.270607,-0.454447,0.795948,330.609,0.0,0.351723,-0.421732,1.333333,,10.173315,2.116608,-2.205938,2.302152,-2.211289,9.103314,,2.372515,,-1.0,12071.356717,9.317061,10.05574,0.0,0.0,0.0,5.625586,0.0,4.794537,0.0,0.0,11.60094,41.910152,30.22949,5.022633,4.417151,,0.0,0.0,0.0,13.847474,17.989423,38.114578,0.0,0.0,10.525496,5.687386,0.0,,13.089513,0.0,0.0,,10.969244,0.0,33.45,5.625586,4.794537,0.0,4.47272,10.605653,24.163123,0.0,0.0,18.199101,34.677328,16.018091,13.738424,1.130921,1.079478,0.0,0.0,5.962156,0.0,0.307692,0.0,0.0,0.0,0.0,1.0,2.0,0.0,2.0,4.0551
4,CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21,1,12.108866,0.086947,,0.687618,419.553,0.0,0.231765,-0.301646,1.296296,32.233291,10.071048,2.259832,-2.158781,2.341424,-2.269383,7.916459,-0.115075,3.329276,1.497501,-2.23,1948182.470883,5.316789,0.0,5.131558,,0.0,0.0,9.099753,13.401776,0.0,,12.132734,35.563437,22.365418,24.057905,13.212334,,0.0,4.983979,0.0,12.841643,22.42253,51.53256,0.0,11.257379,9.622005,10.818945,0.0,22.673572,32.109481,,0.0,0.0,11.257379,0.0,79.37,10.023291,13.212334,0.0,5.90718,24.517958,27.385364,21.897771,11.336786,41.091961,10.300767,0.0,17.62235,7.239143,3.417866,-0.086947,2.249137,0.470945,,0.222222,1.0,0.0,1.0,1.0,2.0,3.0,0.0,4.0,3.3749
5,CC1CCCCN1S(=O)(=O)c1ccc(NC(=O)c2cccc(-n3cnnn3)...,1,12.922752,0.004081,,0.671116,426.502,0.0,0.255237,-0.321996,1.133333,32.233428,10.075367,2.3032,-2.318138,2.256505,-2.413883,7.887581,0.102201,3.293492,1.485109,-3.11,,5.316789,6.32732,0.0,10.023291,5.90718,0.0,4.794537,13.0996,4.305216,5.098682,12.487189,,23.837435,10.58287,13.212334,21.617857,0.0,,0.0,37.123526,11.861545,60.421708,0.0,5.687386,5.316789,5.687386,0.0,0.0,51.424045,10.023291,0.0,0.0,5.687386,0.0,110.08,10.023291,13.212334,0.0,16.844504,23.48298,19.262465,23.141857,40.703418,6.923737,20.842241,0.0,12.840953,13.77278,1.601899,-0.314644,4.240751,2.48057,,0.3,1.0,0.0,1.0,1.0,1.0,3.0,1.0,4.0,2.4777
6,COCCN(C(=O)C(F)(F)F)C(C(=O)NC1CCCCC1)c1ccco1,1,13.056183,0.021285,,0.7944,376.375,0.0,,-0.466579,1.346154,19.413382,9.948701,2.469655,-2.317543,2.283504,-2.508217,5.900644,-0.189318,3.114639,2.269832,-2.03,634775.899915,19.370712,5.760247,6.041841,0.0,5.90718,,9.589074,0.0,,0.0,19.262465,24.974377,19.696395,12.870045,,11.814359,0.0,10.216698,0.0,50.364088,20.261436,24.156145,0.0,0.0,5.316789,,0.0,0.0,49.193844,14.325937,0.0,0.0,0.0,0.0,71.78,,22.76032,0.0,18.40897,4.89991,32.104108,25.505695,0.0,0.0,5.316789,9.154014,25.159167,2.769466,0.0,,0.646106,-0.533697,1.305796,0.647059,1.0,,0.0,1.0,1.0,1.0,0.0,2.0,2.8069
7,COc1ccccc1N1CCN(C(=O)CSc2nnc(-c3ccoc3C)o2)CC1,1,12.584322,0.063813,0.063813,0.569121,414.487,0.0,0.276863,-0.494586,1.172414,32.166567,10.197434,2.233153,-2.361292,2.289213,-2.489812,7.992619,-0.128157,3.403195,1.293545,-2.94,,23.370984,11.509759,0.0,5.90718,11.11348,0.0,4.794537,0.0,0.0,,23.894619,25.122838,26.179026,30.376652,18.365702,23.356451,0.0,15.097273,0.0,12.146493,,42.355246,0.0,17.203687,9.636773,5.687386,5.749512,11.761885,60.04613,4.794537,6.923737,0.0,11.454175,0.0,84.84,0.0,4.794537,0.0,11.660033,29.96324,30.089862,11.761885,19.439328,36.089115,15.097273,13.571165,16.699108,8.418292,1.826428,,1.57925,4.698942,1.673263,0.35,0.0,0.0,1.0,1.0,2.0,3.0,1.0,4.0,3.08752
8,O=c1c2ccccc2nc(SCc2ccccc2)n1Cc1ccco1,1,12.954808,0.043924,-0.043924,0.397779,348.427,0.0,0.261959,-0.46745,0.96,32.166555,10.224715,2.127083,-2.11785,2.224782,-2.233738,7.979381,0.476379,3.166904,1.75553,-2.79,1138679.266727,4.417151,5.760247,5.156436,0.0,5.559267,0.0,9.361637,4.983979,0.0,0.0,,29.82892,5.752854,23.710844,4.417151,22.66481,0.0,9.551078,0.0,17.454046,0.0,,0.0,0.0,5.559267,0.0,0.0,11.761885,9.551078,12.29761,0.0,0.0,10.902925,0.0,48.03,0.0,4.794537,0.0,5.559267,17.087417,17.029802,5.563451,22.592148,54.597304,12.132734,9.401129,17.670677,1.321361,1.871087,1.490543,1.616697,0.375784,0.0,0.1,0.0,0.0,0.0,0.0,2.0,4.0,0.0,4.0,4.3301
9,N#Cc1c(NC(=O)CSc2nnnn2C2CCCC2)sc2c1CCCC2,1,12.408686,0.119381,-0.119381,0.789317,388.522,0.0,0.23499,-0.315858,1.307692,32.1666,9.959128,2.236177,-2.102287,2.374992,-2.151675,7.993683,-0.113344,3.378085,1.402493,-1.92,1838029.429434,5.316789,11.070303,0.0,11.063616,0.0,0.0,4.794537,4.681803,5.261892,,24.603528,54.515151,4.877147,17.358146,4.794537,34.006933,,20.207255,0.0,,11.069642,16.00405,0.0,6.069221,5.316789,5.001082,0.0,23.098671,31.867288,17.63618,11.331113,0.0,0.0,0.0,96.49,0.0,10.056429,0.0,11.660033,21.762811,44.088381,29.480675,11.336786,4.681803,26.911462,0.0,13.659547,,1.787702,0.117255,,0.0,0.0,0.588235,1.0,,0.0,2.0,2.0,2.0,0.0,4.0,3.33098


# Data scaling

In [16]:
#MinMax scaling because the data is not gaussian distributed
scaler = MinMaxScaler()

#dataframe to use
scaled_df = df_molecules
for column in descriptors:
    scaled_df[column] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(df_molecules[column])),columns=[column])
scaled_df

Unnamed: 0,SMILES,ALDH1-inhibitor,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,AvgIpc,BalabanJ,HallKierAlpha,Ipc,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,NHOHCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticHeterocycles,NumAromaticRings,NumSaturatedHeterocycles,RingCount,MolLogP
0,COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...,1,0.733731,0.003397,0.428308,0.38294,0.872736,0.0,0.376004,0.21788,0.405229,0.084394,0.331306,0.588774,0.257895,0.354532,0.284041,0.233313,0.484639,0.840369,0.164616,0.015773,,0.824322,0.505809,0.0,,0.0,0.0,0.625807,0.0,0.469685,0.0,0.803215,0.736686,0.844116,0.34843,0.501008,0.335708,0.0,,0.0,0.714466,0.57692,0.618077,0.0,0.513581,0.899228,0.5,0.5,0.0,,0.594346,0.0,0.0,0.260819,0.0,0.885243,0.25492,0.4,0.0,0.57977,0.766934,0.5241,0.206437,0.265784,0.954687,0.860618,0.204159,0.651664,0.895911,0.495158,0.704828,0.520432,0.502582,0.710679,0.579545,0.666667,0.0,0.5,0.5,0.5,0.666667,0.5,0.666667,0.291468
1,O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1,1,0.468918,0.195843,0.702178,0.351103,0.605001,0.0,0.296848,0.301519,0.438272,0.838092,0.689205,0.33019,0.682828,0.41628,0.616262,0.715999,0.492654,0.835729,0.23722,0.29653,0.642006,0.395844,0.483743,0.817603,0.49654,0.0,0.0,0.610964,0.592076,0.0,0.0,0.814767,0.435731,0.161069,0.68355,0.322152,0.640589,0.0,0.824202,0.0,0.340636,0.132233,0.825333,0.0,0.0,0.235862,0.0,0.0,0.5,0.41407,0.663922,0.0,0.0,0.511967,0.0,0.497844,0.0,0.2,0.0,0.378587,0.321672,0.601359,0.35069,0.417586,0.636458,0.886267,0.190379,0.514489,0.202889,0.66428,0.71012,0.684079,0.335113,0.300884,0.231818,0.333333,0.0,0.0,0.0,,1.0,0.0,0.666667,0.673688
2,Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...,1,0.102398,0.049342,0.724439,0.180946,0.916937,0.0,0.254573,0.228186,0.607937,0.085652,0.28877,0.742981,0.277408,0.447584,,0.223112,,,0.252944,0.22082,,0.779186,,0.457238,0.983837,0.0,0.0,0.0,0.574128,0.0,0.775001,0.620294,0.857819,0.465544,0.472208,0.670082,0.146786,0.0,,0.592691,,0.302298,0.677366,0.0,0.264045,0.217369,0.0,0.511347,0.0,0.783297,0.661781,0.634825,0.0,0.5,0.0,0.784549,0.0,0.213014,0.0,0.777424,0.321672,,0.146095,0.141172,0.733316,,0.394538,0.139876,,0.645676,0.981292,0.74646,,0.300884,0.713287,0.666667,0.0,0.5,0.5,,1.0,0.5,0.833333,0.549371
3,CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1,1,0.292755,0.791495,0.529955,0.785379,0.454439,0.0,0.789132,0.433449,0.759259,,0.63467,0.285211,0.471339,0.603573,0.650803,0.968183,,0.179627,,0.946372,0.002415,0.378891,0.431357,0.0,0.0,0.0,0.471209,0.0,0.284786,0.0,0.0,0.224548,0.647096,0.785727,0.136752,0.154477,,0.0,0.0,0.0,0.258521,0.413497,0.367302,0.0,0.0,0.466929,0.478065,0.0,,0.206908,0.0,0.0,,0.503041,0.0,0.127508,0.237357,0.2,0.0,0.145224,0.260632,0.525637,0.0,0.0,0.318229,0.960881,0.690379,0.331919,0.082969,0.385394,0.449542,0.322028,0.797745,0.300884,0.475524,0.0,0.0,0.0,0.0,0.5,0.333333,0.0,0.333333,0.784617
4,CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21,1,0.451167,0.254286,,0.627182,0.734317,0.0,0.302576,0.779787,0.705761,0.841296,0.441888,0.555645,0.553019,0.713739,0.582748,0.698454,0.495287,0.829251,0.245379,0.55836,0.398602,0.216215,0.0,0.388349,,0.0,0.0,0.593872,0.796038,0.0,,0.234842,0.536058,0.581323,0.655027,0.462063,,0.0,0.206923,0.0,0.239743,0.515395,0.519931,0.0,0.50552,0.426849,0.909409,0.0,0.963858,0.507559,,0.0,0.0,0.516255,0.0,0.558035,0.422907,0.551141,0.0,0.191799,0.602523,0.595732,0.652899,0.255532,0.718533,0.285426,0.0,0.425754,0.376021,0.763547,0.425442,0.564931,0.281548,,0.343434,0.333333,0.0,0.5,0.5,1.0,0.666667,0.0,0.666667,0.653165
5,CC1CCCCN1S(=O)(=O)c1ccc(NC(=O)c2cccc(-n3cnnn3)...,1,0.68712,0.011903,,0.603084,0.756183,0.0,0.397779,0.721096,0.47037,0.841302,0.450029,0.637532,0.276993,0.475525,0.413471,0.691891,0.671808,0.804954,0.237233,0.280757,,0.216215,0.271421,0.0,0.842529,0.5,0.0,0.312903,0.77809,0.431906,0.775001,0.241702,,0.619584,0.288141,0.462063,0.441882,0.0,,0.0,0.693065,0.272645,0.621046,0.0,0.255396,0.235862,0.478065,0.0,0.0,0.812868,0.372101,0.0,0.0,0.260819,0.0,0.845959,0.422907,0.551141,0.0,0.546921,0.577089,0.419029,0.689993,0.917459,0.121068,0.577522,0.0,0.310236,0.689482,0.469877,0.362327,0.780022,0.470461,,0.463636,0.333333,0.0,0.5,0.5,0.5,0.666667,0.5,0.666667,0.479776
6,COCCN(C(=O)C(F)(F)F)C(C(=O)NC1CCCCC1)c1ccco1,1,0.725802,0.062224,,0.783119,0.59845,0.0,,0.304106,0.777778,0.225887,0.211252,0.95183,0.278023,0.551263,0.302962,0.240332,0.43497,0.683517,0.753103,0.621451,0.129839,0.787737,0.247095,0.457238,0.0,0.5,,0.625807,0.0,,0.0,0.372845,0.3508,0.51195,0.350414,,0.171888,0.0,0.424173,0.0,0.940255,0.465721,0.208524,0.0,0.0,0.235862,,0.0,0.0,0.777614,0.531831,0.0,0.0,0.0,0.0,0.486874,,0.949427,0.0,0.597717,0.120414,0.698382,0.760473,0.0,0.0,0.147324,0.394538,0.607843,0.161581,0.210825,,0.391806,0.187107,0.637116,1.0,0.333333,,0.0,0.5,0.5,0.0,0.0,0.333333,0.543395
7,COc1ccccc1N1CCN(C(=O)CSc2nnc(-c3ccoc3C)o2)CC1,1,0.589006,0.18662,0.760305,0.454138,0.718376,0.0,0.485497,0.223332,0.52682,0.838093,0.680136,0.50527,0.202245,0.567278,0.324523,0.715762,0.484659,0.879441,0.1113,0.334385,,0.950413,0.49373,0.0,0.49654,0.940676,0.0,0.312903,0.0,0.0,,0.462505,0.353397,0.680447,0.827068,0.642288,0.489763,0.0,0.626803,0.0,0.226765,,0.415539,0.0,0.772542,0.427504,0.478065,0.5,0.5,0.949158,0.177991,0.342274,0.0,0.52528,0.0,0.609319,0.0,0.2,0.0,0.378587,0.73634,0.654565,0.35069,0.438164,0.631053,0.418333,0.584917,0.403449,0.432592,0.506187,,0.492584,0.678998,0.731735,0.540909,0.0,0.0,0.5,0.5,1.0,0.666667,0.5,0.666667,0.597627
8,O=c1c2ccccc2nc(SCc2ccccc2)n1Cc1ccco1,1,0.696413,0.128445,0.712419,0.203925,0.510507,0.0,0.425045,0.301594,0.22,0.838092,0.731563,0.304989,0.623917,0.386536,0.624505,0.712753,0.975802,0.719003,0.415005,0.381703,0.232953,0.17963,0.247095,0.390232,0.0,0.470552,0.0,0.610964,0.296038,0.0,0.0,,0.435731,0.149528,0.645577,0.154477,0.470715,0.0,0.396538,0.0,0.325852,0.0,,0.0,0.0,0.246619,0.0,0.0,0.5,0.150975,0.456532,0.0,0.0,0.5,0.0,0.264204,0.0,0.2,0.0,0.180503,0.419919,0.370461,0.165879,0.509229,0.954687,0.336188,0.405188,0.426922,0.092106,0.513409,0.862704,0.496628,0.272602,0.300884,0.154545,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.666667,0.837762
9,N#Cc1c(NC(=O)CSc2nnnn2C2CCCC2)sc2c1CCCC2,1,0.538087,0.349155,0.678881,0.775696,0.636673,0.0,0.315656,0.7388,0.722222,0.838094,0.230908,0.510981,0.650874,0.807905,0.720638,0.716004,0.496693,0.862392,0.182921,0.656151,0.376062,0.216215,0.474879,0.0,0.929975,0.0,0.0,0.312903,0.27809,0.527881,,0.476227,0.867625,0.126767,0.472612,0.167675,0.783084,,0.838958,0.0,,0.254442,0.115794,0.0,0.272542,0.235862,0.420376,0.0,0.981929,0.503731,0.65472,0.560152,0.0,0.0,0.0,0.718545,0.0,0.419495,0.0,0.378587,0.534816,0.959084,0.87899,0.255532,0.081866,0.745695,0.0,0.330013,,0.499925,0.482044,,0.237277,0.300884,0.909091,0.333333,,0.0,1.0,1.0,0.333333,0.0,0.666667,0.644677


In [17]:
scaled_df_1 = scaled_df[scaled_df['ALDH1-inhibitor']=='1']
scaled_df_0 = scaled_df[scaled_df['ALDH1-inhibitor']=='0']
corr_df_1= scaled_df_1.copy()
corr_df_0=scaled_df_0.copy()

In [18]:
scaled_df.corr()

  scaled_df.corr()


Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,AvgIpc,BalabanJ,HallKierAlpha,Ipc,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,NHOHCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticHeterocycles,NumAromaticRings,NumSaturatedHeterocycles,RingCount,MolLogP
MaxAbsEStateIndex,1.0,-0.111562,-0.053491,-0.089767,0.562445,,0.014501,0.072793,-0.297042,0.061701,-0.15179,0.403396,-0.323483,0.191793,-0.278736,0.078413,0.128414,0.385645,-0.298299,-0.35983,0.311926,0.037893,0.186423,0.076738,0.122348,0.192262,-0.013737,0.196816,0.228667,0.047009,0.010712,0.137822,0.303566,0.169174,0.193484,0.192287,0.108666,,0.15151,-0.058202,0.131047,0.167982,0.365378,,0.053615,0.092756,0.123231,-0.001768,-0.049548,0.224829,0.236958,0.067814,,0.115262,,0.144137,0.210274,0.259887,,0.210439,0.229328,0.123303,0.162916,0.140288,0.071,-0.036029,-0.013139,0.173372,-0.14617,-0.004674,-0.066495,0.027473,0.048221,0.036395,0.00933,-0.067735,,0.244254,0.224539,0.075643,0.334484,0.161042,0.547232,0.221881
MinAbsEStateIndex,-0.111562,1.0,0.06802,0.084876,-0.213581,,-0.00056,-0.008972,-0.001419,-0.066323,0.087062,-0.19373,0.121288,-0.056407,0.189541,-0.082865,0.176107,-0.119183,0.097947,0.04728,-0.107658,-0.004667,0.004045,-0.039087,-0.124149,-0.010746,-0.047616,-0.139218,-0.007256,-0.039549,-0.047011,-0.03153,-0.044892,-0.090518,-0.111614,-0.168145,-0.071338,,-0.032111,-0.030027,-0.175805,-0.055978,0.018263,,0.016221,-0.01485,0.021653,-0.00951,-0.05549,-0.172781,-0.203161,-0.062289,,0.023579,,-0.140151,-0.115943,-0.253318,,-0.39061,-0.043702,-0.033925,-0.166537,-0.012697,0.079759,1.6e-05,0.005886,-0.156784,-0.076458,0.076816,0.01226,0.004827,0.002751,-0.035011,-0.157017,-0.014158,,-0.087224,-0.122582,0.059897,0.039537,-0.051078,-0.075087,0.035519
MinEStateIndex,-0.053491,0.06802,1.0,0.219844,-0.155934,,-0.396634,0.229562,0.032832,0.169732,0.148554,-0.156019,0.101234,-0.107878,0.259553,0.141257,0.387155,0.027143,-0.061213,0.26655,-0.097642,-0.184052,-0.06912,0.020169,0.027625,-0.186965,-0.285443,-0.22338,-0.134971,0.05034,0.137069,-0.038658,0.011634,0.073897,-0.085635,-0.476174,-0.0732,,0.105229,-0.026366,-0.050462,0.094818,-0.095436,,0.099729,-0.136783,-0.163139,0.0426,0.146497,-0.071214,-0.162498,-0.153422,,0.087163,,-0.460876,-0.704783,-0.712964,,-0.360918,-0.016287,0.244847,-0.091375,-0.149923,0.221011,0.285715,0.045962,-0.482442,-0.171098,0.189781,0.554182,0.11382,0.083837,0.071958,0.046205,-0.034382,,0.012533,-0.003493,0.137183,0.044788,0.01904,0.046576,0.080692
qed,-0.089767,0.084876,0.219844,1.0,-0.409941,,-0.20319,0.082086,0.160578,0.001919,0.025323,0.067751,-0.098799,-0.011052,0.007062,-0.043622,0.207069,-0.234631,0.205743,0.377548,-0.272514,-0.080942,-0.002355,-0.110244,-0.028386,-0.144055,-0.163405,-0.266263,-0.121965,-0.025245,-0.049392,-0.040742,-0.095373,-0.099199,-0.178759,-0.159498,-0.204621,,-0.184133,-0.040695,0.037071,0.020134,-0.235409,,-0.07038,-0.108175,-0.01048,-0.039372,-0.017553,-0.194859,-0.096834,-0.071916,,-0.185756,,-0.443357,-0.159748,-0.235907,,-0.187072,-0.133023,0.017196,-0.102517,-0.08239,-0.00075,-0.050112,-0.085326,-0.278499,-0.155954,-0.073333,0.107018,-0.055671,0.039108,-0.019652,0.117945,-0.068348,,0.09219,0.134254,-0.130245,-0.32998,0.021154,-0.194986,-0.122572
MolWt,0.562445,-0.213581,-0.155934,-0.409941,1.0,,0.163698,-0.069306,-0.353387,0.222286,-0.118276,0.327095,-0.304319,0.196782,-0.242847,0.254695,-0.070747,0.55633,-0.557021,-0.537621,0.66663,0.255686,0.129335,0.170311,0.229758,0.249872,0.097465,0.248601,0.263475,0.053803,7.8e-05,0.215365,0.446616,0.291172,0.325512,0.400561,0.387038,,0.223477,-0.011645,0.179099,0.345827,0.409163,,0.145892,0.266111,0.131982,0.11791,0.155597,0.518009,0.399138,-0.00677,,0.138943,,0.48977,0.262014,0.363418,,0.387381,0.426467,0.104346,0.200838,0.218048,0.107037,0.085273,0.183615,0.350326,-0.037831,0.030699,-0.031403,-0.009451,0.099994,0.021828,0.087139,0.053379,,0.237698,0.191843,0.092145,0.42798,0.197605,0.580255,0.277331
NumRadicalElectrons,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
MaxPartialCharge,0.014501,-0.00056,-0.396634,-0.20319,0.163698,,1.0,-0.105956,-0.028004,-0.103354,-0.014054,0.130585,0.029284,0.00464,-0.200827,-0.048824,-0.233601,0.048607,-0.03432,-0.213976,0.110136,0.086912,0.056705,-0.082683,-0.094959,0.269545,0.762296,0.218539,0.194615,0.0565,-0.067838,-0.04456,0.010105,-0.098794,0.07893,0.306904,0.104942,,-0.000996,-0.044687,-0.009706,0.021305,0.06052,,-0.082256,0.130535,0.134058,-0.004463,-0.061238,0.151861,0.103645,0.015588,,0.003819,,0.352213,0.290367,0.360567,,0.226669,0.001729,-0.101383,0.105544,0.110801,-0.163017,-0.118921,0.039266,0.414818,-0.009165,-0.145569,-0.28798,-0.051082,-0.078921,0.050894,-0.028453,0.035047,,0.021798,-0.001202,0.005837,0.016402,-0.012172,-0.004552,-0.073843
MinPartialCharge,0.072793,-0.008972,0.229562,0.082086,-0.069306,,-0.105956,1.0,-0.033804,0.191322,0.03629,0.044753,-0.003129,-0.110253,-0.05129,0.151351,-0.054004,-0.019694,0.016233,0.225323,-0.117724,-0.548702,-0.263199,-0.166083,0.036116,0.015799,-0.083113,0.093715,0.064464,0.07532,0.091318,0.113611,-0.07938,0.073069,-0.182352,-0.42771,0.13781,,0.14611,0.016485,0.045222,-0.102032,0.00146,,-0.387504,-0.200167,0.045048,-0.570883,0.164031,-0.094262,-0.009788,0.059105,,-0.032585,,-0.16943,-0.042833,-0.075392,,-0.11943,-0.203137,0.111743,0.097037,-0.130057,0.007835,0.130635,-0.257653,-0.048409,-0.095671,0.056551,-0.116927,0.115449,0.060846,-0.225821,0.019627,0.000561,,0.028559,0.027065,0.02949,-0.037163,0.062693,0.0073,-0.001424
FpDensityMorgan1,-0.297042,-0.001419,0.032832,0.160578,-0.353387,,-0.028004,-0.033804,1.0,0.101045,0.092747,-0.018047,0.134572,-0.013724,0.002192,0.133891,-0.117146,-0.083602,0.275864,0.381542,-0.230001,-0.021712,0.035047,0.010481,-0.013262,-0.150499,0.033516,-0.023738,-0.063178,0.071859,0.112142,-0.25926,-0.21587,-0.169681,-0.07648,-0.068449,-0.064105,,0.052825,0.12614,0.008674,-0.140693,-0.481416,,-0.087721,-0.085587,-0.070277,-0.020793,0.189444,-0.08195,-0.124885,0.08602,,-0.053991,,-0.061296,-0.093144,-0.087141,,-0.034353,-0.079117,-0.041427,-0.034438,-0.131033,-0.126369,0.09239,0.027582,-0.074709,0.159744,-0.113975,0.031744,0.040643,0.021441,0.02635,0.167196,0.108229,,0.0164,0.068462,0.120505,-0.335228,-0.039751,-0.291718,-0.283499
BCUT2D_MWHI,0.061701,-0.066323,0.169732,0.001919,0.222286,,-0.103354,0.191322,0.101045,1.0,0.25925,-0.022134,0.133088,0.044554,0.023908,0.798854,0.036136,0.121102,-0.010042,0.210769,-0.006271,-0.19776,-0.074777,-0.012382,0.136639,-0.025518,-0.073711,-0.033744,0.171698,0.106476,0.038283,0.126577,0.060767,0.056501,0.010746,-0.065314,0.539144,,0.08728,-0.046308,-0.094006,0.034815,-0.019462,,-0.051101,-0.059797,0.038716,-0.079123,0.779177,0.030444,0.109002,-0.125262,,0.010329,,0.01657,0.052076,0.004588,,0.020391,0.120091,-0.064713,0.160949,0.108678,-0.058296,0.022231,0.260113,-0.167582,0.020677,-0.102048,0.010358,-0.063163,-0.130515,-0.071616,-0.104168,-0.007383,,-0.050321,-0.095929,0.078817,0.052578,-0.089611,-0.00193,0.129699


# PCA

In [19]:
from sklearn.decomposition import PCA
descriptors_with_ALDH1=descriptors
descriptors_with_ALDH1.insert(0,'ALDH1-inhibitor')
X=scaled_df[descriptors_with_ALDH1].values
X.shape

(1000, 85)

In [20]:
pca_85=PCA(n_components=85)
pca_85.fit(X)

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values