# PYTHON CODE SAMPLE PART 3 - DESCRIPTOR CALCULATION AND DATASET PREPARATION

In this part I will use PaDEL-Descriptor to calculate molecular descriptors, which are essentially quantitative descriptions of the compounds in the dataset. The resulting descriptors will be used as predictor variables for model building in parts 4 and 5.

In [None]:
! wget https://github.com/Baksic-Ivan/Python_template-Computational_Drug_Discovery/blob/main/padel.zip

In [None]:
# Download the padel.sh launcher script that is executed with bash in the
# "Calculating fingerprint descriptors" section of this notebook.
! wget https://raw.githubusercontent.com/Baksic-Ivan/Python_template-Computational_Drug_Discovery/main/padel.sh

In [None]:
# Extract the downloaded padel.zip archive into the current working directory.
! unzip padel.zip

In [None]:
# Optional download, disabled by default: uncomment the line below if
# EGFR_bioactivity_data_3class_pIC50.csv is not already in the working directory
# (it is read by the next cell).
#! wget https://raw.githubusercontent.com/Baksic-Ivan/Python_template-Computational_Drug_Discovery/main/EGFR_bioactivity_data_3class_pIC50.csv

In [4]:
import pandas as pd

# Load the EGFR bioactivity table; the SMILES, ChEMBL ID and pIC50 columns
# are used by the cells below.
df_3class = pd.read_csv(filepath_or_buffer='EGFR_bioactivity_data_3class_pIC50.csv')
df_3class

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,active,383.814,4.45034,3,4,7.387216
1,1,CHEMBL69960,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,active,482.903,3.61432,3,6,6.769551
2,2,CHEMBL137635,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,intermediate,369.432,4.77200,1,6,5.031517
3,3,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,inactive,283.287,2.31056,2,4,3.301030
4,4,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,inactive,164.160,1.49000,2,2,2.522879
...,...,...,...,...,...,...,...,...,...
9013,9013,CHEMBL4520788,CNCc1ccccc1-c1csc([C@H](C)Nc2nc(C)nc3cc(OC)c(O...,active,448.592,5.57642,2,7,7.698970
9014,9014,CHEMBL1800685,O=C(O)c1cc(-c2ccc(C3CCNCC3)cc2)c2ccc(-c3ccc(C(...,active,475.510,7.35780,2,2,8.338187
9015,9015,CHEMBL4088216,CN1C(=O)[C@@H](N2CCc3cn(Cc4ccccc4)nc3C2=O)COc2...,active,402.454,2.35370,0,5,6.000000
9016,9016,CHEMBL4549667,CN1C(=O)[C@@H](N2CCc3c(nn(Cc4ccccc4)c3Br)C2=O)...,active,481.350,3.11620,0,5,6.000000


In [10]:
# Keep only the SMILES string and the ChEMBL ID (in that order) and write them
# as a headerless, index-free, tab-separated file named molecule.smi.
# NOTE(review): presumably this is the input consumed by padel.sh — confirm in the script.
df_3class_selection = df_3class.loc[:, ['canonical_smiles', 'molecule_chembl_id']]
df_3class_selection.to_csv('molecule.smi', header=False, index=False, sep='\t')

### Calculating fingerprint descriptors

In [None]:
# Before running this cell, make sure a Java installation's bin directory has
# been added to the PATH environment variable.
# NOTE(review): presumably padel.sh launches PaDEL-Descriptor through `java` — confirm in the script.
! bash padel.sh

### Preparing predictor and response variables
#### Predictor

In [7]:
# Read the computed descriptor table and discard the 'Name' identifier column
# so that only the fingerprint columns remain as predictors.
df_3class_predictor = pd.read_csv('descriptors_output.csv').drop(columns=['Name'])
df_3class_predictor

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9013,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9014,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9015,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9016,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


#### Response

In [8]:
# The response variable is the pIC50 column of the bioactivity table.
df_3class_response = df_3class.loc[:, 'pIC50']
df_3class_response

0       7.387216
1       6.769551
2       5.031517
3       3.301030
4       2.522879
          ...   
9013    7.698970
9014    8.338187
9015    6.000000
9016    6.000000
9017    6.000000
Name: pIC50, Length: 9018, dtype: float64

In [9]:
# Combine predictors and response column-wise into one modelling table.
# pd.concat(axis=1) aligns rows on the index and silently fills mismatches with
# NaN, so fail loudly if the descriptor table and the pIC50 series do not share
# the same index (e.g. if the descriptor file has fewer or more rows).
# NOTE(review): this only checks index labels; it assumes descriptors_output.csv
# lists molecules in the same order as molecule.smi — confirm, since the 'Name'
# column that could verify this has already been dropped.
assert df_3class_predictor.index.equals(df_3class_response.index), (
    "Predictor and response indexes differ "
    f"({len(df_3class_predictor)} vs {len(df_3class_response)} rows); "
    "re-run the descriptor calculation before building df_ML."
)
df_ML = pd.concat([df_3class_predictor, df_3class_response], axis=1)
df_ML

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.387216
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.769551
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.031517
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,3.301030
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2.522879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9013,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.698970
9014,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.338187
9015,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.000000
9016,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.000000


In [10]:
# Save the combined fingerprint + pIC50 table to CSV without the row index.
df_ML.to_csv(path_or_buf='EGFR_bioactivity_data_pIC50_fp.csv', index=False)
     