# Preprocessing of the Raw downloaded data

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors

In [2]:
raw_data = pd.read_csv('datasets/raw_data.csv')
raw_data

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,82585,[],CHEMBL666794,Inhibition of Cytochrome P450 19A1,B,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,7.100
1,,,94540,[],CHEMBL666794,Inhibition of Cytochrome P450 19A1,B,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,50.000
2,,,112960,[],CHEMBL661700,In vitro inhibition of human Cytochrome P450 19A1,B,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.238
3,,,116766,[],CHEMBL661700,In vitro inhibition of human Cytochrome P450 19A1,B,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.057
4,,,118017,[],CHEMBL661700,In vitro inhibition of human Cytochrome P450 19A1,B,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2961,"{'action_type': 'INHIBITOR', 'description': 'N...",,24742461,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5118295,Inhibition of aromatase in human JEG-3 cells u...,B,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,nM,UO_0000065,,50.100
2962,,,24783443,[],CHEMBL5130158,Inhibition of human placental microsome CYP19 ...,A,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,10.000
2963,"{'action_type': 'INHIBITOR', 'description': 'N...",,24886565,[],CHEMBL5157477,Inhibition of aromatase (unknown origin),B,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.031
2964,"{'action_type': 'INHIBITOR', 'description': 'N...",,24886566,[],CHEMBL5157477,Inhibition of aromatase (unknown origin),B,,,BAO_0000190,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.042


In [3]:
# getting the column names of the dataframe
# data.info()
list(raw_data.columns)

['action_type',
 'activity_comment',
 'activity_id',
 'activity_properties',
 'assay_chembl_id',
 'assay_description',
 'assay_type',
 'assay_variant_accession',
 'assay_variant_mutation',
 'bao_endpoint',
 'bao_format',
 'bao_label',
 'canonical_smiles',
 'data_validity_comment',
 'data_validity_description',
 'document_chembl_id',
 'document_journal',
 'document_year',
 'ligand_efficiency',
 'molecule_chembl_id',
 'molecule_pref_name',
 'parent_molecule_chembl_id',
 'pchembl_value',
 'potential_duplicate',
 'qudt_units',
 'record_id',
 'relation',
 'src_id',
 'standard_flag',
 'standard_relation',
 'standard_text_value',
 'standard_type',
 'standard_units',
 'standard_upper_value',
 'standard_value',
 'target_chembl_id',
 'target_organism',
 'target_pref_name',
 'target_tax_id',
 'text_value',
 'toid',
 'type',
 'units',
 'uo_units',
 'upper_value',
 'value']

In [4]:
# select only required columns

data = raw_data[['canonical_smiles', 'molecule_chembl_id', 'standard_value']]
data

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_value
0,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,CHEMBL341591,7100.0
1,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,CHEMBL2111947,50000.0
2,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,CHEMBL431859,238.0
3,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,CHEMBL113637,57.0
4,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,CHEMBL112021,54.0
...,...,...,...
2961,C=C1C[C@@H]2[C@H](CC[C@]3(C)C(=O)CC[C@@H]23)[C...,CHEMBL1200374,50.1
2962,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...,CHEMBL5184829,10000.0
2963,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,CHEMBL5176279,31.0
2964,COc1ccc2cc(/C=N/NC3=NC(=O)CS3)ccc2c1,CHEMBL5177928,42.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2966 entries, 0 to 2965
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   canonical_smiles    2966 non-null   object 
 1   molecule_chembl_id  2966 non-null   object 
 2   standard_value      2890 non-null   float64
dtypes: float64(1), object(2)
memory usage: 69.6+ KB


In [6]:
data['standard_value'].describe()

count    2.890000e+03
mean     2.173576e+10
std      7.235369e+11
min      0.000000e+00
25%      1.000000e+02
50%      1.000000e+03
75%      7.370000e+03
max      3.388442e+13
Name: standard_value, dtype: float64

## Handling Missing Values

In [7]:
handled_standard_value = data[data['standard_value'].notna()]
handled_standard_value

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_value
0,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,CHEMBL341591,7100.0
1,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,CHEMBL2111947,50000.0
2,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,CHEMBL431859,238.0
3,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,CHEMBL113637,57.0
4,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,CHEMBL112021,54.0
...,...,...,...
2961,C=C1C[C@@H]2[C@H](CC[C@]3(C)C(=O)CC[C@@H]23)[C...,CHEMBL1200374,50.1
2962,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...,CHEMBL5184829,10000.0
2963,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,CHEMBL5176279,31.0
2964,COc1ccc2cc(/C=N/NC3=NC(=O)CS3)ccc2c1,CHEMBL5177928,42.0


In [8]:
handled_canonical_smiles = handled_standard_value[handled_standard_value['canonical_smiles'].notna()]
handled_canonical_smiles

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_value
0,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,CHEMBL341591,7100.0
1,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,CHEMBL2111947,50000.0
2,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,CHEMBL431859,238.0
3,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,CHEMBL113637,57.0
4,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,CHEMBL112021,54.0
...,...,...,...
2961,C=C1C[C@@H]2[C@H](CC[C@]3(C)C(=O)CC[C@@H]23)[C...,CHEMBL1200374,50.1
2962,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...,CHEMBL5184829,10000.0
2963,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,CHEMBL5176279,31.0
2964,COc1ccc2cc(/C=N/NC3=NC(=O)CS3)ccc2c1,CHEMBL5177928,42.0


## Check for Duplicate Values

In [9]:
len(handled_canonical_smiles['molecule_chembl_id'].unique())
len(handled_canonical_smiles['canonical_smiles'].unique())

2116

Duplicates values are there, drop duplicates values

In [10]:
data_nr = handled_canonical_smiles.drop_duplicates(['canonical_smiles'])
data_nr

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_value
0,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,CHEMBL341591,7100.00
1,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,CHEMBL2111947,50000.00
2,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,CHEMBL431859,238.00
3,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,CHEMBL113637,57.00
4,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,CHEMBL112021,54.00
...,...,...,...
2955,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,CHEMBL5203413,0.09
2956,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,CHEMBL5179009,0.72
2962,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...,CHEMBL5184829,10000.00
2963,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,CHEMBL5176279,31.00


In [11]:
len(data_nr['molecule_chembl_id'].unique())

2116

In [12]:
smiles = []
for smile in data_nr['canonical_smiles'].tolist():
    cpd = str(smile).split('.')
    cpd_max = max(cpd, key=len)
    smiles.append(cpd_max)
data_nr['clean_smiles'] = smiles
data_nr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_nr['clean_smiles'] = smiles


Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_value,clean_smiles
0,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,CHEMBL341591,7100.00,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12
1,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,CHEMBL2111947,50000.00,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...
2,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,CHEMBL431859,238.00,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21
3,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,CHEMBL113637,57.00,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21
4,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,CHEMBL112021,54.00,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21
...,...,...,...,...
2955,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,CHEMBL5203413,0.09,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1
2956,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,CHEMBL5179009,0.72,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1
2962,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...,CHEMBL5184829,10000.00,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...
2963,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,CHEMBL5176279,31.00,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1


In [13]:
data_nr.isna().value_counts()

canonical_smiles  molecule_chembl_id  standard_value  clean_smiles
False             False               False           False           2116
dtype: int64

In [14]:
data_nr = data_nr.drop(columns=['canonical_smiles'])
data_nr

Unnamed: 0,molecule_chembl_id,standard_value,clean_smiles
0,CHEMBL341591,7100.00,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12
1,CHEMBL2111947,50000.00,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...
2,CHEMBL431859,238.00,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21
3,CHEMBL113637,57.00,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21
4,CHEMBL112021,54.00,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21
...,...,...,...
2955,CHEMBL5203413,0.09,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1
2956,CHEMBL5179009,0.72,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1
2962,CHEMBL5184829,10000.00,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...
2963,CHEMBL5176279,31.00,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1


In [15]:
data_nr.rename(columns={'clean_smiles': 'canonical_smiles'}, inplace=True)
data_nr

Unnamed: 0,molecule_chembl_id,standard_value,canonical_smiles
0,CHEMBL341591,7100.00,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12
1,CHEMBL2111947,50000.00,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...
2,CHEMBL431859,238.00,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21
3,CHEMBL113637,57.00,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21
4,CHEMBL112021,54.00,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21
...,...,...,...
2955,CHEMBL5203413,0.09,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1
2956,CHEMBL5179009,0.72,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1
2962,CHEMBL5184829,10000.00,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...
2963,CHEMBL5176279,31.00,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1


In [16]:
data_nr.to_csv('datasets/non_redundant_preprocessed_dataset.csv', index=False)

In [17]:
data_nr['standard_value'].max()

777000.0

In [19]:
data_nr['standard_value'].min()

0.0

In [20]:
# delete columns containing 0.0 as standard_value
data_nr = data_nr[data_nr['standard_value'] != float(0)]
data_nr

Unnamed: 0,molecule_chembl_id,standard_value,canonical_smiles,pIC50
0,CHEMBL341591,7100.00,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,5.148742
1,CHEMBL2111947,50000.00,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,4.301030
2,CHEMBL431859,238.00,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,6.623423
3,CHEMBL113637,57.00,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,7.244125
4,CHEMBL112021,54.00,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,7.267606
...,...,...,...,...
2955,CHEMBL5203413,0.09,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,10.045757
2956,CHEMBL5179009,0.72,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,9.142668
2962,CHEMBL5184829,10000.00,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...,5.000000
2963,CHEMBL5176279,31.00,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,7.508638


### Convert standard_value (IC50) to pIC50

In [21]:
pIC50 = []
for row in data_nr['standard_value']:
    molar = row * (10**-9)
    pIC50.append(-np.log10(molar))
data_nr['pIC50'] = pIC50
data_nr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_nr['pIC50'] = pIC50


Unnamed: 0,molecule_chembl_id,standard_value,canonical_smiles,pIC50
0,CHEMBL341591,7100.00,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,5.148742
1,CHEMBL2111947,50000.00,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,4.301030
2,CHEMBL431859,238.00,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,6.623423
3,CHEMBL113637,57.00,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,7.244125
4,CHEMBL112021,54.00,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,7.267606
...,...,...,...,...
2955,CHEMBL5203413,0.09,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,10.045757
2956,CHEMBL5179009,0.72,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,9.142668
2962,CHEMBL5184829,10000.00,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...,5.000000
2963,CHEMBL5176279,31.00,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,7.508638


In [24]:
data_nr['pIC50'].describe()

count    2115.000000
mean        6.064030
std         1.306238
min         3.109579
25%         5.070581
50%         5.920819
75%         6.911954
max        10.823909
Name: pIC50, dtype: float64

### Calculate Lipinski's RO5 for the dataset

In [26]:
mol_data = []
for smile in data_nr['canonical_smiles']:
    molecule = Chem.MolFromSmiles(smile)
    mol_data.append(molecule)

base_data = np.arange(1,1)
i=0
for molecule in mol_data:
    molecularWt = Descriptors.MolWt(molecule)
    cLogP = Descriptors.MolLogP(molecule)
    HAcceptors = Descriptors.NumHAcceptors(molecule)
    HDonors = Descriptors.NumHDonors(molecule)
    
    data = np.array([molecularWt, cLogP, HAcceptors, HDonors])
    if(i==0):
        base_data = data
    else:
        base_data = np.vstack([base_data, data])
    i = i + 1

descriptors = pd.DataFrame(data = base_data, columns=['mol_Wt', 'cLogP', 'H_Acceptors', 'H_Donors'])
descriptors


Unnamed: 0,mol_Wt,cLogP,H_Acceptors,H_Donors
0,329.528,4.28820,2.0,2.0
1,315.501,3.89810,2.0,2.0
2,412.306,5.70542,3.0,0.0
3,319.383,4.63450,3.0,0.0
4,321.811,4.58780,3.0,0.0
...,...,...,...,...
2110,368.396,3.93578,6.0,0.0
2111,382.423,4.32588,6.0,0.0
2112,373.416,3.11580,5.0,2.0
2113,369.446,3.85650,7.0,1.0


In [27]:
## Join two dataframe
data_nr.reset_index(inplace=True, drop=True)
descriptors.reset_index(inplace=True, drop=True)

lipinski_df = pd.concat([data_nr, descriptors], axis=1)
lipinski_df

Unnamed: 0,molecule_chembl_id,standard_value,canonical_smiles,pIC50,mol_Wt,cLogP,H_Acceptors,H_Donors
0,CHEMBL341591,7100.00,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,5.148742,329.528,4.28820,2.0,2.0
1,CHEMBL2111947,50000.00,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,4.301030,315.501,3.89810,2.0,2.0
2,CHEMBL431859,238.00,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,6.623423,412.306,5.70542,3.0,0.0
3,CHEMBL113637,57.00,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,7.244125,319.383,4.63450,3.0,0.0
4,CHEMBL112021,54.00,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,7.267606,321.811,4.58780,3.0,0.0
...,...,...,...,...,...,...,...,...
2110,CHEMBL5203413,0.09,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,10.045757,368.396,3.93578,6.0,0.0
2111,CHEMBL5179009,0.72,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,9.142668,382.423,4.32588,6.0,0.0
2112,CHEMBL5184829,10000.00,O=C(Nc1ccc2[nH]ncc2c1)[C@]12ON=C(c3cccnc3)[C@H...,5.000000,373.416,3.11580,5.0,2.0
2113,CHEMBL5176279,31.00,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,7.508638,369.446,3.85650,7.0,1.0


In [29]:
lipinski_df.to_csv('datasets/lipinski_dataset.csv', index=False)

### Calculate PubChem fingerprints

In [30]:
smiles_df = lipinski_df[['canonical_smiles', 'molecule_chembl_id']]
smiles_df.to_csv('molecule.smi', sep='\t', index=False, header=False)

In [31]:
# run padel to calculate fingerprints
!bash padel/padel.sh
# output file will be saved in 'descriptors_output.csv'

Processing CHEMBL2111947 in molecule.smi (2/2115). 
Processing CHEMBL431859 in molecule.smi (3/2115). 
Processing CHEMBL341591 in molecule.smi (1/2115). 
Processing CHEMBL113637 in molecule.smi (4/2115). 
Processing CHEMBL112021 in molecule.smi (5/2115). 
Processing CHEMBL324070 in molecule.smi (6/2115). 
Processing CHEMBL41761 in molecule.smi (7/2115). 
Processing CHEMBL111868 in molecule.smi (8/2115). 
Processing CHEMBL112074 in molecule.smi (10/2115). Average speed: 0.84 s/mol.
Processing CHEMBL111888 in molecule.smi (9/2115). Average speed: 1.59 s/mol.
Processing CHEMBL324326 in molecule.smi (11/2115). Average speed: 0.57 s/mol.
Processing CHEMBL37321 in molecule.smi (12/2115). Average speed: 0.44 s/mol.
Processing CHEMBL353068 in molecule.smi (13/2115). Average speed: 0.46 s/mol.
Processing CHEMBL41066 in molecule.smi (14/2115). Average speed: 0.33 s/mol.
Processing CHEMBL166709 in molecule.smi (15/2115). Average speed: 0.35 s/mol.
Processing CHEMBL424556 in molecule.smi (16/2115)

In [32]:
fingerprint = pd.read_csv('descriptors_output.csv')
fingerprint

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL112021,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL113637,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL111868,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL41761,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL431859,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2110,CHEMBL5203413,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2111,CHEMBL5177928,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2112,CHEMBL5179009,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2113,CHEMBL5176279,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
fingerprint.rename(columns={'Name' : 'molecule_chembl_id'}, inplace=True)
fingerprint

Unnamed: 0,molecule_chembl_id,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL112021,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL113637,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL111868,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL41761,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL431859,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2110,CHEMBL5203413,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2111,CHEMBL5177928,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2112,CHEMBL5179009,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2113,CHEMBL5176279,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Join two dataframes
lipinski_df.reset_index(inplace=True, drop=True)
fingerprint.reset_index(inplace=True, drop=True)

fingerprint_dataset = pd.merge(lipinski_df, fingerprint, on=['molecule_chembl_id'], how='right')
fingerprint_dataset

Unnamed: 0,molecule_chembl_id,standard_value,canonical_smiles,pIC50,mol_Wt,cLogP,H_Acceptors,H_Donors,PubchemFP0,PubchemFP1,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL112021,54.00,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,7.267606,321.811,4.58780,3.0,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL113637,57.00,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,7.244125,319.383,4.63450,3.0,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL111868,78.50,Cn1cc(C(c2ccc(F)cc2)n2ccnc2)c2cc(Br)ccc21,7.105130,384.252,4.91410,3.0,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL41761,41.00,CCn1ccc2cc(C(c3ccc(F)cc3)n3ccnc3)ccc21,7.387216,319.383,4.63450,3.0,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL431859,238.00,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,6.623423,412.306,5.70542,3.0,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2110,CHEMBL5203413,0.09,CC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,10.045757,368.396,3.93578,6.0,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
2111,CHEMBL5177928,42.00,COc1ccc2cc(/C=N/NC3=NC(=O)CS3)ccc2c1,7.376751,299.355,2.40130,5.0,1.0,1,1,...,0,0,0,0,0,0,0,0,0,0
2112,CHEMBL5179009,0.72,CCC#CCOc1ccc2cc(C(c3ccc(C#N)cc3)n3cncn3)oc2c1,9.142668,382.423,4.32588,6.0,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
2113,CHEMBL5176279,31.00,CCOC(=O)Cc1csc(N/N=C/c2ccc3cc(OC)ccc3c2)n1,7.508638,369.446,3.85650,7.0,1.0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [35]:
fingerprint_dataset.to_csv('datasets/final_dataset.csv', index=False)