# FYP - Drug Design Active Learning : Dataset Generation

A simple script that pre-processes the dirty data from the three given datasets and merges them into a single table for use in later steps.

In [1]:
from dask.distributed import Client, progress
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
import time

from rdkit import Chem
from rdkit.Chem import RDKFingerprint
from rdkit.Chem import rdMolDescriptors as rdmd

from sklearn.preprocessing import LabelEncoder

## Parsing the dataset

In [2]:
# Wall-clock start; the final cell subtracts it to report total runtime.
start = time.time()

# All three input tables sit in <parent of working directory>/data/datasets/.
_dataset_dir = '{}/data/datasets/'.format(os.path.dirname(os.getcwd()))
path_1 = _dataset_dir + 'ci9b00375_si_001.txt'
path_tableS2 = _dataset_dir + 'ci9b00375_si_002.txt'
path_tableS3 = _dataset_dir + 'ci9b00375_si_003_modified.txt'

In [3]:
# Parameters.
# `fingerprint` selects the fingerprint type and is embedded in every output path;
# `radius` is presumably the intended Morgan radius (verify against the fingerprint cell).
fingerprint, radius = 'morgan', 2

In [4]:
# Lazily read the tab-separated per-assay table. Malformed lines produce a
# warning instead of an error, and rows with any missing value are dropped.
df_table1 = dd.read_csv(
    path_1,
    sep='\t',
    dtype={'#cmpd': np.int64},
    on_bad_lines='warn',
).dropna()

In [5]:
# Preview the first rows. dask's head() pulls them from the first partition,
# so the whole table is not materialised just to display five rows.
df_table1.head()

Unnamed: 0,assay_id,#cmpd,assay_type,target_family
0,157,65,B,Ion channel
1,517,70,F,Phenotypic
2,535,69,B,Phenotypic
3,831,55,B,GPCR
4,924,81,B,GPCR


In [6]:
# Lazily read the five columns of interest as strings from the space-separated
# file; its first 46 lines are skipped and malformed lines only raise a warning.
df_table2 = dd.read_csv(
    path_tableS2,
    sep=' ',
    skiprows=46,
    skip_blank_lines=True,
    usecols=['CompoundID', 'AssayID', 'expt_pIC50', 'max2‐pQSAR_pIC50', 'Clustering'],
    dtype=str,
    on_bad_lines='warn',
)

# Replace U+2010 (HYPHEN) with ASCII '-' before the float conversion --
# presumably the file encodes minus signs that way (TODO confirm).
for pic50_column in ('expt_pIC50', 'max2‐pQSAR_pIC50'):
    ascii_text = df_table2[pic50_column].str.replace('\u2010', '-', regex=False)
    df_table2[pic50_column] = ascii_text.astype(np.float32)

# Drop incomplete rows first so the integer cast of AssayID cannot hit NaN,
# then align the key name with the per-assay table.
df_table2 = (
    df_table2
    .dropna()
    .astype({'AssayID': np.int64})
    .rename(columns={'AssayID': 'assay_id'})
)

In [7]:
# Preview the first rows. dask's head() pulls them from the first partition,
# so the ~1.4M-row table is not materialised just to display five rows.
df_table2.head()

Unnamed: 0,CompoundID,assay_id,expt_pIC50,max2‐pQSAR_pIC50,Clustering
0,CHEMBL2323521,941825,4.257,4.5689,TRN
1,CHEMBL178687,941825,4.2477,4.4279,TRN
2,CHEMBL2323490,941825,4.2999,4.1577,TRN
3,CHEMBL2323493,941825,4.8162,5.0078,TRN
4,CHEMBL2323484,941825,3.0,3.1545,TRN


In [8]:
# Lazily read the comma-separated CompoundID -> SMILES table.
df_table3 = dd.read_csv(path_tableS3, sep=',')

In [9]:
# Preview the first rows. dask's head() pulls them from the first partition,
# so the whole table is not materialised just to display five rows.
df_table3.head()

Unnamed: 0,CompoundID,SMILES
0,CHEMBL35001,COC(=O)Nc1ccc(Cl)c(-c2nc3cc(C)ccc3o2)c1
1,CHEMBL35211,COC(=O)Nc1ccc(OC)c(-c2nc3cc(C)ccc3o2)c1
2,CHEMBL35333,COC(=O)Nc1ccc(Cl)c(-c2nc3ccc(C)cc3n2C)c1
3,CHEMBL35967,C#CCNC(=O)Nc1ccc(Cl)c(-c2nc3cc(Cl)ccc3o2)c1
4,CHEMBL35458,COC(=O)Nc1ccc(Cl)c(-c2nc3ccccc3o2)c1


In [10]:
# Summarise only the rows whose CompoundID contains 'CHEM'. Comparing with
# True turns a missing match result (NaN) into False.
mentions_chem = df_table3['CompoundID'].str.contains('CHEM') == True
df_table3[mentions_chem].compute().describe()

Unnamed: 0,CompoundID,SMILES
count,496946,496946
unique,496946,496946
top,CHEMBL35001,COC(=O)Nc1ccc(Cl)c(-c2nc3cc(C)ccc3o2)c1
freq,1,1


In [11]:
# Left join: every activity row keeps its place and gains the SMILES string
# of its compound (missing when the compound is absent from df_table3).
temp_df = df_table2.merge(df_table3, how='left', on='CompoundID')

In [12]:
# Preview the merged rows. dask's head() returns the leading rows of the first
# partition instead of materialising the full merged table for display.
temp_df.head()

Unnamed: 0,CompoundID,assay_id,expt_pIC50,max2‐pQSAR_pIC50,Clustering,SMILES
0,CHEMBL2323521,941825,4.257,4.5689,TRN,COc1ccc2c(c1)c(C)c(CCC(=O)NS(=O)(=O)c1ccc(C(C)...
1,CHEMBL178687,941825,4.2477,4.4279,TRN,COc1ccc2c(c1)c(CCC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1
2,CHEMBL2323490,941825,4.2999,4.1577,TRN,COc1ccc2c(c1)c(CC(=O)NS(=O)(=O)C(F)(F)F)cn2C(=...
3,CHEMBL2323493,941825,4.8162,5.0078,TRN,COc1ccc2c(c1)c(CC(=O)NS(=O)(=O)c1ccccc1C)cn2C(...
4,CHEMBL2323484,941825,3.0,3.1545,TRN,COC(=O)Cc1cn(C(=O)c2ccc(Cl)cc2)c2ccc(OC)cc12


In [13]:
# List rows whose SMILES, viewed as text, contains the substring 'Na'.
smiles_mentions_na = temp_df['SMILES'].astype(str).str.contains('Na') == True
temp_df[smiles_mentions_na].compute()

Unnamed: 0,CompoundID,assay_id,expt_pIC50,max2‐pQSAR_pIC50,Clustering,SMILES


In [14]:
# Materialise the whole merged frame as pandas and display it; the notebook
# view elides the middle rows.
temp_df.compute()

Unnamed: 0,CompoundID,assay_id,expt_pIC50,max2‐pQSAR_pIC50,Clustering,SMILES
0,CHEMBL2323521,941825,4.2570,4.5689,TRN,COc1ccc2c(c1)c(C)c(CCC(=O)NS(=O)(=O)c1ccc(C(C)...
1,CHEMBL178687,941825,4.2477,4.4279,TRN,COc1ccc2c(c1)c(CCC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1
2,CHEMBL2323490,941825,4.2999,4.1577,TRN,COc1ccc2c(c1)c(CC(=O)NS(=O)(=O)C(F)(F)F)cn2C(=...
3,CHEMBL2323493,941825,4.8162,5.0078,TRN,COc1ccc2c(c1)c(CC(=O)NS(=O)(=O)c1ccccc1C)cn2C(...
4,CHEMBL2323484,941825,3.0000,3.1545,TRN,COC(=O)Cc1cn(C(=O)c2ccc(Cl)cc2)c2ccc(OC)cc12
...,...,...,...,...,...,...
1368495,CHEMBL3577503,1496999,5.1367,5.0442,TRN,CCN(CCCOc1ccc(-c2cc(=O)c3c(OC)c(OC)c(OC)cc3o2)...
1368496,CHEMBL3577545,1496999,5.8861,6.0016,TRN,CCN(CCCCCCOc1cccc(-c2cc(=O)c3c(OC)c(OC)c(OC)cc...
1368497,CHEMBL3577504,1496999,5.0655,5.1974,TRN,COc1ccccc1CN(C)CCCOc1ccc(-c2cc(=O)c3c(OC)c(OC)...
1368498,CHEMBL3577518,1496999,4.5768,4.2350,TST,COc1cc2oc(-c3ccc(OCCCCCCN4CCN(Cc5ccccc5)CC4)cc...


In [15]:
# Left-join the per-assay columns on assay_id and lower-case the SMILES column
# name. NOTE: this rebinds temp_df from itself, so re-running the cell alone
# would merge the assay columns a second time.
temp_df = (
    temp_df
    .merge(df_table1, how='left', on='assay_id')
    .rename(columns={'SMILES': 'smiles'})
)

In [16]:
# Preview the fully merged rows. dask's head() returns the leading rows of the
# first partition instead of materialising the full merged table for display.
temp_df.head()

Unnamed: 0,CompoundID,assay_id,expt_pIC50,max2‐pQSAR_pIC50,Clustering,smiles,#cmpd,assay_type,target_family
0,CHEMBL2323521,941825,4.257,4.5689,TRN,COc1ccc2c(c1)c(C)c(CCC(=O)NS(=O)(=O)c1ccc(C(C)...,58,B,Phenotypic
1,CHEMBL178687,941825,4.2477,4.4279,TRN,COc1ccc2c(c1)c(CCC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1,58,B,Phenotypic
2,CHEMBL2323490,941825,4.2999,4.1577,TRN,COc1ccc2c(c1)c(CC(=O)NS(=O)(=O)C(F)(F)F)cn2C(=...,58,B,Phenotypic
3,CHEMBL2323493,941825,4.8162,5.0078,TRN,COc1ccc2c(c1)c(CC(=O)NS(=O)(=O)c1ccccc1C)cn2C(...,58,B,Phenotypic
4,CHEMBL2323484,941825,3.0,3.1545,TRN,COC(=O)Cc1cn(C(=O)c2ccc(Cl)cc2)c2ccc(OC)cc12,58,B,Phenotypic


In [None]:
# One row per compound: CompoundID and its SMILES string.
# Select the two columns *before* compute() so dask only materialises what is
# needed rather than every column of the merged table; rows, index and the
# first-occurrence de-duplication are unchanged.
molecular_space = (
    temp_df[['CompoundID', 'smiles']]
    .compute()
    .drop_duplicates(subset='CompoundID')
)
molecular_space.head()

### Assay-Wise File Generation

In [None]:
# From here on temp_df is an in-memory pandas frame ordered by assay_id.
# NOTE: because the name is rebound from dask to pandas, re-running this cell
# alone fails (a pandas DataFrame has no .compute()).
temp_df = temp_df.compute()
temp_df = temp_df.sort_values('assay_id')
temp_df.head()

In [None]:
# Distinct assay ids in order of first appearance, and how many there are.
unique_assays = pd.unique(temp_df['assay_id'])
print(unique_assays.size)

In [None]:
def _squared_pearson(assay_rows):
    """Return the squared Pearson correlation of measured vs. predicted pIC50.

    Parameters
    ----------
    assay_rows : pandas.DataFrame
        Rows providing the 'expt_pIC50' and 'max2‐pQSAR_pIC50' columns.

    Returns
    -------
    float
        Squared off-diagonal entry of ``np.corrcoef``. NaN when fewer than two
        rows are available, because the correlation is undefined there
        (``np.corrcoef`` would also yield NaN, but with runtime warnings).

    Raises
    ------
    KeyError
        If either pIC50 column is missing.
    """
    if len(assay_rows) < 2:
        return np.nan
    correlation = np.corrcoef(assay_rows['expt_pIC50'], assay_rows['max2‐pQSAR_pIC50'])
    return correlation[0, 1] ** 2


pc_values_trn = []
pc_values_tst = []

# Single pass over the table instead of one full boolean scan per assay.
# sort=False yields groups in order of first appearance, i.e. the same order
# as temp_df['assay_id'].unique(), so both lists stay aligned with unique_assays.
for _, assay_rows in temp_df.groupby('assay_id', sort=False):
    split_label = assay_rows['Clustering']
    pc_values_trn.append(_squared_pearson(assay_rows.loc[split_label == 'TRN']))
    pc_values_tst.append(_squared_pearson(assay_rows.loc[split_label == 'TST']))

    if len(pc_values_trn) % 500 == 0:
        print("{} assay_id's traversed".format(len(pc_values_trn)))

# A KeyError (missing column) is deliberately left to propagate: swallowing it
# would leave truncated lists that a later column-wise concat pads with NaN,
# which is indistinguishable from a genuinely undefined correlation.
assert len(pc_values_trn) == len(pc_values_tst) == len(unique_assays), \
    'per-assay results are not aligned with unique_assays'

In [None]:
# Place the assay ids and the two squared-Pearson lists side by side,
# one row per assay (columns are aligned by position).
pearson_parts = [
    pd.DataFrame(unique_assays, columns=['assay_id']),
    pd.DataFrame(pc_values_trn, columns=['squared_pearson_trn']),
    pd.DataFrame(pc_values_tst, columns=['squared_pearson_tst']),
]
df_pc = pd.concat(pearson_parts, axis=1)
df_pc.head()

In [None]:
# Split on whether the test-set coefficient is missing (the training-set
# column is not inspected). NOTE: df_pc is rebound to the non-null part, so
# re-running this cell alone yields an empty df_nan.
tst_is_missing = df_pc['squared_pearson_tst'].isnull()
df_nan = df_pc.loc[tst_is_missing]
df_pc = df_pc.loc[~tst_is_missing]
df_nan

## Saving the preprocessed data onto drive

In [None]:
def _write_csv_and_parquet(frame, directory, stem, failure_template):
    """Write `frame` as `<directory><stem>.csv` and `<directory><stem>.parquet`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to persist; the index is not written.
    directory : str
        Target directory, including the trailing slash. It is created together
        with any missing parents.
    stem : str
        File name without extension.
    failure_template : str
        Message printed when a write raises FileNotFoundError; '{}' is replaced
        by the extension ('.csv' or '.parquet').
    """
    # makedirs(exist_ok=True) also creates missing parents (plain mkdir fails
    # when '../data/data_temp' does not exist yet) and avoids the
    # check-then-create race of isdir() followed by mkdir().
    os.makedirs(directory, exist_ok=True)
    for extension, write in (('.csv', frame.to_csv), ('.parquet', frame.to_parquet)):
        try:
            write(directory + stem + extension, index=False)
        except FileNotFoundError:
            # Best effort, as before: report and continue with the next file.
            print(failure_template.format(extension))


output_root = '../data/data_temp/default_{}/'.format(fingerprint)
os.makedirs(output_root, exist_ok=True)

# Saving the complete preprocessed file
_write_csv_and_parquet(temp_df, output_root + 'preprocessed/', 'pre_processed_file',
                       'Storage File "{}" unable to be written')

# Saving the molecular space file
_write_csv_and_parquet(molecular_space, output_root + 'mol_space/', 'molecular_space_file',
                       'Molecular Space "{}" unable to be written')

# Saving the assay_id file
_write_csv_and_parquet(df_pc, output_root + 'assay_id/', 'assay_id_file',
                       'Assay ID list "{}" unable to be written')

# Saving the assay_id file for assays whose test-set coefficient is null
_write_csv_and_parquet(df_nan, output_root + 'assay_id/', 'assay_id_null_file',
                       'Assay ID list "{}" with null pearon coefficients is unable to be written')


## Deleting Unnecessary variables

In [None]:
# Release the large intermediates; everything needed later is reloaded from disk.
del (temp_df, molecular_space, df_pc, df_nan,
     pc_values_trn, pc_values_tst,
     df_table1, df_table2, df_table3)

## Saving the processed molecular fingerprints

In [25]:
# Reload the CompoundID/smiles table saved earlier and report its shape.
molecular_space_path = '../data/data_temp/default_{}/mol_space/molecular_space_file.parquet'.format(fingerprint)
molecular_space = pd.read_parquet(molecular_space_path)
molecular_space.shape

(496946, 2)

In [16]:
# Fail fast on an unsupported setting. Previously a non-'morgan' value either
# raised NameError on the first row or silently re-appended a stale
# `temp_fingerprint` left over from an earlier run.
if fingerprint != 'morgan':
    raise ValueError("Unsupported fingerprint type: {!r} (only 'morgan' is implemented)".format(fingerprint))

fingerprint_df = []

for row in molecular_space.itertuples():
    mol = Chem.MolFromSmiles(row.smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse. Skipping the row would
        # break the positional alignment with molecular_space['CompoundID'],
        # so stop with a message naming the offending compound.
        raise ValueError('Could not parse SMILES for {}: {!r}'.format(row.CompoundID, row.smiles))

    # Use the notebook-level `radius` parameter instead of a hard-coded 2.
    temp_fingerprint = rdmd.GetMorganFingerprintAsBitVect(mol, radius=radius)
    # One integer entry per fingerprint bit.
    fingerprint_df.append(np.array(temp_fingerprint))

    if len(fingerprint_df) % 100000 == 0:
        print('{0} {1} fingerprints have been calculated'.format(len(fingerprint_df), fingerprint))
print('x-----x-----x')

100000 morgan fingerprints have been calculated
200000 morgan fingerprints have been calculated
300000 morgan fingerprints have been calculated
400000 morgan fingerprints have been calculated
x-----x-----x


In [5]:
# One column label per fingerprint bit, e.g. 'smiles_morgan_0'. If the
# fingerprints were not computed in this session the list stays empty and a
# notice is printed.
smiles_column_names = []
try:
    smiles_column_names = [
        'smiles_{}_{}'.format(fingerprint, bit_index)
        for bit_index in range(len(fingerprint_df[0]))
    ]
except NameError:
    print('Name Error retrieved')

Name Error retrieved


In [6]:
# Build the CompoundID + fingerprint-bit table from this session's results,
# or fall back to the parquet file written by an earlier successful run.
# `read_existing_file` records which path was taken.
read_existing_file = False

try:
    temp_df = pd.concat(
        [molecular_space['CompoundID'], pd.DataFrame(fingerprint_df, columns=smiles_column_names)],
        axis=1,
    )
except (NameError, MemoryError) as build_error:
    # NameError: the fingerprints were not computed in this session.
    # MemoryError: the dense bit table did not fit in memory.
    if isinstance(build_error, NameError):
        print('Ran into NameError issues. Trying to read from a previously successful run')
    else:
        print('Ran into memory issues. Trying to read from a previously successful run')

    # The missing-file case must be handled here: an exception raised inside
    # an `except` handler is not caught by sibling `except` clauses of the
    # same `try`, so the old top-level FileNotFoundError clause never fired.
    try:
        temp_df = pd.read_parquet('../data/data_temp/default_{0}/fingerprint/{0}_fingerprint_file.parquet'.format(fingerprint))
    except FileNotFoundError:
        print('Previously created {} fingerprint file is absent'.format(fingerprint))
        raise  # nothing to continue with: temp_df could not be produced
    print('Loading file completed')
    read_existing_file = True

Ran into NameError issues. Trying to read from a previously successful run
Loading file completed


In [7]:
# Preview the fingerprint table: CompoundID followed by one column per fingerprint bit.
temp_df.head()

Unnamed: 0,CompoundID,smiles_morgan_0,smiles_morgan_1,smiles_morgan_2,smiles_morgan_3,smiles_morgan_4,smiles_morgan_5,smiles_morgan_6,smiles_morgan_7,smiles_morgan_8,...,smiles_morgan_2038,smiles_morgan_2039,smiles_morgan_2040,smiles_morgan_2041,smiles_morgan_2042,smiles_morgan_2043,smiles_morgan_2044,smiles_morgan_2045,smiles_morgan_2046,smiles_morgan_2047
0,CHEMBL2323521,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL178687,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL2323490,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL2323493,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL2323484,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Persist the fingerprint table only when it was computed in this session;
# a table that was itself loaded from disk is not written back.
if read_existing_file:
    print('No file was written to')
else:
    # makedirs(exist_ok=True) also creates missing parent directories and
    # avoids the isdir()/mkdir() check-then-create race.
    os.makedirs('../data/data_temp/default_{}/fingerprint/'.format(fingerprint), exist_ok=True)

    try:
        temp_df.to_csv('../data/data_temp/default_{0}/fingerprint/{0}_fingerprint_file.csv'.format(fingerprint), index=False)
        temp_df.to_parquet('../data/data_temp/default_{0}/fingerprint/{0}_fingerprint_file.parquet'.format(fingerprint), index=False)
    except FileNotFoundError:
        print('SMILES {} list ".parquet" and ".csv" unable to be written'.format(fingerprint))

No file was written to


In [9]:
# Reload the merged activity table saved earlier and report its shape.
preprocessed_path = '../data/data_temp/default_{}/preprocessed/pre_processed_file.parquet'.format(fingerprint)
preprocessed_df = pd.read_parquet(preprocessed_path)
preprocessed_df.shape

(1368500, 9)

In [10]:
# Attach the fingerprint columns to every activity row. No key is passed, so
# pandas joins on the column names the two frames share.
preprocessed_df = pd.merge(preprocessed_df, temp_df, how='inner')
preprocessed_df.head()

Unnamed: 0,CompoundID,assay_id,expt_pIC50,max2‐pQSAR_pIC50,Clustering,smiles,#cmpd,assay_type,target_family,smiles_morgan_0,...,smiles_morgan_2038,smiles_morgan_2039,smiles_morgan_2040,smiles_morgan_2041,smiles_morgan_2042,smiles_morgan_2043,smiles_morgan_2044,smiles_morgan_2045,smiles_morgan_2046,smiles_morgan_2047
0,CHEMBL305106,157,11.0,10.7912,TRN,O=c1c(Br)ccc2n1CC1CNCC2C1,65,B,Ion channel,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL292030,157,7.51,7.6117,TRN,C1=C(c2cnccn2)CC2CCC1N2,65,B,Ion channel,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL64138,157,9.85,9.5645,TRN,C1=C(c2cncnc2)C2CCC(CC1)N2,65,B,Ion channel,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL497939,157,9.91,9.6467,TRN,O=c1cccc2n1CC1CNCC2C1,65,B,Ion channel,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL497939,688155,5.5,5.2053,TST,O=c1cccc2n1CC1CNCC2C1,11204,F,Phenotypic,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Persist the final table (activity rows + fingerprint bits) as parquet and feather.
# Make sure the target directory exists so a missing folder cannot cause the failure.
os.makedirs('../data/data_temp/default_{0}/'.format(fingerprint), exist_ok=True)

try:
    preprocessed_df.to_parquet('../data/data_temp/default_{0}/complete_file_{0}.parquet'.format(fingerprint), index=False)
    preprocessed_df.to_feather('../data/data_temp/default_{0}/complete_file_{0}.feather'.format(fingerprint))
    print('Writing to File(s) was successful')
except FileNotFoundError:
    # Message corrected: this cell writes ".parquet" and ".feather", not ".csv".
    print('SMILES {} list ".parquet" and ".feather" unable to be written'.format(fingerprint))

Writing to File(s) was successful


In [None]:
# Seconds elapsed since `start` was recorded in the first code cell.
elapsed_seconds = time.time() - start
print('The execution time of this jupyter notebook is {}'.format(elapsed_seconds))