### Description
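This Jupyter notebook queries the ChEMBL online database and extracts the molecules that have a recorded IC50 (ChEMBL `standard_value`) against one specific KRas protein target (GTPase KRas).
It then takes each molecule's SMILES string and generates descriptor features with the RDKit library via the `getMolDescriptors` function. Finally, the IC50 values are converted to pIC50 and the resulting table is written to `training_descriptors.csv`.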

In [1]:
from chembl_webresource_client.new_client import new_client

In [2]:
import pandas as pd
# Search the ChEMBL target endpoint for the term 'kras' and tabulate the hits
# so the ChEMBL ID of the wanted target can be picked in the next cell.
target = new_client.target
target_query = target.search('kras')
targets = pd.DataFrame.from_dict(target_query)
# Last expression of the cell: display the table of matching targets.
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,GTPase KRas,20.0,False,CHEMBL2189121,"[{'accession': 'P01116', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,VHL/KRAS,18.0,False,CHEMBL5169273,"[{'accession': 'P01116', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
2,[],Homo sapiens,PDE6D/KRAS,17.0,False,CHEMBL4523623,"[{'accession': 'O43924', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
3,[],Homo sapiens,GTPase KRas/RAF1,17.0,False,CHEMBL5291977,"[{'accession': 'P04049', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
4,[],Homo sapiens,RAS,16.0,False,CHEMBL4524006,"[{'accession': 'P01112', 'component_descriptio...",PROTEIN FAMILY,9606
5,[],Homo sapiens,Son of sevenless homolog 1,5.0,False,CHEMBL2079846,"[{'accession': 'Q07889', 'component_descriptio...",SINGLE PROTEIN,9606
6,[],Homo sapiens,Cereblon/SOS1,4.0,False,CHEMBL5291681,"[{'accession': 'Q07889', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
7,[],Homo sapiens,VHL/SOS1,3.0,False,CHEMBL5169070,"[{'accession': 'Q07889', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
8,[],Homo sapiens,SOS1/VHL/ELOC/ELOB,2.0,False,CHEMBL5169077,"[{'accession': 'Q07889', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606


In [3]:
# Select the human single-protein "GTPase KRas" target explicitly instead of
# trusting that it happens to be ranked first in the search results; the
# ranking comes from the remote search and is not guaranteed to stay stable.
is_kras = (
    (targets["pref_name"] == "GTPase KRas")
    & (targets["organism"] == "Homo sapiens")
    & (targets["target_type"] == "SINGLE PROTEIN")
)
if not is_kras.any():
    raise LookupError(
        "GTPase KRas (Homo sapiens, SINGLE PROTEIN) was not found in the ChEMBL target search results"
    )
selected_target = targets.loc[is_kras, "target_chembl_id"].iloc[0]

# Fetch the activity records for that target, restricted to IC50
# measurements, and load them into a DataFrame.
activity = new_client.activity
res = activity.filter(target_chembl_id=[selected_target]).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)

In [4]:
# Display the "canonical_smiles" column of the downloaded activity table.
df["canonical_smiles"]

0                           Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN
1                           Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN
2      CC(C)[C@@H]1NC(=O)[C@@H](CC(N)=O)NC(=O)[C@H](C...
3      CCCC[C@H]1NC(=O)[C@@H](Cc2ccc3ccccc3c2)NC(=O)[...
4      CC1(C)COC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CCCNC...
                             ...                        
922    COC/C=C/C(=O)N1CC[C@H](n2nnc3c(O[C@@H](C)[C@@H...
923    COC/C=C/C(=O)N1CC[C@H](n2nnc3c(O[C@@H](C)[C@@H...
924    Cc1cccc(-c2c(C)cc3c(nc(N4CC(C)(N(C)C)C4)c4nnn(...
925    COC/C=C/C(=O)N1CC[C@H](n2nnc3c(O[C@@H](C)[C@@H...
926    C=C(F)C(=O)N1CC[C@H](n2nnc3c(O[C@@H](C)[C@@H]4...
Name: canonical_smiles, Length: 927, dtype: object

In [5]:
# Preview the first rows of the downloaded activity table.
df.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,13352855,[],CHEMBL2399318,Inhibition of full-length human KRas4B (amino ...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,155.0
1,,,13352856,[],CHEMBL2399319,Inhibition of full-length human KRas4B (amino ...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,342.0
2,,,14548911,[],CHEMBL3223233,Inhibition of recombinant HA-tagged K-Ras G12V...,B,P01116,G12V,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,1.2
3,,,14548912,[],CHEMBL3223233,Inhibition of recombinant HA-tagged K-Ras G12V...,B,P01116,G12V,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,10.0
4,,,14548913,[],CHEMBL3223233,Inhibition of recombinant HA-tagged K-Ras G12V...,B,P01116,G12V,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,1.8


In [6]:
from rdkit.Chem import Descriptors
def getMolDescriptors(mol, missingVal=0):
    ''' Calculate every descriptor in Descriptors._descList except 'Ipc'
        for a single molecule.

        mol        -- the molecule object passed to each descriptor function.
                      None is accepted and yields None (the caller builds
                      `mol` from a SMILES string, which presumably can fail
                      to parse -- confirm against the RDKit documentation).
        missingVal -- kept only for backward compatibility of the signature.
                      It is NOT substituted for descriptors that fail: a
                      single failure rejects the whole molecule (see below).

        Returns a list with one value per descriptor, in the order of
        Descriptors._descList with 'Ipc' left out, or None if `mol` is None
        or any descriptor function raised an exception.
    '''
    # Nothing can be computed without a molecule; reject it up front
    # instead of relying on the first descriptor call to raise.
    if mol is None:
        return None

    res = []
    for nm, fn in Descriptors._descList:
        # 'Ipc' is deliberately left out of the feature set.
        # NOTE(review): presumably because its value can become extremely
        # large for big molecules -- confirm.
        if nm == 'Ipc':
            continue
        try:
            res.append(fn(mol))
        except Exception:
            # Some descriptor functions raise when they fail. Callers treat
            # None as "skip this molecule". Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still stop a long-running loop.
            return None
    return res

In [7]:
# Names of all RDKit descriptors, in the order RDKit lists them.
columnNames = [name for name, _ in Descriptors._descList]
# Empty frame with one column per descriptor; 'Ipc' is removed because
# getMolDescriptors does not compute it.
descriptors = pd.DataFrame(columns=columnNames).drop(columns=["Ipc"])
# Show the resulting column index.
descriptors.columns

Index(['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex',
       'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt',
       'NumValenceElectrons',
       ...
       'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
       'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=209)

In [8]:
from rdkit import Chem
# Column order produced by getMolDescriptors: every RDKit descriptor except
# 'Ipc'. Deriving it here (instead of hard-coding a count) keeps the length
# check valid for whatever RDKit version is installed, and makes this cell
# safe to re-run after later cells have added columns to `descriptors`.
descriptor_names = [nm for nm, _ in Descriptors._descList if nm != "Ipc"]

kept_index = []      # index labels of `df` rows whose descriptors were computed
descriptor_rows = [] # one list of descriptor values per kept row
for row_label, smiles in df["canonical_smiles"].items():
    # Rows without a SMILES string cannot be converted to a molecule.
    if not isinstance(smiles, str):
        continue
    values = getMolDescriptors(Chem.MolFromSmiles(smiles))
    # getMolDescriptors returns None when the molecule must be skipped.
    if values is not None and len(values) == len(descriptor_names):
        kept_index.append(row_label)
        descriptor_rows.append(values)

# Build the table once rather than growing it row by row. The index carries
# the labels of `df`, so later cells can align per-row data from `df` with it.
descriptors = pd.DataFrame(
    descriptor_rows, index=kept_index, columns=descriptor_names, dtype=float
)
descriptors.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,6.17128,6.17128,0.617806,0.617806,0.834426,11.2,243.137,231.041,242.037754,80.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6.17128,6.17128,0.617806,0.617806,0.834426,11.2,243.137,231.041,242.037754,80.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15.021999,15.021999,0.005149,-1.84623,0.018162,23.08,1396.591,1301.839,1395.709934,544.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,15.494058,15.494058,0.008604,-1.707951,0.013858,22.266055,1499.798,1395.974,1498.781308,582.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
4,15.044508,15.044508,0.005398,-1.719962,0.017958,22.39,1377.612,1284.876,1376.691744,534.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# https://github.com/chaninlab/estrogen-receptor-alpha-qsar/blob/master/02_ER_alpha_RO5.ipynb

import numpy as np

def pIC50(input):
    """Convert the IC50 values in ``input["standard_value"]`` to pIC50.

    pIC50 is ``-log10(IC50 in mol/L)``. The values are treated as nanomolar
    and scaled by 1e-9 before taking the logarithm.
    NOTE(review): the unit column of the frame is not checked here -- confirm
    that every row really is in nM.

    Parameters
    ----------
    input : pandas.DataFrame
        Must contain a "standard_value" column. The parameter name shadows
        the builtin ``input``; it is kept so existing keyword callers work.

    Returns
    -------
    pandas.Series
        The new ``input["pIC50"]`` column, indexed like ``input``. Entries
        are NaN where "standard_value" is missing, non-numeric, zero or
        negative (the logarithm is undefined there), so they can be dropped
        with ``notna()`` / ``dropna()`` instead of leaking ``inf`` downstream.

    Side effects
    ------------
    ``input`` is modified in place: "standard_value" is replaced by its
    numeric version (unparsable entries become NaN) and a "pIC50" column is
    added or overwritten.
    """
    # Coerce to numbers; anything unparsable becomes NaN.
    input["standard_value"] = pd.to_numeric(input["standard_value"], errors='coerce')

    # log10 is only defined for strictly positive concentrations; mask the
    # rest to NaN up front so no inf values or runtime warnings are produced.
    positive_nM = input["standard_value"].where(input["standard_value"] > 0)

    # Converts nM to M, then takes the negative decadic logarithm.
    input["pIC50"] = -np.log10(positive_nM * (10 ** -9))

    return input["pIC50"]

In [10]:
# Attach the pIC50 values computed from `df` as the "standard_value" column
# (assignment aligns on the index), then discard rows that have no value.
descriptors["standard_value"] = pIC50(df)
descriptors = descriptors.dropna(subset=["standard_value"])

In [11]:
# Preview the first rows of the descriptor table.
descriptors.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,standard_value
0,6.17128,6.17128,0.617806,0.617806,0.834426,11.2,243.137,231.041,242.037754,80.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.809668
1,6.17128,6.17128,0.617806,0.617806,0.834426,11.2,243.137,231.041,242.037754,80.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.465974
2,15.021999,15.021999,0.005149,-1.84623,0.018162,23.08,1396.591,1301.839,1395.709934,544.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,5.920819
3,15.494058,15.494058,0.008604,-1.707951,0.013858,22.266055,1499.798,1395.974,1498.781308,582.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,5.0
4,15.044508,15.044508,0.005398,-1.719962,0.017958,22.39,1377.612,1284.876,1376.691744,534.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.744727


In [12]:
# Write the descriptor table to "training_descriptors.csv" in the current
# working directory; index=False leaves the row index out of the file.
descriptors.to_csv("training_descriptors.csv", index=False)