In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

df1 = pd.read_csv('data/MMP9_sm_activity_proc1.csv')

In [2]:
#function that calculates lipinski values based on a smiles file
def lipinski(smiles, verbose=False):

    try:
        molecule_data= []
        for element in smiles:
            mol=Chem.MolFromSmiles(element) 
            molecule_data.append(mol)

        baseData= np.arange(1,1)
        i=0  
        for mol in molecule_data:        

            paramater_MolWt = Descriptors.MolWt(mol)
            paramater_MolLogP = Descriptors.MolLogP(mol)
            paramater_NumHDonors = Lipinski.NumHDonors(mol)
            paramater_NumHAcceptors = Lipinski.NumHAcceptors(mol)

            row = np.array([paramater_MolWt,
                            paramater_MolLogP,
                            paramater_NumHDonors,
                            paramater_NumHAcceptors])   

            if(i==0):
                baseData=row
            else:
                baseData=np.vstack([baseData, row])
            i=i+1      

        columnNames=["MW","LogP","NumHDonors","NumHAcceptors"]   
        parameters = pd.DataFrame(data=baseData,columns=columnNames)
        
    except:
        print('Check for NaN values in the smiles column')

    return parameters


#normalize IC50 values to 100000000 or below
def normalize(input):
    norm = []

    for i in input['standard_value']:
        if i > 100000000:
          i = 100000000
        norm.append(i)

    input['standard_value_norm'] = norm
    x = input
        
    return x
     

#function that converts IC50 value to pIC50 for better data handling
def pIC50(input):
    pIC50 = []

    for i in input['standard_value_norm']:
        molar = i*(10**-9) #convert nM to M
        pIC50.append(-np.log10(molar))

    input['pIC50'] = pIC50
    x = input.drop('standard_value_norm', 1)
        
    return x


In [3]:
df_lipinski = lipinski(df1.canonical_smiles)

In [4]:
df2 = pd.concat([df1, df_lipinski], axis=1)

In [5]:
df_norm = normalize(df2)
df3 = pIC50(df_norm)
df3.to_csv('data/MMP9_sm_activity_proc2.csv', index=False)

  pIC50.append(-np.log10(molar))
  x = input.drop('standard_value_norm', 1)


In [6]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,activity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL80814,COc1ccc(CCN2CCC(C(=O)NO)(S(=O)(=O)c3ccc(OC)cc3...,34.00,active,448.541000,2.06020,2.0,7.0,7.468521
1,CHEMBL276119,COc1ccc(S(=O)(=O)C2(C(=O)NO)CCN(Cc3cccc(OC)c3)...,9.00,active,434.514000,2.01770,2.0,7.0,8.045757
2,CHEMBL78934,CCCCOc1ccc(S(=O)(=O)C2(C(=O)NO)CCN(Cc3ccc(Cl)c...,20.00,active,515.459000,4.48620,2.0,6.0,7.698970
3,CHEMBL78369,O=C(NO)C1(S(=O)(=O)c2ccc(OCc3ccccc3)cc2)CCN(Cc...,3.00,active,480.586000,3.57950,2.0,6.0,8.522879
4,CHEMBL514138,COc1ccc(S(=O)(=O)N(Cc2cccnc2)[C@@H](C(=O)NO)C(...,9.00,active,429.926000,2.23290,2.0,6.0,8.045757
...,...,...,...,...,...,...,...,...,...
2363,CHEMBL4854379,CNC(=O)C(Cc1ccccc1)NC(=O)C(CC(C)C)SC1CCCC1S,0.57,active,408.633000,3.45870,3.0,4.0,9.244125
2364,CHEMBL4872688,CC(C)c1ccc(-n2nnc(-c3ccc(S(=O)(=O)NCC(=O)O)cc3...,110.00,active,401.448000,1.81560,2.0,7.0,6.958607
2365,CHEMBL4869361,[2H]C([2H])(NC(=O)c1cc(C(=O)NCc2ccc(F)c(C)c2)n...,74000.00,inactive,424.428204,2.48232,3.0,5.0,4.130768
2366,CHEMBL4854306,Cc1ccc(Cl)cc1N1CCN(C(=O)CCC2(C3CC3)NC(=O)NC2=O...,20000.00,inactive,404.898000,2.06542,2.0,4.0,4.698970
