In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, Lipinski, Crippen, rdMolDescriptors

# Show every column when a wide DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

## Read data, perform general cleaning, and set up the small-molecule dataset

In [2]:
# The file is split on the literal '";"' sequence (a multi-character separator
# needs the python engine). This leaves a stray leading quote on the first
# column and a stray trailing quote on the last column, in the header as well
# as in the values.
df1 = pd.read_csv('data/sm_data.csv', sep='";"', engine='python')

# Fix the two quote-polluted header names and normalise the SMILES header
df1 = df1.rename(columns={
    '"ChEMBL ID': 'ChEMBL ID',
    'Inchi Key"': 'InChI Key',
    'Smiles': 'SMILES',
})

# Drop the first character of every ChEMBL ID and the last character of every
# InChI Key (the stray quotes left over from splitting)
df1['ChEMBL ID'] = df1['ChEMBL ID'].str.slice(start=1)
df1['InChI Key'] = df1['InChI Key'].str.slice(stop=-1)

In [3]:
# Display the column names of df1 to check the result of the header clean-up
df1.columns

Index(['ChEMBL ID', 'Name', 'Synonyms', 'Type', 'Max Phase',
       'Molecular Weight', 'Targets', 'Bioactivities', 'AlogP',
       'Polar Surface Area', 'HBA', 'HBD', '#RO5 Violations',
       '#Rotatable Bonds', 'Passes Ro3', 'QED Weighted', 'CX Acidic pKa',
       'CX Basic pKa', 'CX LogP', 'CX LogD', 'Aromatic Rings',
       'Structure Type', 'Inorganic Flag', 'Heavy Atoms', 'HBA (Lipinski)',
       'HBD (Lipinski)', '#RO5 Violations (Lipinski)',
       'Molecular Weight (Monoisotopic)', 'Np Likeness Score',
       'Molecular Species', 'Molecular Formula', 'SMILES', 'InChI Key'],
      dtype='object')

In [4]:
# Columns kept for data processing and analysis
subset1 = [
    'ChEMBL ID', 'InChI Key', 'Name', 'SMILES',
    'Type', 'Max Phase', 'Targets', 'Bioactivities',
]

# Keep only rows that have a SMILES value, restricted to the selected columns,
# and renumber the rows from zero
df2 = df1.loc[df1['SMILES'].notna(), subset1].reset_index(drop=True)

# Replace missing 'Targets' / 'Bioactivities' values with 0
df2[['Targets','Bioactivities']] = df2[['Targets','Bioactivities']].fillna(0)

In [5]:
# Display the distinct values found in the 'Type' column
df2['Type'].unique()

array(['Small molecule', 'Protein', 'Oligosaccharide', 'Unknown',
       'Oligonucleotide', 'Antibody'], dtype=object)

In [6]:
# Keep only the rows whose 'Type' is 'Small molecule' and renumber them from zero
df3 = df2[df2['Type'].eq('Small molecule')].reset_index(drop=True)

## Calculate molecular features from SMILES strings

In [7]:
#function that calculates molecular features based on a SMILES line code
def features(smiles, verbose=False):
    """Calculate RDKit molecular descriptors for a collection of SMILES strings.

    Parameters
    ----------
    smiles : iterable
        SMILES strings, e.g. a pandas Series or a list. Entries that are not
        strings (such as NaN) are accepted and treated as unparseable.
    verbose : bool, default False
        If True, print the position and value of every entry that could not
        be converted into a molecule.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with a fresh 0..n-1 index and
        the float columns 'MW', 'LogP', 'MolMR', 'H Donors', 'H Acceptors',
        'TPSA', 'Heavy Atom Count', 'Aromatic Rings', 'Rotatable Bonds' and
        'Ring Count'. Entries that could not be parsed produce a row of NaN,
        so the output always lines up positionally with the input. Empty input
        produces an empty frame with the same columns.

    Notes
    -----
    The function disables the ``rdApp.*`` RDKit log channels and does not
    re-enable them afterwards.
    """
    columnNames = ['MW','LogP','MolMR','H Donors','H Acceptors', 'TPSA',
                   'Heavy Atom Count', 'Aromatic Rings', 'Rotatable Bonds', 'Ring Count']

    #disable warnings, use with caution (this is not undone when the function returns)
    RDLogger.DisableLog('rdApp.*')

    # Collect plain Python rows and build the frame once at the end; this also
    # keeps the result two-dimensional for a single molecule or empty input.
    rows = []
    for position, element in enumerate(smiles):
        # MolFromSmiles returns None when it cannot parse the string; non-string
        # entries are not passed to RDKit at all and are handled the same way.
        mol = Chem.MolFromSmiles(element) if isinstance(element, str) else None

        if mol is None:
            if verbose:
                print('Could not parse SMILES at position {}: {!r}'.format(position, element))
            # Placeholder row so that later rows keep their position
            rows.append([np.nan] * len(columnNames))
            continue

        # Order must match columnNames
        rows.append([
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Crippen.MolMR(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHAcceptors(mol),
            rdMolDescriptors.CalcTPSA(mol),
            Lipinski.HeavyAtomCount(mol),
            Lipinski.NumAromaticRings(mol),
            Lipinski.NumRotatableBonds(mol),
            Lipinski.RingCount(mol),
        ])

    return pd.DataFrame(rows, columns=columnNames, dtype=float)

In [8]:
# Run features() on the SMILES strings and combine the calculated data with the
# existing dataset.
# dropna() removes molecules with missing feature values. join='inner' is needed
# so those molecules are also left out of the combined frame: concat aligns on
# the index, and the default outer join would bring the dropped rows back with
# NaN in every feature column.
df_features = features(df3['SMILES']).dropna()
df4 = pd.concat([df3, df_features], axis=1, join='inner')

In [9]:
# Save the combined dataset (df3 columns plus the calculated features) as CSV,
# without writing the DataFrame index as a column
df4.to_csv('data/sm_data_features.csv', index=False)