In [4]:
# loading library
import os, sys, csv

# multiprocessing module in python
import signal
import time
import multiprocessing

# rdkit module
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from rdkit.ML.Descriptors import Descriptors
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors3D
from rdkit.Chem import Lipinski
from rdkit.Chem.rdPartialCharges import ComputeGasteigerCharges
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator


# import scikitlearn lib
import pandas as pd 
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
# Removing features with low variance (automatically remove zero and certain threshold)
from sklearn.feature_selection import VarianceThreshold
# Univariate feature selection
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.feature_selection import chi2
# Tree based feature elimination
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
# recursive feature elimination
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV # search best hyper-parameters
from sklearn.model_selection import train_test_split

In [31]:
# load chemistry related functions based on rdkit

# convert rdkit molobject with 3d structure
def Get3DMolFromMol(molObject):
    """Return a version of ``molObject`` that carries an embedded 3D conformer.

    Hydrogens are added before embedding and removed again afterwards.

    Parameters
    ----------
    molObject : rdkit.Chem.rdchem.Mol or None
        Molecule to embed.

    Returns
    -------
    rdkit.Chem.rdchem.Mol or None
        The embedded molecule, or ``None`` when the input is ``None``, when
        RDKit raises, or when the embedding step reports failure.
    """
    if molObject is None:
        return None

    try:
        m3 = Chem.AddHs(molObject)
        # EmbedMolecule signals failure through a negative return value
        # instead of raising; without this check a molecule with no conformer
        # would be handed to the 3D descriptor functions.
        if AllChem.EmbedMolecule(m3) < 0:
            print("3D embedding failed: no conformer was generated")
            return None
        return Chem.RemoveHs(m3)
    except Exception as error:
        print(error)

    return None

# get molObject from smiles
def GetMolFromSmiles(smiles):
    """Parse a SMILES string into an RDKit molecule.

    Returns whatever ``Chem.MolFromSmiles`` returns for ``smiles``. If that
    call raises, the error is printed and ``None`` is returned instead.
    """
    parsed = None
    try:
        parsed = Chem.MolFromSmiles(smiles)
    except Exception as error:
        print(error)
    return parsed

# get all rdkit chem descriptor name 
def GetDescriptorName():
    """Build the list of descriptor names used for the 2D descriptor set.

    The list starts with 45 individually named descriptors and is followed by
    five numbered families (PEOE_VSA, SMR_VSA, SlogP_VSA, EState_VSA and
    VSA_EState), each numbered from 1. A new list is returned on every call.
    """
    names = [
        'BalabanJ', 'BertzCT', 'Ipc', 'HallKierAlpha', 'Kappa1',
        'Kappa2', 'Kappa3', 'Chi0', 'Chi1', 'Chi0n', 'Chi1n', 'Chi2n', 'Chi3n',
        'Chi4n', 'Chi0v', 'Chi1v', 'Chi2v', 'Chi3v', 'Chi4v', 'MolLogP', 'MolMR',
        'MolWt', 'ExactMolWt', 'HeavyAtomCount', 'HeavyAtomMolWt', 'NHOHCount',
        'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
        'NumValenceElectrons', 'NumAromaticRings', 'NumSaturatedRings',
        'NumAliphaticRings', 'NumAromaticHeterocycles', 'NumSaturatedHeterocycles',
        'NumAliphaticHeterocycles', 'NumAromaticCarbocycles', 'NumSaturatedCarbocycles',
        'NumAliphaticCarbocycles', 'RingCount', 'FractionCSP3', 'TPSA', 'LabuteASA',
    ]

    # (name template, number of members) for each numbered descriptor family,
    # in the order they are appended to the list.
    numbered_families = (
        ('PEOE_VSA{0}', 14),
        ('SMR_VSA{0}', 10),
        ('SlogP_VSA{0}', 12),
        ('EState_VSA{0}', 11),
        ('VSA_EState{0}', 10),
    )
    for template, member_count in numbered_families:
        names.extend(template.format(index) for index in range(1, member_count + 1))

    return names


def Get3dDescriptorNames():
    """Build the column names for the flattened 3D descriptor vector.

    Eleven single-value names come first, followed by numbered names
    (starting at 1) for the autocorr3d, rdf, morse, whim and getaway groups.
    A new list is returned on every call.
    """
    single_value_names = [
        "asphericity", "eccentricity", "inertialshapefactor", "npr1", "npr2",
        "pmi1", "pmi2", "pmi3", "radiusofgyration", "spherocityindex", "calcpdf",
    ]

    # (name template, number of components) for each multi-valued group,
    # in the order they are appended to the list.
    vector_groups = (
        ("autocorr3d_{0}", 80),
        ("rdf_{0}", 210),
        ("morse_{0}", 224),
        ("whim_{0}", 114),
        ("getaway_{0}", 273),
    )

    labels = list(single_value_names)
    for template, size in vector_groups:
        labels += [template.format(position) for position in range(1, size + 1)]

    return labels
    
# get 3d descriptor values (without getting the descriptor names)
def Get3dDescriptor(molObject):
    """Compute the 3D descriptor values for ``molObject``.

    Returns a 16-element list: eleven single-value descriptors followed by the
    results of the five multi-valued calculators (AUTOCORR3D, RDF, MORSE, WHIM
    and GETAWAY), each kept as returned by RDKit. The result is therefore
    nested and has to be flattened before it lines up with the 3D name list.

    These descriptors depend solely on the 3D structure of the molecule, so
    the quality of that structure matters. Background reading:
      https://www.researchgate.net/publication/313178046_Molecular_Descriptors
      http://match.pmf.kg.ac.rs/electronic_versions/Match45/match45_27-33.pdf
      http://joao.airesdesousa.com/qc/chapt3.pdf
    """
    return [
        # single-value descriptors
        Descriptors3D.Asphericity(molObject),
        Descriptors3D.Eccentricity(molObject),
        Descriptors3D.InertialShapeFactor(molObject),
        Descriptors3D.NPR1(molObject),
        Descriptors3D.NPR2(molObject),
        Descriptors3D.PMI1(molObject),
        Descriptors3D.PMI2(molObject),
        Descriptors3D.PMI3(molObject),
        Descriptors3D.RadiusOfGyration(molObject),
        Descriptors3D.SpherocityIndex(molObject),
        rdMolDescriptors.CalcPBF(molObject),         # plane of best fit
        # multi-valued descriptors (sizes as noted by the original author)
        rdMolDescriptors.CalcAUTOCORR3D(molObject),  # 80 values
        rdMolDescriptors.CalcRDF(molObject),         # 210 values
        rdMolDescriptors.CalcMORSE(molObject),       # 224 values
        rdMolDescriptors.CalcWHIM(molObject),        # 114 values
        rdMolDescriptors.CalcGETAWAY(molObject),     # 273 values
    ]

# calculate a Gasteiger-charge flag, the number of amide bonds, spiro atoms and bridgehead atoms, and the MQN descriptors (MQNs_)
def CalculateStandAloneDescriptor(molObject):
    """Compute a handful of descriptors that are not part of the named sets.

    Parameters
    ----------
    molObject : rdkit.Chem.rdchem.Mol
        Molecule to describe. ``ComputeGasteigerCharges`` is called on it.

    Returns
    -------
    list
        ``[gasteiger_flag, n_amide_bonds, n_spiro_atoms, n_bridgehead_atoms]``
        followed by every value returned by ``rdMolDescriptors.MQNs_``.
        ``gasteiger_flag`` is 0.0 when ``ComputeGasteigerCharges`` returns
        ``None`` and 1.0 otherwise.
    """
    # Identity test, so an object with an overloaded __eq__ cannot be mistaken
    # for None.
    # NOTE(review): ComputeGasteigerCharges appears to work in place on the
    # molecule's atoms; if it returns None on every call, this flag is a
    # constant 0.0 and carries no information. Confirm the intended meaning.
    gasteiger_flag = 0.0 if AllChem.ComputeGasteigerCharges(molObject) is None else 1.0

    value_list = [
        gasteiger_flag,
        rdMolDescriptors.CalcNumAmideBonds(molObject),
        rdMolDescriptors.CalcNumSpiroAtoms(molObject),
        rdMolDescriptors.CalcNumBridgeheadAtoms(molObject),
    ]
    value_list += rdMolDescriptors.MQNs_(molObject)

    return value_list

# Get the MACCS keys (166-key) fingerprint as a list of bits
def GetMACCFP(molObject):
    """Return the MACCS keys fingerprint of ``molObject`` as a plain list.

    The fingerprint produced by ``MACCSkeys.GenMACCSKeys`` is read position by
    position, so the result has one entry per position of that fingerprint.
    """
    fingerprint = MACCSkeys.GenMACCSKeys(molObject)
    return [fingerprint[position] for position in range(len(fingerprint))]


# get descriptor values based on descriptor names
def GetMolecularDescriptor(molObject, descriptorName):
    """Compute the descriptors named in ``descriptorName`` for ``molObject``.

    A ``MolecularDescriptorCalculator`` is created for the given names and its
    ``CalcDescriptors`` result is returned as a list.
    """
    calculator = MolecularDescriptorCalculator(descriptorName)
    return [value for value in calculator.CalcDescriptors(molObject)]

# join descriptor values from 3d descriptor due to special condition
def join3Ddescriptors(descriptorValues3d):
    """Flatten a mixed list of scalars and sequences into one flat list.

    Parameters
    ----------
    descriptorValues3d : iterable
        Items are either single values or iterables of values, e.g. the
        nested list produced by ``Get3dDescriptor``.

    Returns
    -------
    list
        Every scalar item, and every element of every iterable item, in their
        original order. Only one level of nesting is flattened.
    """
    values = []
    for d in descriptorValues3d:
        # Decide by iterability, not by an exact ``float`` type check: any
        # non-iterable item (float, int, other numeric scalar types) is a
        # single value, while everything iterable is expanded.
        try:
            elements = iter(d)
        except TypeError:
            values.append(d)
        else:
            values.extend(elements)

    return values


In [25]:
# Smoke test for the chemistry helpers: parse a SMILES string, then build a 3D structure from it.
smiles = "[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@]4(C)[C@@]3([H])CC[C@]12C)[C@H](C)CCCC(C)C"
molObject = GetMolFromSmiles(smiles)
# Fail fast: the helper can return None, and every later cell depends on this object.
assert molObject is not None, "failed to parse SMILES: {0}".format(smiles)
print("molObject = {0}".format(molObject))

molObject3D = Get3DMolFromMol(molObject)
# Same reasoning: a None here would only surface later as an obscure descriptor error.
assert molObject3D is not None, "failed to build a 3D structure for: {0}".format(smiles)
print("molObject3D = {0}".format(molObject3D))

molObject = <rdkit.Chem.rdchem.Mol object at 0x000002329DC89850>
molObject3D = <rdkit.Chem.rdchem.Mol object at 0x000002329DC892B0>


In [30]:
# 2D descriptors: compute one value per requested descriptor name.
descriptorNames = GetDescriptorName()
print("length of descriptor Names = {0}".format(len(descriptorNames)))
descriptorValues = GetMolecularDescriptor(molObject3D,descriptorNames)
print("length of descriptorValues = {0}".format(len(descriptorValues)))
# Names and values are paired by position, so their lengths must agree.
assert len(descriptorValues) == len(descriptorNames), "2D descriptor names and values are misaligned"

length of descriptor Names = 102
length of descriptorValues = 102


In [36]:
# 3D descriptors: compute the nested values and flatten them to match the name list.
descriptorNames3d = Get3dDescriptorNames()
print("length of descriptor3d Names = {0}".format(len(descriptorNames3d)))
# Flatten in a single step so descriptorValues3d never holds the nested form.
descriptorValues3d = join3Ddescriptors(Get3dDescriptor(molObject3D))
print("length of descriptor3d values = {0}".format(len(descriptorValues3d)))
# Names and values are paired by position, so their lengths must agree.
assert len(descriptorValues3d) == len(descriptorNames3d), "3D descriptor names and values are misaligned"

length of descriptor3d Names = 912
length of descriptor3d values = 912
