# Small Molecule Design Toolkit

#### Imports

In [18]:
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
import pandas as pd
import numpy as np
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
import os.path as op
import inspect
from sklearn.ensemble import RandomForestClassifier

#### Functions

In [2]:
def create_activity_dataframe(dataframe):
    """
    Performing useful transformations on the acquired data for use in subsequent algorithm.
    :param dataframe: Dataframe downloaded from NCBI database.
    :return: df: Cleaned and sorted dataframe.
    """

    # Eliminates first five text rows of csv
    for j in range(5):
        df = dataframe.drop(j, axis=0)
    df = df.drop(['PUBCHEM_ACTIVITY_URL', 'PUBCHEM_RESULT_TAG',
                  'PUBCHEM_ACTIVITY_SCORE', 'PUBCHEM_SID',
                  'PUBCHEM_ASSAYDATA_COMMENT', 'Potency',
                  'Efficacy', 'Analysis Comment',
                  'Curve_Description', 'Fit_LogAC50',
                  'Fit_HillSlope', 'Fit_R2', 'Fit_ZeroActivity',
                  'Fit_CurveClass', 'Excluded_Points', 'Compound QC',
                  'Max_Response', 'Phenotype', 'Activity at 0.457 uM',
                  'Activity at 2.290 uM', 'Activity at 11.40 uM',
                  'Activity at 57.10 uM', 'PUBCHEM_ACTIVITY_OUTCOME',
                  'Fit_InfiniteActivity'], axis=1)
    df.rename(columns={'PUBCHEM_CID': 'CID'}, inplace=True)

    # Eliminates duplicate compound rows
    df['dupes'] = df.duplicated('CID')
    df = df[df['dupes'] == 0].drop(['dupes'], axis=1)
    df = df.sort_values(by='CID')
    return df

In [21]:
def change_nan_infinite(dataframe):
    """
    Replacing NaN and infinite values from the dataframe with zeros.
    :param dataframe: Dataframe containing NaN and infinite values.
    :return data: Data with no NaN or infinite values.
    """

    dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)
    data = dataframe.fillna(0)

    return data

#### Data Loading and Cleaning

In [3]:
path= op.dirname(op.dirname(op.abspath(inspect.getfile(inspect.currentframe()))))
path = op.join(path, 'data')

In [4]:
df_smiles = pd.read_csv(op.join(path,'compounds_smiles.txt'),sep="\t",names=["CID", "SMILES"])
response = pd.read_csv(op.join(path,'AID_743255_datatable.csv'))
df_activity = create_activity_dataframe(response)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df = df_activity.merge(df_smiles)
#The two numbers link to two rows where the SMILES strings were faulty and contained unrecognizable characters. 
#So, I decided to drop those rows
df.drop(df.index[[276743, 354142]], inplace=True)
df.sort_values(by='CID', inplace=True)
df.reset_index(drop=True, inplace=True)
TARGET_COLUMN = 'Activity_Score'
# Drop non-descriptor columns before feature space reduction
df_x = df.drop([TARGET_COLUMN, 'CID'], axis=1)
# Creating target column
df_y = df.drop(['SMILES', 'CID'], axis=1)
df_y=pd.to_numeric(df_y['Activity_Score'])
df_y=pd.DataFrame(df_y)

#### Generating Descriptors

In [6]:
mol = [Chem.MolFromSmiles(m) for m in df_x['SMILES']]

In [7]:
d=[AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mol]

In [8]:
def rdkit_numpy_convert(fp):
    output = []
    for f in fp:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [9]:
c=rdkit_numpy_convert(d)

In [26]:
a=pd.DataFrame(c)

In [None]:
a.head()

In [12]:
np.sum(df_y['Activity_Score'])/len(df_y['Activity_Score'])

0.78195801395427633

#### Set random seed to make all further calculations reproducible

In [13]:
seed=42

#### Split the whole set on training and test sets

In [22]:
# randomly select 20% of compounds as test set
x_tr, x_ts, y_tr, y_ts = train_test_split(c, df_y, test_size=0.20, random_state=seed)

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [23]:
x_tr = change_nan_infinite(x_tr)
y_tr = change_nan_infinite(y_tr)
x_ts = change_nan_infinite(x_ts)
y_ts = change_nan_infinite(y_ts)

AttributeError: 'numpy.ndarray' object has no attribute 'replace'

#### Create folds for cross-validation

In [15]:
cv = StratifiedKFold(n_splits=5, random_state=seed)

#### Search for optimal tuning parameters and build the model

In [16]:
# create grid search dictionary
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [19]:
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [20]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').