# Small Molecule Design Toolkit

#### Imports

In [1]:
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
import pandas as pd
import numpy as np
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
import os.path as op
import inspect
from sklearn.ensemble import RandomForestClassifier

#### Functions

In [2]:
def create_activity_dataframe(dataframe):
    """
    Clean a raw assay results table into a de-duplicated, CID-sorted dataframe.

    The rows labelled 0-4 are removed, the assay bookkeeping columns are
    dropped, ``PUBCHEM_CID`` is renamed to ``CID``, only the first row seen
    for each CID is kept, and the result is sorted by CID. The input
    dataframe itself is left unmodified.

    :param dataframe: Dataframe downloaded from NCBI database. Its index must
        contain the labels 0-4 and it must contain every column listed below.
    :return: df: Cleaned and sorted dataframe.
    :raises KeyError: If a header row label or an expected column is missing.
    """

    # Eliminates first five text rows of csv. All five labels are passed to
    # a single drop() call so that every one of them is removed.
    df = dataframe.drop(index=list(range(5)))

    # Remove the columns that are not needed downstream.
    df = df.drop(columns=['PUBCHEM_ACTIVITY_URL', 'PUBCHEM_RESULT_TAG',
                          'PUBCHEM_ACTIVITY_SCORE', 'PUBCHEM_SID',
                          'PUBCHEM_ASSAYDATA_COMMENT', 'Potency',
                          'Efficacy', 'Analysis Comment',
                          'Curve_Description', 'Fit_LogAC50',
                          'Fit_HillSlope', 'Fit_R2', 'Fit_ZeroActivity',
                          'Fit_CurveClass', 'Excluded_Points', 'Compound QC',
                          'Max_Response', 'Phenotype', 'Activity at 0.457 uM',
                          'Activity at 2.290 uM', 'Activity at 11.40 uM',
                          'Activity at 57.10 uM', 'PUBCHEM_ACTIVITY_OUTCOME',
                          'Fit_InfiniteActivity'])
    df = df.rename(columns={'PUBCHEM_CID': 'CID'})

    # Eliminates duplicate compound rows, keeping the first occurrence of
    # each CID.
    df = df.drop_duplicates(subset='CID', keep='first')
    return df.sort_values(by='CID')

In [3]:
def change_nan_infinite(dataframe):
    """
    Replacing NaN and infinite values from the dataframe with zeros.

    Positive and negative infinity are first converted to NaN, then every
    NaN is filled with 0. The work is done on a copy, so the dataframe
    passed in is left untouched.

    :param dataframe: Dataframe containing NaN and infinite values.
    :return data: New dataframe with no NaN or infinite values.
    """

    # replace() without inplace=True returns a new frame, so the caller's
    # data is never partially modified as a side effect.
    data = dataframe.replace([np.inf, -np.inf], np.nan).fillna(0)

    return data

#### Data Loading and Cleaning

In [4]:
# Locate the 'data' directory that sits next to the directory containing
# this file: take this file's absolute path, go two levels up, append 'data'.
path = op.abspath(inspect.getfile(inspect.currentframe()))
path = op.join(op.dirname(op.dirname(path)), 'data')

In [5]:
# Tab-separated compound table; the column names are supplied here.
df_smiles = pd.read_csv(
    op.join(path, 'compounds_smiles.txt'),
    sep="\t",
    names=["CID", "SMILES"],
)

# Raw assay data table, then its cleaned, de-duplicated, CID-sorted form.
response = pd.read_csv(op.join(path, 'AID_743255_datatable.csv'))
df_activity = create_activity_dataframe(response)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Join activity data with the SMILES strings on their shared column(s).
df = df_activity.merge(df_smiles)

# The two positions below point at rows whose SMILES strings were faulty and
# contained unrecognizable characters, so those rows are removed.
df = df.drop(df.index[[276743, 354142]])
df = df.sort_values(by='CID').reset_index(drop=True)

TARGET_COLUMN = 'Activity_Score'

# Features: everything except the target and the compound identifier
# (non-descriptor columns are dropped before feature space reduction).
df_x = df.drop(columns=[TARGET_COLUMN, 'CID'])

# Target: the activity score converted to a numeric dtype, kept as a
# one-column dataframe.
df_y = df.drop(columns=['SMILES', 'CID'])
df_y = pd.DataFrame(pd.to_numeric(df_y['Activity_Score']))

#### Generating Descriptors

In [7]:
# Parse every SMILES string into an RDKit molecule object, in row order.
mol = list(map(Chem.MolFromSmiles, df_x['SMILES']))

In [8]:
# One Morgan fingerprint bit vector per molecule; the second argument (2)
# is passed positionally to RDKit (presumably the radius - confirm
# against the RDKit docs).
d = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, 2)
    for molecule in mol
]

In [9]:
def rdkit_numpy_convert(fp):
    """
    Convert an iterable of RDKit fingerprints into a single numpy array.

    :param fp: Iterable of fingerprint objects accepted by
        ``DataStructs.ConvertToNumpyArray``.
    :return: Array built from one converted numpy array per fingerprint,
        in input order; an empty array when ``fp`` is empty.
    """

    def _as_array(fingerprint):
        # One-element placeholder that ConvertToNumpyArray fills in place
        # (RDKit is expected to resize it to the fingerprint length).
        target = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, target)
        return target

    return np.asarray([_as_array(fingerprint) for fingerprint in fp])

In [10]:
# Turn the fingerprints into a numpy array, wrap it in a dataframe and
# write that dataframe to disk.
c = rdkit_numpy_convert(d)
Morgan = pd.DataFrame(data=c)
Morgan.to_csv(path_or_buf='MorganFingerprints.csv')

In [13]:
# Swap any NaN or infinite entries in the fingerprint table for zeros.
Morgan = change_nan_infinite(dataframe=Morgan)

In [14]:
# Write the fingerprint table to 'MorganFingerprints.csv', overwriting any
# existing file of that name.
Morgan.to_csv(path_or_buf='MorganFingerprints.csv')

In [15]:
# Write the target column dataframe to 'target.csv'.
df_y.to_csv(path_or_buf='target.csv')