# Small Molecule Design Toolkit

#### Imports

In [3]:
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
import pandas as pd
import numpy as np
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
import os.path as op
import inspect
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

#### Data

In [2]:
df_x = pd.read_csv('SMILES.csv')
df_y = pd.read_csv('Activity.csv')
df_y.drop(['Unnamed: 0'],axis=1,inplace=True)
df_x.drop(['Unnamed: 0'],axis=1,inplace=True)

#### Generating Morgan Fingerprints

In [4]:
mol = [Chem.MolFromSmiles(m) for m in df_x['SMILES']]
GetMorgan = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mol]
def rdkit_numpy_convert(fp):
    output = []
    for f in fp:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)
Morgan=pd.DataFrame(rdkit_numpy_convert(GetMorgan))

In [6]:
Morgan.to_csv('MorganFingerprints.csv')

#### Functions

In [None]:
def change_nan_infinite(dataframe):
    """
    Replacing NaN and infinite values from the dataframe with zeros.
    :param dataframe: Dataframe containing NaN and infinite values.
    :return data: Data with no NaN or infinite values.
    """

    dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)
    data = dataframe.fillna(0)

    return data

#### Set random seed to make all further calculations reproducible

In [32]:
seed=42

#### Split the whole set on training and test sets

In [33]:
# randomly select 20% of compounds as test set
x_tr, x_ts, y_tr, y_ts = train_test_split(Morgan, df_y, test_size=0.20, random_state=seed)

In [42]:
x_train = change_nan_infinite(x_tr)
y_train = change_nan_infinite(y_tr)
x_test = change_nan_infinite(x_ts)
y_test = change_nan_infinite(y_ts)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Random Forests

In [44]:
m = RandomForestRegressor()

In [45]:
m.fit(x_train,y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [46]:
y_pred = m.predict(x_test)

In [47]:
r2_score(y_pred,y_test)

-2.1231462006429616