### Imports

In [5]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')

from rdkit import Chem, DataStructs
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn import svm
from sklearn.metrics import recall_score, roc_auc_score

### Helper functions

In [6]:
class FP:
    """
    Lightweight container for a single molecular fingerprint vector.

    The wrapped vector is exposed unchanged as the ``fp`` attribute, so one
    whole fingerprint can be held as a single object (e.g. one cell of a
    pandas data frame column).
    """
    def __init__(self, fp):
        # Keep a reference to the vector as given; no copy or conversion is made
        self.fp = fp

    def __len__(self):
        # Length of the container is the length of the wrapped vector
        return len(self.fp)

    def __str__(self):
        # Short summary, e.g. "1024 bit FP"
        return "{:d} bit FP".format(len(self))

def get_morgan_fp(mol, radius=2, n_bits=1024):
    """
    Returns a folded Morgan fingerprint for a molecule, wrapped in an FP object

    The fingerprint is generated with RDKit's GetMorganFingerprintAsBitVect,
    but the returned vector is built from the ``bitInfo`` dictionary filled in
    by that call: position ``i`` holds the number of entries RDKit recorded
    for bit ``i`` (0 if the bit was not set), not just 0/1.

    Parameters
    ----------
    mol : RDKit molecule
        Molecule to fingerprint.
    radius : int, optional
        Morgan radius passed to RDKit (default 2).
    n_bits : int, optional
        Length of the folded fingerprint (default 1024).

    Returns
    -------
    FP
        Wrapper whose ``fp`` attribute is a numpy array of length ``n_bits``.
    """
    # Only the bitInfo side effect is needed; the bit vector itself is not used
    info = {}
    AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits, useFeatures=False, bitInfo=info)

    # One slot per bit: how many entries were recorded for that bit
    counts = np.array([len(info[bit]) if bit in info else 0 for bit in range(n_bits)])

    return FP(counts)

### Data preparation

In [20]:
# The data file should contain three columns
# 1. molecule ID (read as the data frame index);
# 2. canonical SMILES (column 'can_smiles'); and
# 3. activity (column 'ac', which is either 1 or 0)

# reading the data file into a pandas data frame
df = pd.read_csv("publications/external/E3_training.csv", index_col=0)

# Build ROMol objects (adds a 'ROMol' column to df in place)
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='can_smiles')

# Remove molecules that could not be parsed from SMILES.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below writes to a real frame rather than to a slice of one
df = df[df['ROMol'].notnull()].copy()

# Calculate fingerprints and store them in df
# Note: if additional fingerprints are needed that are not available in RDKit, they must be imported with the data
df['fp'] = df['ROMol'].apply(get_morgan_fp)

### Defining X and Y

In [21]:
# Feature matrix X: one row per molecule, built from the vector wrapped by
# each FP object in the 'fp' column (i.e. the molecular fingerprints)
X = np.array([fingerprint.fp for fingerprint in df['fp']])

# Target vector y: activity values from the 'ac' column
# (blocker = 1, non-blocker = 0)
y = np.array(df['ac'])

### Cross Validation

In [22]:
# Cross-validation settings
N_FOLDS = 10   # number of stratified folds
SEED = 0       # seed for fold shuffling and for the SVM probability calibration

# Per-fold performance measures (collected in lists, converted to arrays after the loop)
auc_scores = []
sens_scores = []
spec_scores = []

# 10-fold cross-validation split
# NOTE(review): this is the legacy sklearn.cross_validation call signature that
# matches the import at the top of the notebook. sklearn.model_selection.StratifiedKFold
# instead takes n_splits and is iterated via .split(X, y) - update both together
# when moving to a newer scikit-learn.
kfolds = StratifiedKFold(y, n_folds=N_FOLDS, shuffle=True, random_state=SEED)

for train, test in kfolds:
    # Split data to training and test set
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    # if undersampling is required - random undersampling of the majority class can be done as follows
    # NOTE(review): RandomUnderSampler is not among the imports shown at the top of
    # this notebook and must be imported before enabling either snippet.

    # undersampling training set
    #uSampler = RandomUnderSampler(ratio=1., replacement=False)
    #X_train, y_train = uSampler.fit_sample(X_train, y_train)

    # undersampling test set
    #uSampler = RandomUnderSampler(ratio=1., replacement=False)
    #X_test, y_test = uSampler.fit_sample(X_test, y_test)

    # Training an SVM classifier
    # - coef0 / degree / gamma are not used by the linear kernel, so they are omitted
    # - random_state fixes the shuffling behind probability=True, so the predicted
    #   probabilities (and hence the AUC) are reproducible between runs
    svm_clf = svm.SVC(C=1.0, kernel='linear', probability=True, random_state=SEED)
    svm_clf.fit(X_train, y_train)

    # Predicting the test set
    y_pred = svm_clf.predict(X_test)
    # Column 1 of predict_proba corresponds to the second class label (1 for 0/1 labels)
    y_pred_proba = svm_clf.predict_proba(X_test)[:, 1]

    # Append performance measures
    auc_scores.append(roc_auc_score(y_test, y_pred_proba))
    sens_scores.append(recall_score(y_test, y_pred, pos_label=1))   # recall of class 1
    spec_scores.append(recall_score(y_test, y_pred, pos_label=0))   # recall of class 0

# Keep the results available as numpy arrays under their original names
auc = np.array(auc_scores)
sens = np.array(sens_scores)
spec = np.array(spec_scores)

# 10-fold cross-validation performance
print('AUC:\t\t\t%.2f +/- %.2f' % (auc.mean(), auc.std()))
print('Sensitivity:\t\t%.2f +/- %.2f' % (sens.mean(), sens.std()))
print('Specificity:\t\t%.2f +/- %.2f' % (spec.mean(), spec.std()))

AUC:			0.84 +/- 0.12
Sensitivity:		0.47 +/- 0.20
Specificity:		0.92 +/- 0.03
