In [3]:
import pandas as pd

from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import DataStructs, Descriptors

import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef, make_scorer

import os
from rdkit.Chem import PandasTools
seed = 19111

In [9]:
from multiprocessing import Pool
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef, make_scorer, confusion_matrix

In [11]:
NUM_PROC = os.cpu_count() // 1

In [1]:
data_dir = '/home/Development/GAN_testing/A2AR'
all_data = 'A2AR.csv'

In [4]:
data = pd.read_csv(os.path.join(data_dir, all_data))

In [5]:
data.head()

Unnamed: 0,SMILES,label,CMPD_CHEMBLID
0,Brc1c(NC2=NCCN2)ccc2nccnc12,0,CHEMBL844
1,Brc1cccc(Nc2nc3c(N4CCCC4)ncnc3s2)c1,1,CHEMBL3827889
2,Brc1ccccc1,0,CHEMBL16068
3,C#CC1(O)CCC2C3C(C)CC4=C(CCC(=O)C4)C3CCC21C,0,CHEMBL2103774
4,C#CC1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3CCC21C,0,CHEMBL241694


In [6]:
def morganfp(smi, bits=4096, radius=3):
    mol = Chem.MolFromSmiles(smi)
    if mol is None: return
    vec = np.ndarray((1, bits), dtype=int)
    fp = Chem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bits)
    DataStructs.ConvertToNumpyArray(fp, vec)
    return vec

In [7]:
def qsar_model(df):
    print(df.shape)
    cv = StratifiedKFold(n_splits=5, random_state=seed)
    y = df.label.values
    with Pool(NUM_PROC) as p:
        X = np.array(p.map(morganfp, df.SMILES.values))
    param_grid = {"max_features": [int(np.sqrt(X.shape[1])), 
                               X.shape[1] // 3], 
              "n_estimators": [1000],             
             }
    clf = GridSearchCV(RandomForestClassifier(random_state=seed), 
                  param_grid, n_jobs=-1, cv=cv, verbose=1)
    clf.fit(X, y)
    print(clf.best_score_)
    return clf

In [12]:
model = qsar_model(data)

(4661, 3)
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 15.3min remaining: 10.2min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 16.6min finished


0.792748337267


### Save model

In [13]:
import sklearn
from sklearn.externals import joblib
print(sklearn.__version__)

0.20.1


In [14]:
rf = model.best_estimator_
file = os.path.join(data_dir, 'A2AR.jbl')
joblib.dump(model, file)

['/home/Development/GAN_testing/A2AR/A2AR.jbl']