In [0]:
# Environment setup (Colab): install Miniconda into /usr/local, then RDKit from conda-forge.
# `wget -c` resumes a partial download if the installer is already present.
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# Install with prefix /usr/local; `time` reports how long the install takes.
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit
import matplotlib.pyplot as plt
import sys
import os
# Make the conda-installed packages (RDKit) importable from the notebook kernel.
# NOTE(review): the path hard-codes python3.7; presumably it must match the Python
# version that "Miniconda3-latest" ships - confirm, otherwise `import rdkit` will fail.
sys.path.append('/usr/local/lib/python3.7/site-packages/')
# NOTE(review): versions are not pinned, so results may differ between runs of this notebook.
!pip install xgboost
!pip install scikit-optimize
!pip install scikit-plot

In [0]:
# Mount Google Drive at /content/drive; the data and helper paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Copy the project's `utils` package from Drive into /content so that
# `from utils.data_analysis import calculate_metrics` (import cell) can resolve.
!cp -r "/content/drive/My Drive/binding/customs/utils" /content

In [0]:
%tensorflow_version 1.x
import tensorflow as tf
import numpy as np
import pandas as pd
import xgboost as xgb

from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from rdkit.Chem import AllChem

from sklearn.metrics import roc_auc_score,precision_score,recall_score,accuracy_score

from xgboost import XGBClassifier
from sklearn.metrics import f1_score

from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from functools import partial

from utils.data_analysis import calculate_metrics

In [0]:
# Load the seven train/validation splits from Drive.
# Each CSV is read with its first column as the index.
SPLITS_DIR = '/content/drive/My Drive/p38_splits'
N_SPLITS = 7

#Training Data
training_list = [
    pd.read_csv('{}/train_{}.csv'.format(SPLITS_DIR, k), index_col=0)
    for k in range(N_SPLITS)
]
# Keep the individual names available; later cells refer to train_3 directly.
train_0, train_1, train_2, train_3, train_4, train_5, train_6 = training_list

#Validation Data
validation_list = [
    pd.read_csv('{}/val_{}.csv'.format(SPLITS_DIR, k), index_col=0)
    for k in range(N_SPLITS)
]
# Keep the individual names available; later cells refer to val_3 directly.
val_0, val_1, val_2, val_3, val_4, val_5, val_6 = validation_list

In [0]:
def _ecfp_matrix(smiles, radius=2, n_bits=1024):
  """Turn an iterable of SMILES strings into a float32 Morgan-fingerprint matrix.

  Args:
    smiles: re-iterable collection of SMILES strings (e.g. a pandas Series).
    radius: Morgan fingerprint radius passed to RDKit.
    n_bits: length of each fingerprint bit vector.

  Returns:
    numpy array of dtype float32 built from one fingerprint per input SMILES.

  Raises:
    ValueError: if RDKit cannot parse one or more SMILES strings.
  """
  mols = [Chem.MolFromSmiles(smi) for smi in smiles]
  # MolFromSmiles returns None for unparsable input; fail early with the
  # offending strings instead of an opaque error from the fingerprint call.
  unparsable = [smi for smi, mol in zip(smiles, mols) if mol is None]
  if unparsable:
    raise ValueError(
        "RDKit could not parse {} SMILES string(s), e.g. {!r}".format(
            len(unparsable), unparsable[:5]))
  fingerprints = [
      AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
      for mol in mols
  ]
  return np.array(fingerprints).astype(np.float32)


def xgb_fun(train,val,num_round,opt):
  """Train an XGBoost booster on one split and score it on the validation part.

  Args:
    train: table exposing `.rdkit` (SMILES strings) and `.Binary` (labels),
      used for fitting.
    val: table with the same two attributes, used for evaluation.
    num_round: number of boosting rounds passed to `xgb.train`.
    opt: booster parameters passed to `xgb.train` unchanged.

  Returns:
    Whatever `calculate_metrics` returns for the validation labels and the
    booster's predictions (called with `plots=True`).

  Raises:
    ValueError: if a SMILES string in `train` or `val` cannot be parsed.
  """
  #Training
  features_train = _ecfp_matrix(train.rdkit)
  labels_train = train.Binary

  #Validation
  features_val = _ecfp_matrix(val.rdkit)
  labels_val = val.Binary

  #XGBoost Matrices
  dtrain = xgb.DMatrix(data = features_train, label = labels_train)
  dval = xgb.DMatrix(data = features_val, label = labels_val)

  # Both sets are reported during training under the names 'eval' and 'train'.
  evallist = [(dval, 'eval'), (dtrain, 'train')]

  #Train
  bst = xgb.train(opt, dtrain, num_round, evallist)

  #Predictions
  ypred = bst.predict(dval)
  return calculate_metrics(np.array(labels_val),ypred,plots=True)

# Hyperparameter tuning

In [0]:
# Search space for the XGBoost hyperparameters.
# The order of the dimensions matters: the objective function receives the
# sampled values as a positional list and pairs them with the parameter names
# by position.
space = [
    Real(low=0.1, high=1.0, name="colsample_bylevel"),
    Real(low=0.1, high=1.0, name="colsample_bytree"),
    Real(low=0.1, high=1.0, name="gamma"),
    Real(low=0.1, high=1.0, name="learning_rate"),
    Real(low=0.1, high=10, name="max_delta_step"),
    Integer(low=6, high=15, name="max_depth"),
    Real(low=10.0, high=500.0, name="min_child_weight"),
    Integer(low=10, high=100, name="n_estimators"),
    Real(low=0.1, high=100, name="reg_alpha"),
    Real(low=0.1, high=100, name="reg_lambda"),
    Real(low=0.1, high=1.0, name="subsample"),
]


In [0]:
# function to fit the model and return the performance of the model
def return_model_assessment(args, X_train, y_train, X_test, y_test):
    """Fit an XGBClassifier with the given hyperparameter values and score it.

    Used as the objective for the hyperparameter search: the return value is
    `1 - F1` on the test data, so minimising it maximises the test F1 score.

    Reads the module-level list `curr_model_hyper_params` (parameter names, in
    the same order as `args`) and appends to the module-level lists `models`,
    `train_scores` and `test_scores` on every call.

    Args:
        args: hyperparameter values, positionally matching
            `curr_model_hyper_params`.
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for the returned score.

    Returns:
        1 minus the F1 score of the fitted model on (X_test, y_test).

    Raises:
        ValueError: if `args` and `curr_model_hyper_params` differ in length.
    """
    global models, train_scores, test_scores, curr_model_hyper_params
    if len(args) != len(curr_model_hyper_params):
        raise ValueError(
            "expected {} hyperparameter values, got {}".format(
                len(curr_model_hyper_params), len(args)))
    params = dict(zip(curr_model_hyper_params, args))

    model = XGBClassifier()
    model.set_params(**params)
    fitted_model = model.fit(X_train, y_train)
    models.append(fitted_model)

    train_predictions = fitted_model.predict(X_train)
    test_predictions = fitted_model.predict(X_test)
    # f1_score expects (y_true, y_pred) in that order.
    train_score = f1_score(y_train, train_predictions)
    test_score = f1_score(y_test, test_predictions)
    train_scores.append(train_score)
    test_scores.append(test_score)
    return 1 - test_score


In [0]:
# Hyperparameter search on split 3: featurise, run the optimiser, plot F1 per iteration.
import plotly.express as px

# Fixed seed so that the sampled hyperparameter points are reproducible.
SEARCH_SEED = 42

#Training
smi_train = train_3.rdkit
mols_train = [Chem.MolFromSmiles(smi) for smi in smi_train]
ECFP_train = [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) for x in mols_train]
ECFP_smiles_train = np.array(ECFP_train).astype(np.float32)
binary_train = train_3.Binary

#Validation
smi_cold = val_3.rdkit
mols_cold = [Chem.MolFromSmiles(smi) for smi in smi_cold]
ECFP_cold = np.array(
    [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) for x in mols_cold]
).astype(np.float32)
binary_cold = val_3.Binary

# State filled in by return_model_assessment on every optimiser call.
models = []
train_scores = []
test_scores = []
# Names must stay in the same order as the dimensions in `space`.
curr_model_hyper_params = ['colsample_bylevel', 'colsample_bytree',
                           'gamma', 'learning_rate', 'max_delta_step','max_depth', 'min_child_weight', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
objective_function = partial(return_model_assessment, X_train=ECFP_smiles_train, y_train=binary_train, X_test=ECFP_cold, y_test=binary_cold)

# running the algorithm
n_calls = 200 # number of times you want to train your model
# NOTE(review): n_random_starts=n_calls-1 means all but the last point are drawn
# at random, so this is effectively a random search - confirm that is intended.
results = gp_minimize(objective_function, space, base_estimator=None, n_calls=n_calls,
                      n_random_starts=n_calls-1, random_state=SEARCH_SEED)

# Long-format table: rows 0..n_calls-1 are train scores, rows n_calls..2*n_calls-1 are test scores.
iteration_numbers = list(range(1, n_calls+1))
metrics = pd.DataFrame({
    "F1 Score": train_scores + test_scores,
    "dataset": ["train_score"]*n_calls + ["test_score"]*n_calls,
    "Iteration Number": iteration_numbers + iteration_numbers,
})
fig = px.line(metrics, x="Iteration Number", y="F1 Score", color="dataset")
fig.show()

In [0]:
# Pick the hyperparameters of the iteration with the best validation F1.
# `metrics` stacks the scores: rows 0..n_calls-1 are train, rows
# n_calls..2*n_calls-1 are test, so each half is sliced with those exact bounds.
f1_scores = metrics['F1 Score']
best_train = f1_scores.iloc[0:n_calls].idxmax()            # row label of best train F1 (informational)
best_test = f1_scores.iloc[n_calls:2*n_calls].idxmax()     # row label of best test F1

# Selection is based on the test (validation) score, which is what the search
# minimised; subtracting n_calls converts the row label back to the iteration
# index into results.x_iters.
best = results.x_iters[int(best_test) - n_calls]

# Translate the sklearn-wrapper names used during the search into the native
# booster names (learning_rate -> eta, reg_alpha -> alpha, reg_lambda -> lambda).
# "n_estimators" is kept here so the tuned number of boosting rounds stays
# available; xgb.train itself takes the round count as a separate argument.
# The objective is set explicitly because the search used XGBClassifier
# (logistic objective), while xgb.train would otherwise default to regression.
param = {"colsample_bylevel": best[0],"colsample_bytree":best[1],
         "gamma":best[2],"eta":best[3],"max_delta_step":best[4],"max_depth":best[5],
         "min_child_weight":best[6],"n_estimators":best[7],"alpha":best[8],"lambda":best[9],"subsample":best[10],
         'objective':'binary:logistic',
         'eval_metric':'auc'} #best f1_score p38

# Free the search state (including every fitted model) before the final training runs.
del train_scores,test_scores
del metrics,results,objective_function,curr_model_hyper_params
del models

In [0]:
# Train and evaluate one booster per train/validation split with the tuned parameters.
# The number of boosting rounds is the tuned "n_estimators" value, not the size
# of the hyperparameter search (n_calls). The key is removed from the booster
# parameters because xgb.train takes the round count as its own argument.
num_round = int(param["n_estimators"])
booster_params = {key: value for key, value in param.items() if key != "n_estimators"}

fold_metrics = []
for train_split, val_split in zip(training_list, validation_list):
  fold_metrics.append(xgb_fun(train_split, val_split, num_round, booster_params))

# One row per split; saved for later comparison.
model_evals = pd.DataFrame(fold_metrics)
model_evals.to_csv('/content/results_xgboost.csv')

