In [1]:
import tensorflow as tf
import xgboost as xgb
import pandas as pd
import functools
import time
from tqdm import tqdm
tqdm.pandas()
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score, \
    auc, average_precision_score, pairwise_distances
import scikitplot as skplt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgb_hyper import objective
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import pickle
import dill
from functools import partial
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import DataStructs, Descriptors
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
import xgboost as xgb

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config = config)
print("The following GPU devices ara available: %s"%tf.test.gpu_device_name())

The following GPU devices ara available: /device:GPU:0


In [3]:
def calculate_metrics(y_true, y_pred, plots=False):
    assert isinstance(y_true, np.ndarray), 'y_true should be np.array'
    assert len(y_true.shape) == len(y_pred.shape) == 1, 'y_true or y_pred shapes are not 1 (probably not squeezed)'
    y_pred_bin = y_pred > 0.5

    cf = confusion_matrix(y_true, y_pred_bin)
    tn, fp, fn, tp = cf.ravel()

    metrics = {
        'roc_auc': roc_auc_score(y_true, y_pred),
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'map': average_precision_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred_bin),
        'recall': recall_score(y_true, y_pred_bin),
        'accuracy': accuracy_score(y_true, y_pred_bin),
    }

    if plots:
        print('predictions histogram')
        plt.figure()
        plt.hist(y_pred, bins=int(len(y_pred) / 3))
        plt.show()

        print('confusion matrix')
        plt.figure()
        group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        group_counts = ['{0:0.0f}'.format(value) for value in
                        cf.flatten()]
        group_percentages = ['{0:.2%}'.format(value) for value in
                             cf.flatten() / np.sum(cf)]
        labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in
                  zip(group_names, group_counts, group_percentages)]
        labels = np.asarray(labels).reshape(2, 2)
        sns.heatmap(cf, annot=labels, fmt='', cmap='Blues')
        plt.show()

        print('roc curve')
        random_probs = [0 for _ in range(len(y_true))]
        auc = roc_auc_score(y_true, y_pred)
        print('Logistic: ROC AUC=%.3f' % (auc))
        ns_fpr, ns_tpr, _ = roc_curve(y_true, random_probs)
        lr_fpr, lr_tpr, _ = roc_curve(y_true, y_pred)
        plt.plot(ns_fpr, ns_tpr, linestyle='--', label='random')
        plt.plot(lr_fpr, lr_tpr, marker='.')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend()
        plt.show()

    return metrics


In [4]:
target_1 = 'pi3k'
base_path_1 = f'C:/Users/tomas/Documents/GitHub/kinase_binding'

data_fpath_1 = base_path_1+f'/data/{target_1}/data.csv'
df_p38=pd.read_csv(data_fpath_1).set_index('biolab_index')

with open(base_path_1+f'/data/{target_1}/train_val_folds.pkl', "rb") as in_f:
    train_val_folds_p38 = dill.load(in_f)

with open(base_path_1+f'/data/{target_1}/train_test_folds.pkl', "rb") as in_f:
    train_test_folds_p38 = dill.load(in_f)
    
target_2 = 'akt1'
base_path_2 = f'C:/Users/tomas/Documents/GitHub/kinase_binding'

data_fpath_2 = base_path_2+f'/data/{target_2}/data.csv'
df_akt1 = pd.read_csv(data_fpath_2).set_index('biolab_index')

with open(base_path_2+f'/data/{target_2}/train_val_folds.pkl', "rb") as in_f:
    train_val_folds_akt1 = dill.load(in_f)
with open(base_path_2+f'/data/{target_2}/train_test_folds.pkl', "rb") as in_f:
    train_test_folds_akt1 = dill.load(in_f)
    
target_3 = 'pi3k'
base_path_3 = f'C:/Users/tomas/Documents/GitHub/kinase_binding'

data_fpath_3 = base_path_3+f'/data/{target_3}/data.csv'
df_pi3k = pd.read_csv(data_fpath_3).set_index('biolab_index')

with open(base_path_3+f'/data/{target_3}/train_val_folds.pkl', "rb") as in_f:
    train_val_folds_pi3k = dill.load(in_f)
with open(base_path_3+f'/data/{target_3}/train_test_folds.pkl', "rb") as in_f:
    train_test_folds_pi3k = dill.load(in_f)

In [5]:
#Evaluation Splits (our test set)
training_p38 = df_p38.loc[train_test_folds_p38[0]]
validation_p38 = df_p38.loc[train_test_folds_p38[1]]


training_akt1 = df_akt1.loc[train_test_folds_akt1[0]]
validation_akt1 = df_akt1.loc[train_test_folds_akt1[1]]
               

training_pi3k = df_pi3k.loc[train_test_folds_pi3k[0]]
validation_pi3k = df_pi3k.loc[train_test_folds_pi3k[1]]

In [6]:
#AVE Bias splits (test) only p38
ave_p38_train = pd.read_csv('data/p38/split_aveb/train_all.csv', index_col=0)
ave_p38_val = pd.read_csv('data/p38/split_aveb/test.csv', index_col = 0)
print(len(ave_p38_train),len(ave_p38_val))

3186 371


In [7]:
#Random splits with sklearn (on our test set)
df_p38 = df_p38.reset_index(drop=True)
X_train_p38, X_val_p38, Y_train_p38, Y_val_p38 = train_test_split(df_p38.rdkit,
                                                                  df_p38.Binary,
                                                                  test_size = 0.15,
                                                                  train_size = 0.85,
                                                                  shuffle = True)
X_train_p38 = pd.DataFrame(X_train_p38)
X_val_p38 = pd.DataFrame(X_val_p38)
print(len(X_train_p38),len(X_val_p38))

3190 564


In [8]:
df_akt1 = df_akt1.reset_index(drop=True)
X_train_akt1, X_val_akt1, Y_train_akt1, Y_val_akt1 = train_test_split(df_akt1.rdkit,
                                                                     df_akt1.Binary,
                                                                     test_size = 0.15,
                                                                     train_size = 0.85,
                                                                     shuffle = True)
X_train_akt1 = pd.DataFrame(X_train_akt1)
X_val_akt1 = pd.DataFrame(X_val_akt1)
print(len(X_train_akt1),len(X_val_akt1))

1819 321


In [9]:
df_pi3k = df_pi3k.reset_index(drop=True)
X_train_pi3k, X_val_pi3k, Y_train_pi3k, Y_val_pi3k = train_test_split(df_pi3k.rdkit,
                                                                      df_pi3k.Binary,
                                                                      test_size = 0.15,
                                                                      train_size = 0.85,
                                                                      shuffle = True)
X_train_pi3k = pd.DataFrame(X_train_pi3k)
X_val_pi3k = pd.DataFrame(X_val_pi3k)
print(len(X_train_pi3k),len(X_val_pi3k))

3190 564


In [10]:
# defining the space
fspace = {
    'colsample_bylevel' : hp.uniform('colsample_bylevel', 0.1, 1), #+
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.1, 1), #+
    'gamma' : hp.uniform('gamma', 0.1, 1), #+
    'learning_rate' : hp.uniform('learning_rate', 0.1, 1),
    'max_delta_step' : hp.quniform('max_delta_step',1,10,1),
    'max_depth' : hp.quniform('max_depth',6, 12, 1),
    'min_child_weight' : hp.quniform('min_child_weight',10 ,500 ,5),
    'reg_alpha' : hp.uniform('reg_alpha',0.1,100),
    'reg_lambda' : hp.uniform('reg_lambda',0.1,100),
    'subsample' : hp.uniform('subsample',0.1,1.0),
    'max_bin' : hp.quniform('max_bin',16,256,16)
    # add sampling method,max bin,predicto,monotone_constraints,interaction_constraints,single_precision_histogram
}

In [11]:
fmin_objective = partial(objective, train_sets = training_p38, val_sets = validation_p38)

In [12]:
def run_trials():

    trials_step = 0  # how many additional trials to do after loading saved trials. 1 = save after iteration
    max_trials = 1  # initial max_trials. put something small to not have to wait

    
    try:  # try to load an already saved trials object, and increase the max
        trials = pickle.load(open("xgboost.hyperopt", "rb"))
        print("Found saved Trials! Loading...")
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step))
    except:  # create a new trials object and start searching
        trials = Trials()

    best = fmin(fn = fmin_objective, space = fspace, algo=tpe.suggest, max_evals=max_trials, trials=trials)

    print("Best:", best)
    
    # save the trials object
    with open("xgboost.hyperopt", "wb") as f:
        pickle.dump(trials, f)
    return(trials)

In [13]:
trials = run_trials()

Found saved Trials! Loading...
Rerunning from 510 trials to 510 (+0) trials
100%|████████████████████████████████████████████████████████████████████████| 510/510 [00:00<?, ?trial/s, best loss=?]
Best: {'colsample_bylevel': 0.5612301667238877, 'colsample_bytree': 0.788688363076523, 'gamma': 0.35376030016117566, 'learning_rate': 0.4023692255888918, 'max_bin': 16.0, 'max_delta_step': 3.0, 'max_depth': 8.0, 'min_child_weight': 70.0, 'reg_alpha': 0.15030685758880047, 'reg_lambda': 15.311721955443915, 'subsample': 0.8303923929525608}


In [14]:
best_loss = trials.trials[0]['result']['loss']
for i in range(1,len(trials.trials)):
    if (trials.trials[i]['result']['loss'] <=  best_loss):
        best_loss = trials.trials[i]['result']['loss']
        index = i

    
best_params = trials.trials[index]['misc']['vals']
hyper_params = {
        "colsample_bylevel" : best_params['colsample_bylevel'][0],
        "colsample_bytree" : best_params['colsample_bytree'][0],
        "gamma" : best_params['gamma'][0],
        "eta" : best_params['learning_rate'][0],
        "max_delta_step" : int(best_params['max_delta_step'][0]),
        "max_depth" : int(best_params['max_depth'][0]),
        "min_child_weight" : int(best_params['min_child_weight'][0]),
        "alpha" : best_params['reg_alpha'][0],
        "lambda" : best_params['reg_lambda'][0],
        "subsample" : best_params['subsample'][0],
        "eval_metric":'auc',
        "objective":'binary:logistic',
        "booster":'gbtree',
        "tree_method" : 'gpu_hist',
        "single_precision_histogram" : True
}
from xgb_hyper import XGB_hyper
class_xgb = XGB_hyper(hyper_params)

In [15]:
train_list_p38 = [training_p38, ave_p38_train, X_train_p38]
val_list_p38 = [validation_p38, ave_p38_val, X_val_p38]

train_list_akt1 = [training_akt1, X_train_akt1]
val_list_akt1 = [validation_akt1, X_val_akt1]

train_list_pi3k = [training_pi3k, X_train_pi3k]
val_list_pi3k = [validation_pi3k, X_val_pi3k]

In [16]:
eval_p38 = {}
for i in range(len(train_list_p38)):
    if i == 2:
        smi = train_list_p38[i].rdkit
        mols = [Chem.MolFromSmiles(smi) for smi in smi]
        ECFP = [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) for x in mols]
        a = np.array(ECFP)
        ECFP_smiles = a.astype(np.float32)
        train_labels = Y_train_p38
        dmatrix_train = xgb.DMatrix(data = ECFP_smiles,label = train_labels)
        
        smi = val_list_p38[i].rdkit
        mols = [Chem.MolFromSmiles(smi) for smi in smi]
        ECFP = [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) for x in mols]
        a = np.array(ECFP)
        ECFP_smiles = a.astype(np.float32)
        val_labels = Y_val_p38
        dmatrix_val = xgb.DMatrix(data = ECFP_smiles,label = val_labels)
        del smi,mols,ECFP,a,ECFP_smiles
        
        evalist = [(dmatrix_val,'eval'),(dmatrix_train,'train')]
        model = class_xgb.build_model(dmatrix_train,evalist,True,300)
    else:
        dmatrix_train,train_labels = class_xgb.to_xgb_input(train_list_p38[i])
        dmatrix_val,val_labels = class_xgb.to_xgb_input(val_list_p38[i])
        evalist = [(dmatrix_val,'eval'),(dmatrix_train,'train')]
        model = class_xgb.build_model(dmatrix_train,evalist,True,300)
    
    y_pred_val = model.predict(dmatrix_val)
    if i == 0:
        eval_p38['Test'] = calculate_metrics(np.array(val_labels),y_pred_val)
    elif i == 1:
        eval_p38['Ave'] = calculate_metrics(np.array(val_labels),y_pred_val)
    elif i == 2:
        eval_p38['Random'] = calculate_metrics(np.array(val_labels),y_pred_val)

[0]	eval-auc:0.72200	train-auc:0.70592
[1]	eval-auc:0.68301	train-auc:0.74260
[2]	eval-auc:0.69628	train-auc:0.76914
[3]	eval-auc:0.69529	train-auc:0.78318
[4]	eval-auc:0.73879	train-auc:0.78736
[5]	eval-auc:0.71432	train-auc:0.80538
[6]	eval-auc:0.72304	train-auc:0.80939
[7]	eval-auc:0.72648	train-auc:0.81340
[8]	eval-auc:0.74481	train-auc:0.81878
[9]	eval-auc:0.75756	train-auc:0.82219
[10]	eval-auc:0.76321	train-auc:0.82756
[11]	eval-auc:0.77043	train-auc:0.83228
[12]	eval-auc:0.76563	train-auc:0.83384
[13]	eval-auc:0.75301	train-auc:0.83662
[14]	eval-auc:0.76732	train-auc:0.83935
[15]	eval-auc:0.77700	train-auc:0.84346
[16]	eval-auc:0.77670	train-auc:0.84503
[17]	eval-auc:0.77566	train-auc:0.84694
[18]	eval-auc:0.77779	train-auc:0.85008
[19]	eval-auc:0.78445	train-auc:0.85382
[20]	eval-auc:0.78817	train-auc:0.85549
[21]	eval-auc:0.79499	train-auc:0.85830
[22]	eval-auc:0.79734	train-auc:0.85938
[23]	eval-auc:0.79707	train-auc:0.86167
[24]	eval-auc:0.79722	train-auc:0.86485
[25]	eval-

[203]	eval-auc:0.86484	train-auc:0.92273
[204]	eval-auc:0.86407	train-auc:0.92266
[205]	eval-auc:0.86165	train-auc:0.92294
[206]	eval-auc:0.86214	train-auc:0.92298
[207]	eval-auc:0.85964	train-auc:0.92307
[208]	eval-auc:0.86035	train-auc:0.92327
[209]	eval-auc:0.86176	train-auc:0.92366
[210]	eval-auc:0.86229	train-auc:0.92377
[211]	eval-auc:0.86227	train-auc:0.92380
[212]	eval-auc:0.86188	train-auc:0.92384
[213]	eval-auc:0.86092	train-auc:0.92397
[214]	eval-auc:0.86199	train-auc:0.92414
[215]	eval-auc:0.86205	train-auc:0.92434
[216]	eval-auc:0.86132	train-auc:0.92441
[217]	eval-auc:0.86307	train-auc:0.92464
[218]	eval-auc:0.86268	train-auc:0.92482
[219]	eval-auc:0.86197	train-auc:0.92489
[220]	eval-auc:0.86332	train-auc:0.92493
[221]	eval-auc:0.86369	train-auc:0.92509
[222]	eval-auc:0.86287	train-auc:0.92524
[223]	eval-auc:0.86392	train-auc:0.92541
[224]	eval-auc:0.86310	train-auc:0.92558
[225]	eval-auc:0.86375	train-auc:0.92568
[226]	eval-auc:0.86383	train-auc:0.92548
[227]	eval-auc:0

[106]	eval-auc:0.76079	train-auc:0.90677
[107]	eval-auc:0.76519	train-auc:0.90749
[108]	eval-auc:0.76512	train-auc:0.90767
[109]	eval-auc:0.76414	train-auc:0.90805
[110]	eval-auc:0.76526	train-auc:0.90847
[111]	eval-auc:0.76811	train-auc:0.90894
[112]	eval-auc:0.76579	train-auc:0.90913
[113]	eval-auc:0.76723	train-auc:0.90949
[114]	eval-auc:0.76519	train-auc:0.90963
[115]	eval-auc:0.76480	train-auc:0.90962
[116]	eval-auc:0.76699	train-auc:0.90978
[117]	eval-auc:0.76833	train-auc:0.91007
[118]	eval-auc:0.76734	train-auc:0.91006
[119]	eval-auc:0.76794	train-auc:0.91067
[120]	eval-auc:0.76973	train-auc:0.91058
[121]	eval-auc:0.77420	train-auc:0.91065
[122]	eval-auc:0.77392	train-auc:0.91089
[123]	eval-auc:0.77491	train-auc:0.91117
[124]	eval-auc:0.77663	train-auc:0.91134
[125]	eval-auc:0.77575	train-auc:0.91180
[126]	eval-auc:0.77501	train-auc:0.91196
[127]	eval-auc:0.77537	train-auc:0.91205
[128]	eval-auc:0.77431	train-auc:0.91229
[129]	eval-auc:0.77301	train-auc:0.91247
[130]	eval-auc:0

[7]	eval-auc:0.80646	train-auc:0.82121
[8]	eval-auc:0.81900	train-auc:0.83459
[9]	eval-auc:0.82351	train-auc:0.83654
[10]	eval-auc:0.82482	train-auc:0.83981
[11]	eval-auc:0.82643	train-auc:0.84176
[12]	eval-auc:0.82788	train-auc:0.84466
[13]	eval-auc:0.82695	train-auc:0.84737
[14]	eval-auc:0.82774	train-auc:0.84840
[15]	eval-auc:0.83008	train-auc:0.85056
[16]	eval-auc:0.83167	train-auc:0.85327
[17]	eval-auc:0.83478	train-auc:0.85571
[18]	eval-auc:0.83288	train-auc:0.85668
[19]	eval-auc:0.83581	train-auc:0.85804
[20]	eval-auc:0.83766	train-auc:0.85942
[21]	eval-auc:0.83988	train-auc:0.86120
[22]	eval-auc:0.84062	train-auc:0.86214
[23]	eval-auc:0.84463	train-auc:0.86493
[24]	eval-auc:0.84365	train-auc:0.86568
[25]	eval-auc:0.84469	train-auc:0.86629
[26]	eval-auc:0.84659	train-auc:0.86744
[27]	eval-auc:0.84884	train-auc:0.86869
[28]	eval-auc:0.84886	train-auc:0.86919
[29]	eval-auc:0.85039	train-auc:0.87052
[30]	eval-auc:0.85143	train-auc:0.87095
[31]	eval-auc:0.85401	train-auc:0.87222
[32

[210]	eval-auc:0.89155	train-auc:0.92775
[211]	eval-auc:0.89127	train-auc:0.92779
[212]	eval-auc:0.89099	train-auc:0.92803
[213]	eval-auc:0.89142	train-auc:0.92814
[214]	eval-auc:0.89124	train-auc:0.92816
[215]	eval-auc:0.89142	train-auc:0.92847
[216]	eval-auc:0.89176	train-auc:0.92860
[217]	eval-auc:0.89157	train-auc:0.92843
[218]	eval-auc:0.89158	train-auc:0.92853
[219]	eval-auc:0.89170	train-auc:0.92891
[220]	eval-auc:0.89202	train-auc:0.92900
[221]	eval-auc:0.89267	train-auc:0.92898
[222]	eval-auc:0.89253	train-auc:0.92912
[223]	eval-auc:0.89339	train-auc:0.92924
[224]	eval-auc:0.89249	train-auc:0.92938
[225]	eval-auc:0.89199	train-auc:0.92960
[226]	eval-auc:0.89251	train-auc:0.92980
[227]	eval-auc:0.89286	train-auc:0.92999
[228]	eval-auc:0.89299	train-auc:0.93009
[229]	eval-auc:0.89276	train-auc:0.93029
[230]	eval-auc:0.89282	train-auc:0.93038
[231]	eval-auc:0.89261	train-auc:0.93039
[232]	eval-auc:0.89243	train-auc:0.93057
[233]	eval-auc:0.89262	train-auc:0.93088
[234]	eval-auc:0

In [17]:
eval_p38 = pd.DataFrame(eval_p38).T
eval_p38.to_csv('../../../../Desktop/binding/thesis english/Results/2-XGBoost/p38.csv')
eval_p38

Unnamed: 0,roc_auc,tn,fp,fn,tp,map,precision,recall,accuracy
Test,0.869295,315.0,45.0,66.0,111.0,0.739396,0.711538,0.627119,0.793296
Ave,0.78459,233.0,30.0,72.0,36.0,0.55555,0.545455,0.333333,0.725067
Random,0.898743,327.0,36.0,63.0,138.0,0.836514,0.793103,0.686567,0.824468


In [19]:
eval_akt1 = {}
for i in range(len(train_list_akt1)):
    if i == 1:
        smi = train_list_akt1[i].rdkit
        mols = [Chem.MolFromSmiles(smi) for smi in smi]
        ECFP = [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) for x in mols]
        a = np.array(ECFP)
        ECFP_smiles = a.astype(np.float32)
        train_labels = Y_train_akt1
        dmatrix_train = xgb.DMatrix(data = ECFP_smiles,label = train_labels)
        
        smi = val_list_akt1[i].rdkit
        mols = [Chem.MolFromSmiles(smi) for smi in smi]
        ECFP = [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) for x in mols]
        a = np.array(ECFP)
        ECFP_smiles = a.astype(np.float32)
        val_labels = Y_val_akt1
        dmatrix_val = xgb.DMatrix(data = ECFP_smiles,label = val_labels)
        del smi,mols,ECFP,a,ECFP_smiles
        
        evalist = [(dmatrix_val,'eval'),(dmatrix_train,'train')]
        model = class_xgb.build_model(dmatrix_train,evalist,True,300)
    else:
        dmatrix_train,train_labels = class_xgb.to_xgb_input(train_list_akt1[i])
        dmatrix_val,val_labels = class_xgb.to_xgb_input(val_list_akt1[i])
        evalist = [(dmatrix_val,'eval'),(dmatrix_train,'train')]
        model = class_xgb.build_model(dmatrix_train,evalist,True,300)
    
    y_pred_val = model.predict(dmatrix_val)
    if i == 0:
        eval_akt1['Test'] = calculate_metrics(np.array(val_labels),y_pred_val)
    elif i == 1:
        eval_akt1['Random'] = calculate_metrics(np.array(val_labels),y_pred_val)

[0]	eval-auc:0.67759	train-auc:0.69365
[1]	eval-auc:0.71901	train-auc:0.73987
[2]	eval-auc:0.75572	train-auc:0.78521
[3]	eval-auc:0.78253	train-auc:0.80836
[4]	eval-auc:0.78916	train-auc:0.81246
[5]	eval-auc:0.79537	train-auc:0.81803
[6]	eval-auc:0.79456	train-auc:0.82755
[7]	eval-auc:0.81010	train-auc:0.83523
[8]	eval-auc:0.80760	train-auc:0.83491
[9]	eval-auc:0.81445	train-auc:0.84006
[10]	eval-auc:0.82182	train-auc:0.84369
[11]	eval-auc:0.82740	train-auc:0.84862
[12]	eval-auc:0.82874	train-auc:0.85088
[13]	eval-auc:0.83487	train-auc:0.85358
[14]	eval-auc:0.83439	train-auc:0.85654
[15]	eval-auc:0.83587	train-auc:0.86127
[16]	eval-auc:0.83800	train-auc:0.86342
[17]	eval-auc:0.83927	train-auc:0.86329
[18]	eval-auc:0.84213	train-auc:0.86728
[19]	eval-auc:0.84192	train-auc:0.86889
[20]	eval-auc:0.84106	train-auc:0.87017
[21]	eval-auc:0.84483	train-auc:0.87146
[22]	eval-auc:0.84569	train-auc:0.87176
[23]	eval-auc:0.84508	train-auc:0.87222
[24]	eval-auc:0.84598	train-auc:0.87198
[25]	eval-

[203]	eval-auc:0.86550	train-auc:0.91558
[204]	eval-auc:0.86436	train-auc:0.91556
[205]	eval-auc:0.86445	train-auc:0.91563
[206]	eval-auc:0.86518	train-auc:0.91573
[207]	eval-auc:0.86640	train-auc:0.91563
[208]	eval-auc:0.86649	train-auc:0.91553
[209]	eval-auc:0.86649	train-auc:0.91555
[210]	eval-auc:0.86572	train-auc:0.91541
[211]	eval-auc:0.86550	train-auc:0.91552
[212]	eval-auc:0.86595	train-auc:0.91569
[213]	eval-auc:0.86581	train-auc:0.91565
[214]	eval-auc:0.86572	train-auc:0.91566
[215]	eval-auc:0.86522	train-auc:0.91569
[216]	eval-auc:0.86604	train-auc:0.91573
[217]	eval-auc:0.86699	train-auc:0.91590
[218]	eval-auc:0.86790	train-auc:0.91589
[219]	eval-auc:0.86790	train-auc:0.91586
[220]	eval-auc:0.86844	train-auc:0.91589
[221]	eval-auc:0.86885	train-auc:0.91575
[222]	eval-auc:0.86881	train-auc:0.91582
[223]	eval-auc:0.86894	train-auc:0.91588
[224]	eval-auc:0.86899	train-auc:0.91595
[225]	eval-auc:0.86887	train-auc:0.91616
[226]	eval-auc:0.86960	train-auc:0.91628
[227]	eval-auc:0

[106]	eval-auc:0.87628	train-auc:0.90470
[107]	eval-auc:0.87579	train-auc:0.90503
[108]	eval-auc:0.87632	train-auc:0.90502
[109]	eval-auc:0.87567	train-auc:0.90502
[110]	eval-auc:0.87648	train-auc:0.90523
[111]	eval-auc:0.87612	train-auc:0.90540
[112]	eval-auc:0.87636	train-auc:0.90535
[113]	eval-auc:0.87692	train-auc:0.90551
[114]	eval-auc:0.87712	train-auc:0.90554
[115]	eval-auc:0.87845	train-auc:0.90593
[116]	eval-auc:0.87777	train-auc:0.90592
[117]	eval-auc:0.87801	train-auc:0.90642
[118]	eval-auc:0.87785	train-auc:0.90668
[119]	eval-auc:0.87753	train-auc:0.90677
[120]	eval-auc:0.87733	train-auc:0.90694
[121]	eval-auc:0.87801	train-auc:0.90714
[122]	eval-auc:0.87704	train-auc:0.90727
[123]	eval-auc:0.87777	train-auc:0.90765
[124]	eval-auc:0.87785	train-auc:0.90778
[125]	eval-auc:0.87708	train-auc:0.90787
[126]	eval-auc:0.87729	train-auc:0.90783
[127]	eval-auc:0.87680	train-auc:0.90784
[128]	eval-auc:0.87660	train-auc:0.90810
[129]	eval-auc:0.87600	train-auc:0.90812
[130]	eval-auc:0

In [20]:
eval_akt1 = pd.DataFrame(eval_akt1).T
eval_akt1.to_csv('../../../../Desktop/binding/thesis english/Results/2-XGBoost/akt1.csv')
eval_akt1

Unnamed: 0,roc_auc,tn,fp,fn,tp,map,precision,recall,accuracy
Test,0.872686,168.0,22.0,40.0,76.0,0.812078,0.77551,0.655172,0.797386
Random,0.880387,163.0,28.0,34.0,96.0,0.86012,0.774194,0.738462,0.806854


In [21]:
eval_pi3k = {}
for i in range(len(train_list_pi3k)):
    if i == 1:
        smi = train_list_pi3k[i].rdkit
        mols = [Chem.MolFromSmiles(smi) for smi in smi]
        ECFP = [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) for x in mols]
        a = np.array(ECFP)
        ECFP_smiles = a.astype(np.float32)
        train_labels = Y_train_pi3k
        dmatrix_train = xgb.DMatrix(data = ECFP_smiles,label = train_labels)
        
        smi = val_list_pi3k[i].rdkit
        mols = [Chem.MolFromSmiles(smi) for smi in smi]
        ECFP = [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) for x in mols]
        a = np.array(ECFP)
        ECFP_smiles = a.astype(np.float32)
        val_labels = Y_val_pi3k
        dmatrix_val = xgb.DMatrix(data = ECFP_smiles,label = val_labels)
        del smi,mols,ECFP,a,ECFP_smiles
        
        evalist = [(dmatrix_val,'eval'),(dmatrix_train,'train')]
        model = class_xgb.build_model(dmatrix_train,evalist,True,300)
    else:
        dmatrix_train,train_labels = class_xgb.to_xgb_input(train_list_pi3k[i])
        dmatrix_val,val_labels = class_xgb.to_xgb_input(val_list_pi3k[i])
        evalist = [(dmatrix_val,'eval'),(dmatrix_train,'train')]
        model = class_xgb.build_model(dmatrix_train,evalist,True,300)
    
    y_pred_val = model.predict(dmatrix_val)
    if i == 0:
        eval_pi3k['Test'] = calculate_metrics(np.array(val_labels),y_pred_val)
    elif i == 1:
        eval_pi3k['Random'] = calculate_metrics(np.array(val_labels),y_pred_val)

[0]	eval-auc:0.72200	train-auc:0.70592
[1]	eval-auc:0.68301	train-auc:0.74260
[2]	eval-auc:0.69628	train-auc:0.76914
[3]	eval-auc:0.69529	train-auc:0.78318
[4]	eval-auc:0.73879	train-auc:0.78736
[5]	eval-auc:0.71432	train-auc:0.80538
[6]	eval-auc:0.72304	train-auc:0.80939
[7]	eval-auc:0.72648	train-auc:0.81340
[8]	eval-auc:0.74481	train-auc:0.81878
[9]	eval-auc:0.75756	train-auc:0.82219
[10]	eval-auc:0.76321	train-auc:0.82756
[11]	eval-auc:0.77043	train-auc:0.83228
[12]	eval-auc:0.76563	train-auc:0.83384
[13]	eval-auc:0.75301	train-auc:0.83662
[14]	eval-auc:0.76732	train-auc:0.83935
[15]	eval-auc:0.77700	train-auc:0.84346
[16]	eval-auc:0.77670	train-auc:0.84503
[17]	eval-auc:0.77566	train-auc:0.84694
[18]	eval-auc:0.77779	train-auc:0.85008
[19]	eval-auc:0.78445	train-auc:0.85382
[20]	eval-auc:0.78817	train-auc:0.85549
[21]	eval-auc:0.79499	train-auc:0.85830
[22]	eval-auc:0.79734	train-auc:0.85938
[23]	eval-auc:0.79707	train-auc:0.86167
[24]	eval-auc:0.79722	train-auc:0.86485
[25]	eval-

[203]	eval-auc:0.86484	train-auc:0.92273
[204]	eval-auc:0.86407	train-auc:0.92266
[205]	eval-auc:0.86165	train-auc:0.92294
[206]	eval-auc:0.86214	train-auc:0.92298
[207]	eval-auc:0.85964	train-auc:0.92307
[208]	eval-auc:0.86035	train-auc:0.92327
[209]	eval-auc:0.86176	train-auc:0.92366
[210]	eval-auc:0.86229	train-auc:0.92377
[211]	eval-auc:0.86227	train-auc:0.92380
[212]	eval-auc:0.86188	train-auc:0.92384
[213]	eval-auc:0.86092	train-auc:0.92397
[214]	eval-auc:0.86199	train-auc:0.92414
[215]	eval-auc:0.86205	train-auc:0.92434
[216]	eval-auc:0.86132	train-auc:0.92441
[217]	eval-auc:0.86307	train-auc:0.92464
[218]	eval-auc:0.86268	train-auc:0.92482
[219]	eval-auc:0.86197	train-auc:0.92489
[220]	eval-auc:0.86332	train-auc:0.92493
[221]	eval-auc:0.86369	train-auc:0.92509
[222]	eval-auc:0.86287	train-auc:0.92524
[223]	eval-auc:0.86392	train-auc:0.92541
[224]	eval-auc:0.86310	train-auc:0.92558
[225]	eval-auc:0.86375	train-auc:0.92568
[226]	eval-auc:0.86383	train-auc:0.92548
[227]	eval-auc:0

[106]	eval-auc:0.87724	train-auc:0.90869
[107]	eval-auc:0.87723	train-auc:0.90856
[108]	eval-auc:0.87717	train-auc:0.90866
[109]	eval-auc:0.87713	train-auc:0.90901
[110]	eval-auc:0.87713	train-auc:0.90933
[111]	eval-auc:0.87723	train-auc:0.90949
[112]	eval-auc:0.87796	train-auc:0.90961
[113]	eval-auc:0.87913	train-auc:0.91026
[114]	eval-auc:0.87993	train-auc:0.91056
[115]	eval-auc:0.88050	train-auc:0.91077
[116]	eval-auc:0.88037	train-auc:0.91091
[117]	eval-auc:0.88066	train-auc:0.91124
[118]	eval-auc:0.88077	train-auc:0.91174
[119]	eval-auc:0.88125	train-auc:0.91204
[120]	eval-auc:0.88188	train-auc:0.91237
[121]	eval-auc:0.88198	train-auc:0.91252
[122]	eval-auc:0.88317	train-auc:0.91272
[123]	eval-auc:0.88284	train-auc:0.91282
[124]	eval-auc:0.88321	train-auc:0.91313
[125]	eval-auc:0.88310	train-auc:0.91329
[126]	eval-auc:0.88337	train-auc:0.91342
[127]	eval-auc:0.88354	train-auc:0.91360
[128]	eval-auc:0.88378	train-auc:0.91388
[129]	eval-auc:0.88358	train-auc:0.91397
[130]	eval-auc:0

In [22]:
eval_pi3k = pd.DataFrame(eval_pi3k).T
eval_pi3k.to_csv('../../../../Desktop/binding/thesis english/Results/2-XGBoost/pi3k.csv')
eval_pi3k

Unnamed: 0,roc_auc,tn,fp,fn,tp,map,precision,recall,accuracy
Test,0.869295,315.0,45.0,66.0,111.0,0.739396,0.711538,0.627119,0.793296
Random,0.900694,334.0,43.0,49.0,138.0,0.828438,0.762431,0.737968,0.836879
