In [1]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, roc_auc_score, precision_recall_curve, classification_report, matthews_corrcoef, confusion_matrix

from xgboost import XGBClassifier

# import packages for hyperparameters tuning
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [2]:
model_for_testing = 'protTrans'

In [3]:
df = pd.read_csv('./Datasets/My_Dataset/proteins_embeddings_' + model_for_testing + '_annotated.csv', index_col=0)

# convert residues to one-hot encode
df_res = pd.get_dummies(df['residue_1l'])

df = df.merge(df_res, left_index=True, right_index=True, how='inner')
df.insert(len(df.columns)-1, 'is_IBS', df.pop('is_IBS'))
df.head(5)

Unnamed: 0_level_0,uniprot_id,residue_1l,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,N,P,Q,R,S,T,V,W,Y,is_IBS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,O01761,M,0.038092,-0.25762,0.053704,0.242058,0.110389,0.01671,-0.446441,-0.020999,...,0,0,0,0,0,0,0,0,0,0
1,O01761,A,-0.126368,-0.26676,0.016587,0.087955,-0.065079,0.004339,-0.374669,-0.106561,...,0,0,0,0,0,0,0,0,0,0
2,O01761,S,-0.141514,-0.019615,0.152169,0.392465,-0.030234,-0.063051,-0.462616,-0.187547,...,0,0,0,0,1,0,0,0,0,0
3,O01761,R,-0.20435,-0.102457,-0.04608,0.26017,-0.01623,-0.096556,-0.416828,-0.525503,...,0,0,0,1,0,0,0,0,0,0
4,O01761,R,-0.095794,-0.068736,-0.037263,0.224324,0.040529,0.009453,-0.429543,-0.505712,...,0,0,0,1,0,0,0,0,0,0


In [4]:
f = open('./Datasets/My_Dataset/split_proteins.json')
dict_proteins_split = json.load(f)
f.close()

df_train = df[df.uniprot_id.isin(dict_proteins_split['train'])]
df_test = df[df.uniprot_id.isin(dict_proteins_split['test'])]
df_val = df[df.uniprot_id.isin(dict_proteins_split['val'])]

In [5]:
X_train, y_train = df_train.drop(['uniprot_id', 'residue_1l', 'is_IBS'], axis=1, inplace=False), df_train['is_IBS']
X_test, y_test = df_test.drop(['uniprot_id', 'residue_1l', 'is_IBS'], axis=1, inplace=False), df_test['is_IBS']
X_val, y_val = df_val.drop(['uniprot_id', 'residue_1l', 'is_IBS'], axis=1, inplace=False), df_val['is_IBS']

In [6]:
del df
del df_res

### Fine-tune XGBoost

In [7]:
clf = XGBClassifier(objective='binary:logistic', random_state=0, n_jobs=7)


xgb_reg_params = {'learning_rate': Real(0.01, 1.0, 'uniform'),
                 'max_depth': Integer(1, 30),
                 'subsample': Real(0.1, 1.0, 'uniform'),
                 'colsample_bytree': Real(0.1, 1.0, 'uniform'), # subsample ratio of columns by tree
                 'reg_lambda': Real(1e-9, 100., 'uniform'), # L2 regularization
                 'reg_alpha': Real(1e-9, 100., 'uniform'), # L1 regularization
                 'n_estimators': Integer(50, 5000),
                 'min_child_weight': Integer(0, 20),
                 'gamma': Real(1e-9, 1.0, 'uniform'),
                 'scale_pos_weight': Real(1e-6, 100,'log-uniform')
}

xgb_bs = BayesSearchCV(estimator = clf, search_spaces = xgb_reg_params, cv=5, random_state=42, verbose=1,  n_jobs = 6, n_iter = 10)
#Best
results_bs_xgb = xgb_bs.fit(X_train, y_train)
best_model = xgb_bs.best_estimator_

print(results_bs_xgb.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


protTrans
('colsample_bytree', 0.7606252161870115), ('gamma', 0.939369737663402), ('learning_rate', 0.17197110685216477), ('max_depth', 6), ('min_child_weight', 16), ('n_estimators', 1894), ('reg_alpha', 45.90245141562154), ('reg_lambda', 53.47651601257879), ('scale_pos_weight', 1.9154529871828567), ('subsample', 0.6016844832739164)


In [10]:
y_train.value_counts()[0]/y_train.value_counts()[1]

34.76696342637151

In [15]:
params = {'objective': 'binary:logistic' ,'colsample_bytree': 0.7606252161870115, 'gamma': 0.939369737663402, 'learning_rate': 0.17197110685216477, 'max_depth': 6, 'min_child_weight': 16,
 'n_estimators': 1894, 'reg_alpha': 45.90245141562154, 'reg_lambda': 53.47651601257879, 'subsample': 0.6016844832739164, 'random_state': 0, 
 'n_jobs': 7, 'scale_pos_weight': 1.9154529871828567}

clf = XGBClassifier(**params)

# fit to model
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [12]:
ypred = clf.predict(X_test)

print('F1 score: %.3f ' % f1_score(y_test, ypred))
print('MCC: %.3f ' % matthews_corrcoef(y_test, ypred))
print('Balanced accuracy: %.3f ' % balanced_accuracy_score(y_test, ypred))
print('ROC : %.3f ' % roc_auc_score(y_test, ypred))

cm = confusion_matrix(y_test, ypred)
print(cm)
cm = pd.DataFrame(cm , index = ['0','1'] , columns = ['0','1'])
print(cm)

F1 score: 0.655 
MCC: 0.612 
Balanced accuracy: 0.803 
ROC : 0.803 
[[6628  284]
 [ 305  559]]
      0    1
0  6628  284
1   305  559


protTrans <br />
F1 score: 0.647 <br />
MCC: 0.605 <br />
Balanced accuracy: 0.793 <br />
ROC : 0.793 <br />
      0    1
0  6646  266
1   324  540
<br />

In [13]:
# save in JSON format
clf.save_model('xgboost_' + model_for_testing + '_3.json')

### Load Model

In [12]:
model = XGBClassifier()
model.load_model('./models/xgboost_' + model_for_testing + '.json')

ypred = model.predict(X_test)

print('F1 score: %.3f ' % f1_score(y_test, ypred))
print('MCC: %.3f ' % matthews_corrcoef(y_test, ypred))
print('Balanced accuracy: %.3f ' % balanced_accuracy_score(y_test, ypred))
print('Accuracy: %.3f ' % accuracy_score(y_test, ypred))
print('ROC : %.3f ' % roc_auc_score(y_test, ypred))

cm = confusion_matrix(y_test, ypred)
cm = pd.DataFrame(cm , index = ['0','1'] , columns = ['0','1'])

F1 score: 0.647 
MCC: 0.605 
Balanced accuracy: 0.793 
Accuracy: 0.924 
ROC : 0.793 
