In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import roc_curve, auc, accuracy_score, average_precision_score, f1_score

import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Read the protein table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='proteins_dataframe.csv')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0.1,Unnamed: 0,pdb_id,chain_id,sequence,residue,residue_number,res_index,pdb_res_index,hydrophobicity,hydrophilicity,is_charged,is_tm_segment
0,0,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,D,3,0,5,-0.90,3.0,1,0
1,1,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,L,10,1,6,1.06,-1.8,0,0
2,2,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,L,10,2,7,1.06,-1.8,0,0
3,3,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,N,12,3,8,-0.78,0.2,0,0
4,4,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,D,3,4,9,-0.90,3.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
161310,161310,7b0o,A,LYFQGHMDRLITLVVSYSIAFSIFALATMAVVYGKWLYYFEIDFLN...,V,18,211,206,1.08,-1.5,0,0
161311,161311,7b0o,A,LYFQGHMDRLITLVVSYSIAFSIFALATMAVVYGKWLYYFEIDFLN...,K,9,212,207,-1.50,3.0,1,0
161312,161312,7b0o,A,LYFQGHMDRLITLVVSYSIAFSIFALATMAVVYGKWLYYFEIDFLN...,K,9,213,208,-1.50,3.0,1,0
161313,161313,7b0o,A,LYFQGHMDRLITLVVSYSIAFSIFALATMAVVYGKWLYYFEIDFLN...,K,9,214,209,-1.50,3.0,1,0


In [5]:
# Columns of `df` that are fed to the models as input features.
feature_columns = ['residue_number', 'res_index', 'hydrophobicity', 'hydrophilicity', 'is_charged']
X = df.loc[:, feature_columns]
# Label column the classifiers below are trained to predict.
y = df.loc[:, 'is_tm_segment']

In [8]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split identical every time this cell is executed.
# NOTE(review): rows are split individually at random, so residues from the
# same pdb_id/chain can end up in both train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=7,
)

In [13]:
# Wrap each split in an xgboost DMatrix for use with the native xgb.train API.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [14]:
# Booster settings handed to xgb.train.
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': 'auc',
}
# (DMatrix, display name) pairs used as evaluation sets during training.
evallist = [
    (dtest, 'eval'),
    (dtrain, 'train'),
]

In [None]:
# Number of boosting rounds for the native-API booster.
num_round = 200
# Train on dtrain with the settings in `param`; `evallist` supplies the
# evaluation sets.
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=evallist,
)

In [None]:
# Fit a default-configured XGBClassifier on the training split.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)

In [21]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)
# Round to the nearest integer label in one vectorised NumPy call rather than
# a Python-level loop over every element. np.rint uses the same
# round-half-to-even rule as the built-in round(), so the values are unchanged.
predictions = np.rint(y_pred).astype(int)

In [23]:

# Score the test-set predictions: fraction of labels predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
# Report the score as a percentage with two decimals.
accuracy_percent = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_percent)

Accuracy: 64.82%


In [None]:
# Nested cross-validation: an inner grid search tunes the hyper-parameters and
# an outer loop scores the tuned search on folds it was not fitted on.
# NOTE(review): 45 grid points x 3 inner folds x 10 outer folds (plus refits),
# with up to 1000 trees each — expect this cell to run for a long time.
model = XGBClassifier(objective="binary:logistic")

# Hyper-parameter grid explored by the inner search.
param_grid = {
  "max_depth": [1, 3, 5, 8, 10],
  "n_estimators": [10, 500, 1000],
  "learning_rate": [0.01, 0.1, 0.2],
}

# Metric that selects the best grid point in the inner search, and the
# metrics reported for every outer fold. Previously these were defined but
# ignored because both calls hard-coded scoring='accuracy'.
inner_metric = 'roc_auc'
outer_metrics = ['roc_auc', 'average_precision', 'f1']

# Seed the shuffled splitters so the folds (and therefore the scores) are
# reproducible across runs; 7 matches the seed used for the train/test split.
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)
cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# define search: refit=True retrains the best configuration on the full
# inner-training data so each fitted search can be used for outer scoring.
xboost_model = GridSearchCV(model, param_grid, scoring=inner_metric, n_jobs=1, cv=cv_inner, refit=True)

# execute the nested cross-validation; with a list of metrics the result has
# one 'test_<metric>' entry per metric, and return_estimator=True keeps the
# fitted search from every outer fold under 'estimator'.
scores = cross_validate(xboost_model, X, y, scoring=outer_metrics, cv=cv_outer, n_jobs=-1, return_estimator=True)

scores

In [None]:
# cross_validate fits clones of the search object, so `xboost_model` itself is
# never fitted and has no `best_params_` attribute. Read the selected
# hyper-parameters from the fitted searches returned for each outer fold
# (available because cross_validate was called with return_estimator=True).
[fitted_search.best_params_ for fitted_search in scores['estimator']]