In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import SVC
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the feature table; the first CSV column becomes the DataFrame index.
df = pd.read_csv('final_proteins_dataframe.csv', index_col=0)

# Features: columns at positions 4..42 with the two residue-index columns removed.
# Target: the `tm_segment` column.
X = df.iloc[:, 4:43].drop(columns=['pdb_res_index', 'res_index'])
y = df['tm_segment']

# Ordered hold-out split: with shuffle=False the rows keep their original order,
# so the first 80% become the training set and the last 20% the test set.
# random_state only controls shuffling, so it has no effect on this split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

In [3]:
# Scoring for the nested cross-validation:
#   - inner_metric drives the hyper-parameter search (inner loop),
#   - outer_metrics are reported for the performance estimate (outer loop).
inner_metric = 'accuracy'
outer_metrics = ['roc_auc', 'accuracy', 'f1']

# Seed for the inner splitter (same value as the one used for the hold-out split).
SEED = 42

# Inner loop: an integer random_state makes every call to split() return the
# same folds. The search calls split() repeatedly, so without a fixed seed
# different hyper-parameter candidates would be scored on different partitions
# and their scores would not be directly comparable.
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

# Outer loop: intentionally left unseeded so that each repeated round of nested
# cross-validation draws a fresh partition of the data. For fully reproducible
# runs, build a new splitter per round with a round-dependent integer seed.
cv_outer = StratifiedKFold(n_splits=10, shuffle=True)

In [None]:
# Nested cross-validation for a random forest tuned with Bayesian optimisation.

# Base estimator handed to the search
rf = RandomForestClassifier()

# Hyper-parameter search space
rf_params = {
    "n_estimators": Integer(100, 2000),
    "max_depth": Integer(1, 100),
    "min_samples_split": Integer(2, 10),
    "min_samples_leaf": Integer(1, 4),
    "bootstrap": [True, False],
}

# Accumulators, one entry per round
outputs_rf = {}        # round index -> DataFrame with the outer-fold scores
params_rf = []         # best hyper-parameters found on the full data
scores_list_rf = []    # best inner-CV score found on the full data
results_rf = []        # full cv_results_ table of the search
rounds = 2

for i in range(rounds):
    # Inner loop: Bayesian search scored with `inner_metric` on `cv_inner`
    rf_model = BayesSearchCV(
        estimator=rf,
        search_spaces=rf_params,
        scoring=inner_metric,
        cv=cv_inner,
        n_iter=30,
        refit=True,
        n_jobs=2,
        verbose=1,
    )

    # Outer loop: the whole search is re-run inside every outer fold
    scores = cross_validate(
        rf_model,
        X,
        y,
        scoring=outer_metrics,
        cv=cv_outer,
        n_jobs=2,
        verbose=1,
    )
    output = pd.DataFrame(scores)
    outputs_rf[i] = output

    # Run the search once more on all the data to obtain the best configuration
    rf_model.fit(X, y)

    param = rf_model.best_params_
    print("\tBest parameters:", param)
    params_rf.append(param)

    score = rf_model.best_score_
    print("\tBest score:", score)
    scores_list_rf.append(score)

    result = pd.DataFrame(rf_model.cv_results_)
    results_rf.append(result)

In [None]:
# Nested cross-validation for a support vector classifier tuned with Bayesian optimisation.

# Base estimator handed to the search
svc = SVC()

# Hyper-parameter search space
param_grid = {
    'C': Real(1e-6, 1e+6, prior='log-uniform'),
    'gamma': Real(1e-6, 0.5, prior='log-uniform'),
    'degree': Integer(1, 8),
    'kernel': Categorical(['linear', 'poly', 'rbf']),
}

# Accumulators, one entry per round
outputs_svc = {}        # round index -> DataFrame with the outer-fold scores
params_svc = []         # best hyper-parameters found on the full data
scores_list_svc = []    # best inner-CV score found on the full data
results_svc = []        # full cv_results_ table of the search
rounds = 2

for i in range(rounds):
    # Inner loop: Bayesian search scored with `inner_metric` on `cv_inner`
    svc_model = BayesSearchCV(
        estimator=svc,
        search_spaces=param_grid,
        scoring=inner_metric,
        cv=cv_inner,
        n_iter=30,
        refit=True,
        n_jobs=-1,
        verbose=1,
    )

    # Outer loop: the whole search is re-run inside every outer fold
    scores = cross_validate(
        svc_model,
        X,
        y,
        scoring=outer_metrics,
        cv=cv_outer,
        n_jobs=-1,
        verbose=1,
    )
    output = pd.DataFrame(scores)
    outputs_svc[i] = output

    # Run the search once more on all the data to obtain the best configuration
    svc_model.fit(X, y)

    param = svc_model.best_params_
    print("\tBest parameters:", param)
    params_svc.append(param)

    score = svc_model.best_score_
    print("\tBest score:", score)
    scores_list_svc.append(score)

    result = pd.DataFrame(svc_model.cv_results_)
    results_svc.append(result)

In [None]:
# Nested cross-validation for an XGBoost classifier tuned with Bayesian optimisation.

# Base estimator handed to the search
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Hyper-parameter search space (tuples are interpreted by skopt as dimensions)
param_grid = {
    "max_depth": (1, 20),
    "n_estimators": (10, 2000),
    "learning_rate": (0.01, 0.3),
    'subsample': (0.01, 1.0, 'uniform'),
    'gamma': Real(1e-5, 0.5, 'log-uniform'),
    'scale_pos_weight': (1, 5),
}

# Accumulators, one entry per round
outputs_xgb = {}        # round index -> DataFrame with the outer-fold scores
params_xgb = []         # best hyper-parameters found on the full data
scores_list_xgb = []    # best inner-CV score found on the full data
results_xgb = []        # full cv_results_ table of the search
rounds = 2

for i in range(rounds):
    # Inner loop: Bayesian search scored with `inner_metric` on `cv_inner`
    xgb_model = BayesSearchCV(
        estimator=xgb_cl,
        search_spaces=param_grid,
        scoring=inner_metric,
        cv=cv_inner,
        n_iter=30,
        refit=True,
        n_jobs=-1,
        verbose=1,
    )

    # Outer loop: the whole search is re-run inside every outer fold;
    # error_score='raise' makes a failing fit abort instead of being scored as NaN
    scores = cross_validate(
        xgb_model,
        X,
        y,
        scoring=outer_metrics,
        cv=cv_outer,
        n_jobs=-1,
        verbose=1,
        error_score='raise',
    )
    output = pd.DataFrame(scores)
    outputs_xgb[i] = output

    # Run the search once more on all the data to obtain the best configuration
    xgb_model.fit(X, y)

    param = xgb_model.best_params_
    print("\tBest parameters:", param)
    params_xgb.append(param)

    score = xgb_model.best_score_
    print("\tBest score:", score)
    scores_list_xgb.append(score)

    result = pd.DataFrame(xgb_model.cv_results_)
    results_xgb.append(result)

In [None]:
# Nested cross-validation for a LightGBM classifier tuned with Bayesian optimisation.

# Base estimator handed to the search
lgbm_cl = lgb.LGBMClassifier(objective='binary')

# Hyper-parameter search space (tuples are interpreted by skopt as dimensions).
# NOTE(review): the LightGBM parameter documentation lists `n_estimators` as an
# alias of `num_iterations`, so these two dimensions appear to control the same
# setting - confirm which one is intended and drop the other.
param_grid = {
    'learning_rate': (0.01, 1),
    'max_depth': (5, 25),
    'num_leaves': (20, 100),
    'num_iterations': (100, 2000),
    'n_estimators': (50, 1000),
    'reg_alpha': (0.01, 3),
    'reg_lambda': (0.01, 3),
    'bagging_fraction': (0.01, 1),
    'bagging_freq': (0, 1),
    'colsample_bytree': (0.01, 1),
    'min_split_gain': (0.001, 0.1),
    'min_child_weight': (5, 50)
}

# Accumulators, one entry per round
outputs_lgbm = {}        # round index -> DataFrame with the outer-fold scores
params_lgbm = []         # best hyper-parameters found on the full data
scores_list_lgbm = []    # best inner-CV score found on the full data
results_lgbm = []        # full cv_results_ table of the search
rounds = 2

for i in range(rounds):
    # Inner loop: Bayesian search scored with `inner_metric` on `cv_inner`
    lgbm_model = BayesSearchCV(estimator = lgbm_cl, scoring=inner_metric, search_spaces = param_grid, cv = cv_inner, verbose=1,  n_jobs = -1, refit=True, n_iter=30)

    # Outer loop: execute the nested cross-validation on the full data set.
    # This previously used X_test / y_test (the 20% hold-out only), unlike the
    # other model cells and unlike the final fit below, so the LightGBM scores
    # were estimated on a different, much smaller sample.
    scores = cross_validate(lgbm_model, X, y, scoring=outer_metrics, cv=cv_outer, n_jobs=-1, verbose=1, error_score='raise')

    # report performance
    output = pd.DataFrame(scores)
    outputs_lgbm[i] = output

    # Run the search once more on all the data to obtain the best configuration
    lgbm_model.fit(X, y)
    param = lgbm_model.best_params_
    print("\tBest parameters:", param)
    params_lgbm.append(param)

    score = lgbm_model.best_score_
    print("\tBest score:", score)
    scores_list_lgbm.append(score)

    result = pd.DataFrame(lgbm_model.cv_results_)
    results_lgbm.append(result)

In [None]:
## LGBM Grid search
# Exhaustive grid search over LightGBM hyper-parameters on the training split.
model = lgb.LGBMClassifier(objective="binary")

# 3*2*3*3*3*3*2*3*1 = 2916 combinations; with cv=5 this is 14,580 fits.
param_grid = {
    'n_estimators': [400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}

# define search (the name `rf_grid` is historical; the estimator is LightGBM)
rf_grid = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs=-1, cv=5, verbose=2)

# Fit on the training split without the protein identifier column.
# errors='ignore' keeps this cell working whether or not `pdb_id` is among the
# selected feature columns; the other cells fit on X_train unchanged, so a
# plain drop would raise KeyError if the column is absent.
# NOTE(review): confirm whether `pdb_id` is actually part of X.
rf_grid.fit(X_train.drop(columns=['pdb_id'], errors='ignore'), y_train)
print("Best parameters: ", rf_grid.best_params_)
print("Best score: ", rf_grid.best_score_)

### Other: baseline models with default hyper-parameters, evaluated on the hold-out split

In [6]:
# Baseline: XGBoost with default hyper-parameters, trained on the training split
model = XGBClassifier(objective="binary:logistic")
model.fit(X_train, y_train)

# Predicted labels for the hold-out split, converted to a plain list of rounded values
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Plain accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

# Accuracy averaged over classes
accuracy_bl = balanced_accuracy_score(y_test, predictions)
print("Balanced accuracy: %.2f%%" % (100.0 * accuracy_bl))

# ROC AUC computed from the second predict_proba column
roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (100.0 * roc_score))

# F1 score
f1 = f1_score(y_test, predictions)
print("F1 score: %.2f%%" % (100.0 * f1))

Accuracy: 77.24%
Balanced accuracy: 76.98%
Roc: 84.04%
F1 score: 74.97%


In [3]:
# Baseline: linear-kernel SVC trained on the training split.
# probability=True is required for the predict_proba call used for ROC AUC below.
# Per the scikit-learn docs, `gamma` is a coefficient for the rbf/poly/sigmoid kernels.
svm = SVC(gamma='auto', probability=True, kernel="linear")
svm.fit(X_train, y_train)

# Predicted labels for the hold-out split
ypred = svm.predict(X_test)

# Plain accuracy (label fixed: it was misspelled "Aaccuracy")
accuracy = accuracy_score(y_test.values, ypred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Accuracy averaged over classes
accuracy_bl = balanced_accuracy_score(y_test.values, ypred)
print("Balanced accuracy: %.2f%%" % (accuracy_bl * 100.0))

# ROC AUC computed from the second predict_proba column
roc_score = roc_auc_score(y_test.values, svm.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (roc_score* 100.0))

# F1 score
f1 = f1_score(y_test.values, ypred)
print("F1 score: %.2f%%" % (f1* 100.0))

Aaccuracy: 78.06%
Balanced accuracy: 77.83%
Roc: 85.62%
F1 score: 76.05%


In [4]:
# Baseline: LightGBM with default hyper-parameters, trained on the training split
model = lgb.LGBMClassifier(objective='binary')

# The former `verbose=20` argument was dropped: LightGBM 4.0 removed it from
# fit(), so the call raised TypeError on current releases.
# NOTE(review): no eval_set is passed, so the metrics listed in eval_metric
# presumably have nothing to be evaluated on - confirm, and add an eval_set
# (plus a logging callback) if per-iteration evaluation is wanted.
model.fit(X_train, y_train, eval_metric=['auc', 'logloss', 'average_precision'])

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# Plain accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Accuracy averaged over classes
accuracy_bl = balanced_accuracy_score(y_test, predictions)
print("Balanced accuracy: %.2f%%" % (accuracy_bl * 100.0))

# ROC AUC computed from the second predict_proba column
roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (roc_score* 100.0))

# F1 score
f1 = f1_score(y_test, predictions)
print("F1 score: %.2f%%" % (f1* 100.0))

Accuracy: 77.59%
Balanced accuracy: 77.29%
Roc: 84.97%
F1 score: 75.19%


In [5]:
# Baseline: random forest with default hyper-parameters, trained on the training split.
# random_state is fixed (same value as in the hold-out split cell) so the
# reported metrics are reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# Plain accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Accuracy averaged over classes
accuracy_bl = balanced_accuracy_score(y_test, predictions)
print("Balanced accuracy: %.2f%%" % (accuracy_bl * 100.0))

# ROC AUC computed from the second predict_proba column
roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (roc_score* 100.0))

# F1 score
f1 = f1_score(y_test, predictions)
print("F1 score: %.2f%%" % (f1* 100.0))

Accuracy: 77.27%
Balanced accuracy: 76.93%
Roc: 84.13%
F1 score: 74.66%
