In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [81]:
df = pd.read_csv('proteins_dataframe.csv', index_col=0)

In [9]:
X = df[['residue_number', 'res_index', 'Hydrophobicity', 'Hydrophilicity', 'NCI', 'Polarity', 'Polarizability', 'SASA', 'V', 'is_charged']]
y = df['tm_segment']

In [7]:
# Data normilizations
scaler = StandardScaler()
scaler.fit(X)
scaled = scaler.fit_transform(X)
scaled_df = pd.DataFrame(scaled, columns=X.columns)
scaled_df

Unnamed: 0,residue_number,Hydrophobicity,Hydrophilicity,NCI,Polarity,Polarizability,SASA,V,is_charged
0,-1.268115,-1.235368,2.046999,-1.111856,2.103068,-0.561175,-0.316707,-0.789434,2.268825
1,-0.037428,0.827785,-0.777668,0.109571,-1.166392,0.308144,0.440488,0.677903,-0.440757
2,-0.037428,0.827785,-0.777668,0.109571,-1.166392,0.308144,0.440488,0.677903,-0.440757
3,0.314196,-1.109052,0.399277,-0.639219,1.537976,-0.249938,-0.167029,-0.276552,-0.440757
4,-1.268115,-1.235368,2.046999,-1.111856,2.103068,-0.561175,-0.316707,-0.789434,2.268825
...,...,...,...,...,...,...,...,...,...
138180,1.369070,0.848838,-0.601126,0.195840,-0.762755,-0.185544,-0.189040,0.074512,-0.440757
138181,-0.213241,-1.866945,2.046999,-0.439951,1.416885,0.662310,1.160264,0.856178,2.268825
138182,-0.213241,-1.866945,2.046999,-0.439951,1.416885,0.662310,1.160264,0.856178,2.268825
138183,-0.213241,-1.866945,2.046999,-0.439951,1.416885,0.662310,1.160264,0.856178,2.268825


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False) # 7 indicates that always get the same split of data each time this example is executed

In [21]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# xgboost parameters
param = {'max_depth': 12, 'eta': 1, 'objective': 'binary:logistic', 'nthread': 4, 'eval_metric': 'auc', 'n_estimators': 100, 'learning_rate': 0.01}
evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 10
bst = xgb.train(param, dtrain, num_round, evallist)

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	eval-auc:0.66145	train-auc:0.70841
[1]	eval-auc:0.66172	train-auc:0.70840
[2]	eval-auc:0.66195	train-auc:0.70871
[3]	eval-auc:0.66186	train-auc:0.70887
[4]	eval-auc:0.66225	train-auc:0.70931
[5]	eval-auc:0.66256	train-auc:0.70952
[6]	eval-auc:0.66260	train-auc:0.70968
[7]	eval-auc:0.66304	train-auc:0.71011
[8]	eval-auc:0.66333	train-auc:0.71025
[9]	eval-auc:0.66367	train-auc:0.71042


In [26]:
# fit model on training data
model = XGBClassifier(objective="binary:logistic", n_estimators = 100, learning_rate = 0.01, max_depth = 12)
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = balanced_accuracy_score(y_test, predictions)
print("Balanced accuracy: %.2f%%" % (accuracy * 100.0))

roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (roc_score* 100.0))

f1 = f1_score(y_test, predictions)
print("F1 score: %.2f%%" % (f1* 100.0))

Balanced accuracy: 62.46%
Roc: 66.60%
F1 score: 61.11%


In [8]:
inner_metric = 'balanced_accuracy_score'
outer_metrics = ['balanced_accuracy_score', 'average_precision', 'f1']


# configure the cross-validation procedure
cv_inner = StratifiedKFold(n_splits=3, shuffle=True)
cv_outer = StratifiedKFold(n_splits=10, shuffle=True)

In [11]:
model = XGBClassifier(objective="binary:logistic")

param_grid = {
  "max_depth": [1, 3, 7, 10],
  "n_estimators": [10, 500, 1000],
  "learning_rate": [0.01, 0.1, 0.2],
}

# define search
rf_grid = GridSearchCV(model, param_grid, scoring='balanced_accuracy', n_jobs=-1, cv=cv_inner, refit=True)

# execute the nested cross-validation
scores = cross_validate(rf_grid, scaled_df, y, scoring='balanced_accuracy', cv=cv_outer, n_jobs=-1)

#Best
rf_grid.fit(X, y)
# params[r] = rf_grid.best_params_
print("Best parameters: ", rf_grid.best_params_)

# scores[r] = rf_grid.best_score_
print("Best score: ", rf_grid.best_score_)

Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.6129532102797542
Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.6129532386523935
Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.6129532044164002
Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.612953192615667
Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.6129532610790335
Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.6129532122198383
Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.612953238518026
Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.6129532072701179
Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 10}
Best score:  0.6129531919977071
Best paramet

In [27]:
from sklearn.svm import SVC

svm = SVC(gamma='auto')
svm.fit(X_train, y_train)

ypred = svm.predict(X_test)

print('Mathwe cor is : %.2f%%' % ( matthews_corrcoef(y_test.values, ypred) * 100.0))
print('Accuracy cor is : %.2f%%' % (balanced_accuracy_score(y_test.values, ypred) * 100.0))

Mathwe cor is : 23.79%
Accuracy cor is : 61.91%


In [69]:
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)
# param = {'num_leaves': 70, 'objective': 'binary'}
# param['metric'] = 'auc'

model = lgb.LGBMClassifier(learning_rate=0.3, max_depth=12, num_leaves=70, num_iterations=500, objective='binary')

# num_round = 10
# lgb.cv(param, train_data, num_round, nfold=5)
model.fit(X_train,y_train, eval_set=[(X_test,y_test),(X_train,y_train)], verbose=20, eval_metric=['auc', 'logloss', 'average_precision'])



[20]	training's auc: 0.702665	training's binary_logloss: 0.616961	training's average_precision: 0.621857	valid_0's auc: 0.675617	valid_0's binary_logloss: 0.639261	valid_0's average_precision: 0.604244
[40]	training's auc: 0.710493	training's binary_logloss: 0.611079	training's average_precision: 0.632799	valid_0's auc: 0.671358	valid_0's binary_logloss: 0.643621	valid_0's average_precision: 0.599755
[60]	training's auc: 0.71549	training's binary_logloss: 0.607267	training's average_precision: 0.639198	valid_0's auc: 0.667483	valid_0's binary_logloss: 0.646858	valid_0's average_precision: 0.595502
[80]	training's auc: 0.71945	training's binary_logloss: 0.604323	training's average_precision: 0.64388	valid_0's auc: 0.66522	valid_0's binary_logloss: 0.649225	valid_0's average_precision: 0.59272
[100]	training's auc: 0.72254	training's binary_logloss: 0.602057	training's average_precision: 0.647789	valid_0's auc: 0.66303	valid_0's binary_logloss: 0.651247	valid_0's average_precision: 0.590

LGBMClassifier(learning_rate=0.3, max_depth=12, num_iterations=500,
               num_leaves=70, objective='binary')

In [63]:
print('Training accuracy {:.4f}'.format(model.score(X_train,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test,y_test)))

Training accuracy 0.6660
Testing accuracy 0.6098


In [None]:
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error