In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, matthews_corrcoef, balanced_accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the table from disk; the first CSV column becomes the DataFrame index.
df = pd.read_csv('final_proteins_dataframe.csv', index_col=0)

# Feature matrix: positional columns 4..42 with the `pdb_res_index` and
# `res_index` columns removed.
X = df.iloc[:, 4:43].drop(columns=['pdb_res_index', 'res_index'])

# Target vector: the `tm_segment` column.
y = df['tm_segment']

# 80/20 train/test split. Because shuffle=False the split is purely positional
# (the leading rows train, the trailing rows test), so `random_state` has no
# influence on which rows end up in which partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

In [6]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the binary logistic objective, fitted on the
# training split.
model = XGBClassifier(objective="binary:logistic")
model.fit(X_train, y_train)

# Hard class labels for the held-out rows. predict() already returns discrete
# labels, so the former per-element round() pass was redundant and is dropped;
# `predictions` stays bound for any later cell that refers to it.
y_pred = model.predict(X_test)
predictions = y_pred

# Fraction of held-out rows whose label was predicted exactly.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Accuracy averaged per class, which is less sensitive to class imbalance.
accuracy_bl = balanced_accuracy_score(y_test, predictions)
print("Balanced accuracy: %.2f%%" % (accuracy_bl * 100.0))

# ROC AUC is a ranking metric, so it is scored on probabilities (column 1 of
# predict_proba, i.e. the second entry of model.classes_) rather than labels.
roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (roc_score* 100.0))

f1 = f1_score(y_test, predictions)
print("F1 score: %.2f%%" % (f1* 100.0))

Accuracy: 77.24%
Balanced accuracy: 76.98%
Roc: 84.04%
F1 score: 74.97%


In [3]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier.
# - `gamma` is not passed: it only parameterises the rbf/poly/sigmoid kernels
#   and has no effect with kernel="linear".
# - probability=True makes fit() run an internal, randomised cross-validated
#   calibration, so the seed is pinned to keep predict_proba (and therefore the
#   ROC AUC below) reproducible across runs.
# NOTE(review): the features are passed in unscaled even though StandardScaler
# is imported at the top of the notebook -- confirm whether the inputs are
# already standardised, since SVMs are sensitive to feature scale.
svm = SVC(kernel="linear", probability=True, random_state=42)
svm.fit(X_train, y_train)

# Hard class labels for the held-out rows.
ypred = svm.predict(X_test)

# Fraction of held-out rows whose label was predicted exactly.
accuracy = accuracy_score(y_test, ypred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Accuracy averaged per class, which is less sensitive to class imbalance.
accuracy_bl = balanced_accuracy_score(y_test, ypred)
print("Balanced accuracy: %.2f%%" % (accuracy_bl * 100.0))

# ROC AUC is a ranking metric, so it is scored on probabilities (column 1 of
# predict_proba, i.e. the second entry of svm.classes_) rather than labels.
roc_score = roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (roc_score* 100.0))

f1 = f1_score(y_test, ypred)
print("F1 score: %.2f%%" % (f1* 100.0))

Aaccuracy: 78.06%
Balanced accuracy: 77.83%
Roc: 85.62%
F1 score: 76.05%


In [4]:
import lightgbm as lgb

# LightGBM gradient-boosted trees with the binary objective.
model = lgb.LGBMClassifier(objective='binary')

# `verbose` is no longer passed to fit(): LightGBM 4.x removed that keyword
# (passing it raises TypeError), and on older releases it only set the logging
# frequency for eval_set results, of which there are none here.
# NOTE(review): no eval_set is supplied, so the metrics listed in eval_metric
# have nothing to be evaluated on during training -- confirm whether a
# validation set was intended.
model.fit(X_train, y_train, eval_metric=['auc', 'logloss', 'average_precision'])

# Hard class labels for the held-out rows. predict() already returns discrete
# labels, so the former per-element round() pass was redundant and is dropped;
# `predictions` stays bound for any later cell that refers to it.
y_pred = model.predict(X_test)
predictions = y_pred

# Fraction of held-out rows whose label was predicted exactly.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Accuracy averaged per class, which is less sensitive to class imbalance.
accuracy_bl = balanced_accuracy_score(y_test, predictions)
print("Balanced accuracy: %.2f%%" % (accuracy_bl * 100.0))

# ROC AUC is a ranking metric, so it is scored on probabilities (column 1 of
# predict_proba, i.e. the second entry of model.classes_) rather than labels.
roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (roc_score* 100.0))

f1 = f1_score(y_test, predictions)
print("F1 score: %.2f%%" % (f1* 100.0))

Accuracy: 77.59%
Balanced accuracy: 77.29%
Roc: 84.97%
F1 score: 75.19%


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is pinned (same value
# as the train/test split) because bootstrap sampling and per-split feature
# selection are random: without it every run of this cell reports slightly
# different metrics.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Hard class labels for the held-out rows. predict() already returns discrete
# labels, so the former per-element round() pass was redundant and is dropped;
# `predictions` stays bound for any later cell that refers to it.
y_pred = model.predict(X_test)
predictions = y_pred

# Fraction of held-out rows whose label was predicted exactly.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Accuracy averaged per class, which is less sensitive to class imbalance.
accuracy_bl = balanced_accuracy_score(y_test, predictions)
print("Balanced accuracy: %.2f%%" % (accuracy_bl * 100.0))

# ROC AUC is a ranking metric, so it is scored on probabilities (column 1 of
# predict_proba, i.e. the second entry of model.classes_) rather than labels.
roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("Roc: %.2f%%" % (roc_score* 100.0))

f1 = f1_score(y_test, predictions)
print("F1 score: %.2f%%" % (f1* 100.0))

Accuracy: 77.27%
Balanced accuracy: 76.93%
Roc: 84.13%
F1 score: 74.66%
