In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,f1_score, confusion_matrix, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import cross_val_score
import pickle

In [None]:
# Mount Google Drive into the Colab filesystem so the CSV files under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the pre-split train and test tables from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Filters_colorectal/MDA/train_50.0_mda_dt_selected.csv",
        "/content/drive/MyDrive/Filters_colorectal/MDA/test_50.0_mda_dt_selected.csv",
    )
)

In [None]:
# Sanity check: (rows, columns) of the training table.
train.shape

(2800, 513)

In [None]:
# Sanity check: (rows, columns) of the test table; the column count should match train.
test.shape

(700, 513)

In [None]:
# Preview the first rows to inspect the column layout and the `label` target.
train.head()

Unnamed: 0,f_207,f_944,f_135,f_518,f_599,f_654,f_663,f_1,f_2,f_3,...,f_515,f_516,f_517,f_519,f_520,f_522,f_523,f_524,f_525,label
0,0.0,0.0,0.0,4.163074,3.593174,0.590041,0.0,0.0,0.0,0.0,...,0.0,5.021816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
1,0.0,0.0,0.808496,0.0,1.872534,0.0,0.063859,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.671787,0.0,0.0,0.0,1.831867,3.0
2,4.669699,0.0,1.025051,0.557915,0.0,0.0,6.259234,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,0.0,0.0,1.405728,0.0,1.668143,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.276529,0.0,0.0,0.0,2.066074,3.0
4,0.0,0.0,1.016898,0.0,1.410549,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.751897,0.0,0.0,0.0,1.279927,3.0


In [None]:
# Split each table into a feature matrix (X) and the target vector (y).
#
# The CSVs contain an 'Unnamed: 0' column (visible in train.head()). It looks
# like a row index written out by a previous to_csv call, not a real feature.
# Left in place it is fed to every model as a feature and can dominate the
# distances used by KNN/SVM, so it is removed here. errors="ignore" keeps the
# cell working if the column is absent (e.g. the files were re-exported
# without an index). 'label' is dropped strictly so a missing target fails loudly.
X_train = train.drop(columns=["label"]).drop(columns=["Unnamed: 0"], errors="ignore")
X_test = test.drop(columns=["label"]).drop(columns=["Unnamed: 0"], errors="ignore")
y_train = train["label"]
y_test = test["label"]

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Distance-weighted k-nearest-neighbours classifier (9 neighbours).
# NOTE(review): KNN is distance-based and the feature values shown by
# train.head() look unscaled — confirm whether scaling happens upstream.
knn = KNeighborsClassifier(
    n_neighbors=9,
    weights='distance',
    algorithm='auto',
    leaf_size=50
)

# k-fold cross-validation on the training split.
k = 5
scores = cross_val_score(knn, X_train, y_train, cv=k)
print("Accuracy scores for each fold:", scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation:", scores.std())

# Refit on the full training split, then evaluate once on the held-out test split.
knn.fit(X_train, y_train)

test_pred = knn.predict(X_test)
test_pred_prob = knn.predict_proba(X_test)
# One-vs-rest multiclass ROC AUC computed from the predicted class probabilities.
auc = roc_auc_score(y_test, test_pred_prob, multi_class='ovr')

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, test_pred)

# Per-class one-vs-rest counts derived from the confusion matrix:
#   tp = diagonal, fn = rest of the row, fp = rest of the column,
#   tn = everything that is in neither the class's row nor its column.
# (The previous loop unpacked the first four cells of a reduced matrix, which
# is not TN/FP for a multiclass problem.)
tp = conf_matrix.diagonal()
fn = conf_matrix.sum(axis=1) - tp
fp = conf_matrix.sum(axis=0) - tp
tn = conf_matrix.sum() - tp - fn - fp

sensitivity = tp / (tp + fn)  # recall per class
specificity = tn / (tn + fp)  # true-negative rate per class
# Unweighted (macro) averages across classes.
avg_sensitivity = np.mean(sensitivity)
avg_specificity = np.mean(specificity)

test_acc = accuracy_score(y_test, test_pred)
test_mcc = matthews_corrcoef(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')

print('ACCURACY : %s' % test_acc)
print('MCC : %s' % test_mcc)
print('F1 : %s' % test_f1)
print('AUC: %s' % auc)
print('Sensitivity: %s' % avg_sensitivity)
print('Specivity: %s' % avg_specificity)

In [None]:
# Serialize the fitted KNN classifier to the current working directory.
with open('knn_model.pkl', 'wb') as fh:
    pickle.dump(knn, fh)


# SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with probability estimates enabled
# (needed for predict_proba / ROC AUC below).
# random_state is fixed because the probability calibration shuffles the data
# internally; without a seed, predict_proba (and therefore the AUC) varies
# between runs.
# NOTE(review): SVMs are scale-sensitive and the feature values shown by
# train.head() look unscaled — confirm whether scaling happens upstream.
svm = SVC(
    kernel='rbf',
    C=0.1,
    gamma='scale',
    probability=True,
    decision_function_shape='ovr',
    random_state=42
)

# k-fold cross-validation on the training split.
k = 5
scores = cross_val_score(svm, X_train, y_train, cv=k)
print("Accuracy scores for each fold:", scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation:", scores.std())

# Refit on the full training split, then evaluate once on the held-out test split.
svm.fit(X_train, y_train)

test_pred = svm.predict(X_test)
test_pred_prob = svm.predict_proba(X_test)
# One-vs-rest multiclass ROC AUC computed from the predicted class probabilities.
auc = roc_auc_score(y_test, test_pred_prob, multi_class='ovr')

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, test_pred)

# Per-class one-vs-rest counts derived from the confusion matrix:
#   tp = diagonal, fn = rest of the row, fp = rest of the column,
#   tn = everything that is in neither the class's row nor its column.
# (The previous loop unpacked the first four cells of a reduced matrix, which
# is not TN/FP for a multiclass problem.)
tp = conf_matrix.diagonal()
fn = conf_matrix.sum(axis=1) - tp
fp = conf_matrix.sum(axis=0) - tp
tn = conf_matrix.sum() - tp - fn - fp

sensitivity = tp / (tp + fn)  # recall per class
specificity = tn / (tn + fp)  # true-negative rate per class
# Unweighted (macro) averages across classes.
avg_sensitivity = np.mean(sensitivity)
avg_specificity = np.mean(specificity)

test_acc = accuracy_score(y_test, test_pred)
test_mcc = matthews_corrcoef(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')

print('ACCURACY : %s' % test_acc)
print('MCC : %s' % test_mcc)
print('F1 : %s' % test_f1)
print('AUC: %s' % auc)
print('Sensitivity: %s' % avg_sensitivity)
print('Specivity: %s' % avg_specificity)

In [None]:
# Serialize the fitted SVM classifier to the current working directory.
with open('svm_model.pkl', 'wb') as fh:
    pickle.dump(svm, fh)


# RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1200 depth-limited trees; seeded for reproducibility.
rfc = RandomForestClassifier(
    n_estimators=1200,
    max_depth=10,
    min_samples_leaf=1,
    random_state=42
)

# k-fold cross-validation on the training split.
k = 5
scores = cross_val_score(rfc, X_train, y_train, cv=k)
print("Accuracy scores for each fold:", scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation:", scores.std())

# Refit on the full training split, then evaluate once on the held-out test split.
rfc.fit(X_train, y_train)

test_pred = rfc.predict(X_test)
test_pred_prob = rfc.predict_proba(X_test)
# One-vs-rest multiclass ROC AUC computed from the predicted class probabilities.
auc = roc_auc_score(y_test, test_pred_prob, multi_class='ovr')

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, test_pred)

# Per-class one-vs-rest counts derived from the confusion matrix:
#   tp = diagonal, fn = rest of the row, fp = rest of the column,
#   tn = everything that is in neither the class's row nor its column.
# (The previous loop unpacked the first four cells of a reduced matrix, which
# is not TN/FP for a multiclass problem.)
tp = conf_matrix.diagonal()
fn = conf_matrix.sum(axis=1) - tp
fp = conf_matrix.sum(axis=0) - tp
tn = conf_matrix.sum() - tp - fn - fp

sensitivity = tp / (tp + fn)  # recall per class
specificity = tn / (tn + fp)  # true-negative rate per class
# Unweighted (macro) averages across classes.
avg_sensitivity = np.mean(sensitivity)
avg_specificity = np.mean(specificity)

test_acc = accuracy_score(y_test, test_pred)
test_mcc = matthews_corrcoef(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')

print('ACCURACY : %s' % test_acc)
print('MCC : %s' % test_mcc)
print('F1 : %s' % test_f1)
print('AUC: %s' % auc)
print('Sensitivity: %s' % avg_sensitivity)
print('Specivity: %s' % avg_specificity)

In [None]:
# Serialize the fitted random-forest classifier to the current working directory.
with open('rfc_model.pkl', 'wb') as fh:
    pickle.dump(rfc, fh)

# MLP

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with L2 penalty alpha=0.1 and up to 1000 iterations.
# random_state is fixed so weight initialisation and batch shuffling — and
# therefore the CV and test metrics below — are reproducible across runs
# (same seed as the random-forest cell).
# NOTE(review): MLPs are scale-sensitive and the feature values shown by
# train.head() look unscaled — confirm whether scaling happens upstream.
mlp = MLPClassifier(
    alpha=0.1,
    max_iter=1000,
    random_state=42
)

# k-fold cross-validation on the training split.
k = 5
scores = cross_val_score(mlp, X_train, y_train, cv=k)
print("Accuracy scores for each fold:", scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation:", scores.std())

# Refit on the full training split, then evaluate once on the held-out test split.
mlp.fit(X_train, y_train)

test_pred = mlp.predict(X_test)
test_pred_prob = mlp.predict_proba(X_test)
# One-vs-rest multiclass ROC AUC computed from the predicted class probabilities.
auc = roc_auc_score(y_test, test_pred_prob, multi_class='ovr')

# Rows of the confusion matrix are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, test_pred)

# Per-class one-vs-rest counts derived from the confusion matrix:
#   tp = diagonal, fn = rest of the row, fp = rest of the column,
#   tn = everything that is in neither the class's row nor its column.
# (The previous loop unpacked the first four cells of a reduced matrix, which
# is not TN/FP for a multiclass problem.)
tp = conf_matrix.diagonal()
fn = conf_matrix.sum(axis=1) - tp
fp = conf_matrix.sum(axis=0) - tp
tn = conf_matrix.sum() - tp - fn - fp

sensitivity = tp / (tp + fn)  # recall per class
specificity = tn / (tn + fp)  # true-negative rate per class
# Unweighted (macro) averages across classes.
avg_sensitivity = np.mean(sensitivity)
avg_specificity = np.mean(specificity)

test_acc = accuracy_score(y_test, test_pred)
test_mcc = matthews_corrcoef(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')

print('ACCURACY : %s' % test_acc)
print('MCC : %s' % test_mcc)
print('F1 : %s' % test_f1)
print('AUC: %s' % auc)
print('Sensitivity: %s' % avg_sensitivity)
print('Specivity: %s' % avg_specificity)

In [None]:
# Serialize the fitted MLP classifier to the current working directory.
with open('mlp_model.pkl', 'wb') as fh:
    pickle.dump(mlp, fh)
