In [62]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# # scikit-learn imports for machine learning
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# # tensorflow for nn
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, GaussianNoise
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

In [142]:
data = pd.read_csv("data_mean.csv")
data.shape

(780, 51)

In [143]:
# Results storage for confusion matrices, classification reports, and ROC curves
confusion_matrices = []
roc_curves = []
classification_reports = []
accuracy = []

In [144]:
FEATURES = ['Age', 'BMI', 'Sex', 'Height', 'Weight', 'Length_of_Stay',
       'Alvarado_Score', 'Paedriatic_Appendicitis_Score', 'Appendix_on_US',
       'Appendix_Diameter', 'Migratory_Pain', 'Lower_Right_Abd_Pain',
       'Contralateral_Rebound_Tenderness', 'Coughing_Pain', 'Nausea',
       'Loss_of_Appetite', 'Body_Temperature', 'WBC_Count',
       'Neutrophil_Percentage', 'Segmented_Neutrophils', 'Neutrophilia',
       'RBC_Count', 'Hemoglobin', 'RDW', 'Thrombocyte_Count',
       'Ketones_in_Urine', 'RBC_in_Urine', 'WBC_in_Urine', 'CRP', 'Dysuria',
       'Stool', 'Peritonitis', 'Psoas_Sign', 'Ipsilateral_Rebound_Tenderness',
       'US_Performed', 'Free_Fluids', 'Appendix_Wall_Layers', 'Target_Sign',
       'Appendicolith', 'Perfusion', 'Perforation',
       'Surrounding_Tissue_Reaction', 'Appendicular_Abscess',
       'Pathological_Lymph_Nodes', 'Bowel_Wall_Thickening',
       'Conglomerate_of_Bowel_Loops', 'Ileus', 'Coprostasis', 'Meteorism',
       'Enteritis']

TARGET = 'Diagnosis'

In [145]:
X_train, X_test, y_train, y_test = train_test_split(data[FEATURES], data[TARGET], test_size=0.3, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((546, 50), (234, 50), (546,), (234,))

In [146]:
# Imputer and scaler object
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

In [151]:
# Create the model and a pipeline
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('classifier', rf_model)])

# Create confusion matrix
pipeline.fit(X_train, y_train)
y_pred_rf = pipeline.predict(X_test)
cm_rf = confusion_matrix(y_test, y_pred_rf)
confusion_matrices.append(cm_rf)

# 5 fold CV
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Calculate Scores
cv_scores_accuracy = cross_val_score(pipeline, X_train, y_train, cv=k_fold, scoring="accuracy")
mean_cv_accuracy = cv_scores_accuracy.mean()
cv_scores_precision = cross_val_score(pipeline, X_train, y_train, cv=k_fold, scoring="accuracy")
mean_cv_precision = cv_scores_precision.mean()
cv_scores_recall = cross_val_score(pipeline, X_train, y_train, cv=k_fold, scoring="recall")
mean_cv_recall = cv_scores_recall.mean()
cv_scores_f1 = cross_val_score(pipeline, X_train, y_train, cv=k_fold, scoring="f1")
mean_cv_f1 = cv_scores_f1.mean()

# Calculating false negative rate
tn, fp, fn, tp = cm_rf.ravel()
false_negative_rate = fn / (fn + tp)

print("Confusion Matrix:\n", cm_rf)
print(" ")
print("5-fold CV:")
print("Accuracy:", round(mean_cv_accuracy,4))
print("Precision:", round(mean_cv_precision,4))
print("Recall:", round(mean_cv_recall,4))
print("F1:", round(mean_cv_f1,4))
print("False Negative Rate:", round(false_negative_rate,4))

Confusion Matrix:
 [[ 87  13]
 [ 12 122]]
 
5-fold CV:
Accuracy: 0.8992
Precision: 0.8992
Recall: 0.9144
F1: 0.9151
False Negative Rate: 0.08955223880597014


In [152]:
# Create the model and a pipeline
lr_model = LogisticRegression(max_iter=1000, random_state=42)
pipeline = Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('classifier', lr_model)])

# Create confusion matrix
pipeline.fit(X_train, y_train)
y_pred_lr = pipeline.predict(X_test)
cm_lr = confusion_matrix(y_test, y_pred_lr)
confusion_matrices.append(cm_lr)

# 5 fold CV
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Calculate Scores
cv_scores_accuracy = cross_val_score(pipeline, X_train, y_train, cv=k_fold, scoring="accuracy")
mean_cv_accuracy = cv_scores_accuracy.mean()
cv_scores_precision = cross_val_score(pipeline, X_train, y_train, cv=k_fold, scoring="accuracy")
mean_cv_precision = cv_scores_precision.mean()
cv_scores_recall = cross_val_score(pipeline, X_train, y_train, cv=k_fold, scoring="recall")
mean_cv_recall = cv_scores_recall.mean()
cv_scores_f1 = cross_val_score(pipeline, X_train, y_train, cv=k_fold, scoring="f1")
mean_cv_f1 = cv_scores_f1.mean()

# Calculating false negative rate
tn, fp, fn, tp = cm_lr.ravel()
false_negative_rate = fn / (fn + tp)

print("Confusion Matrix:\n", cm_lr)
print(" ")
print("5-fold CV:")
print("Accuracy:", round(mean_cv_accuracy,4))
print("Precision:", round(mean_cv_precision,4))
print("Recall:", round(mean_cv_recall,4))
print("F1:", round(mean_cv_f1,4))
print("False Negative Rate:", round(false_negative_rate,4))

Confusion Matrix:
 [[ 79  21]
 [ 28 106]]
 
5-fold CV:
Accuracy: 0.8058
Precision: 0.8058
Recall: 0.8471
F1: 0.8375
False Negative Rate: 0.209


In [153]:
# Set up the Gradient Boosting Classifier with current learning rate
gbs_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, random_state=42)
gbs_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),('classifier', gbs_model)])

# Create confusion matrix
gbs_pipeline.fit(X_train, y_train)
y_pred_gb = gbs_pipeline.predict(X_test)
cm_gb = confusion_matrix(y_test, y_pred_gb)
confusion_matrices.append(cm_gb)

# 5 fold CV
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Calculate Scores
cv_scores_accuracy = cross_val_score(gbs_pipeline, X_train, y_train, cv=k_fold, scoring="accuracy")
mean_cv_accuracy = cv_scores_accuracy.mean()
cv_scores_precision = cross_val_score(gbs_pipeline, X_train, y_train, cv=k_fold, scoring="accuracy")
mean_cv_precision = cv_scores_precision.mean()
cv_scores_recall = cross_val_score(gbs_pipeline, X_train, y_train, cv=k_fold, scoring="recall")
mean_cv_recall = cv_scores_recall.mean()
cv_scores_f1 = cross_val_score(gbs_pipeline, X_train, y_train, cv=k_fold, scoring="f1")
mean_cv_f1 = cv_scores_f1.mean()

# Calculating false negative rate
tn, fp, fn, tp = cm_gb.ravel()
false_negative_rate = fn / (fn + tp)

print("Confusion Matrix:\n", cm_gb)
print(" ")
print("5-fold CV:")
print("Accuracy:", round(mean_cv_accuracy,4))
print("Precision:", round(mean_cv_precision,4))
print("Recall:", round(mean_cv_recall,4))
print("F1:", round(mean_cv_f1,4))
print("False Negative Rate:", round(false_negative_rate,4))

Confusion Matrix:
 [[ 83  17]
 [  8 126]]
 
5-fold CV:
Accuracy: 0.9084
Precision: 0.9084
Recall: 0.9449
F1: 0.925
False Negative Rate: 0.0597


In [154]:
# SVM model and pipeline
svm_model = SVC(kernel='rbf', C=1.0, random_state=42)
svm_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()), ('classifier', svm_model)])

# Create confusion matrix
svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)
cm_svm = confusion_matrix(y_test, y_pred_svm)
confusion_matrices.append(cm_svm)

# 5 fold CV
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Calculate Scores
cv_scores_accuracy = cross_val_score(svm_pipeline, X_train, y_train, cv=k_fold, scoring="accuracy")
mean_cv_accuracy = cv_scores_accuracy.mean()
cv_scores_precision = cross_val_score(svm_pipeline, X_train, y_train, cv=k_fold, scoring="accuracy")
mean_cv_precision = cv_scores_precision.mean()
cv_scores_recall = cross_val_score(svm_pipeline, X_train, y_train, cv=k_fold, scoring="recall")
mean_cv_recall = cv_scores_recall.mean()
cv_scores_f1 = cross_val_score(svm_pipeline, X_train, y_train, cv=k_fold, scoring="f1")
mean_cv_f1 = cv_scores_f1.mean()

# Calculating false negative rate
tn, fp, fn, tp = cm_svm.ravel()
false_negative_rate = fn / (fn + tp)

print("Confusion Matrix:\n", cm_svm)
print(" ")
print("5-fold CV:")
print("Accuracy:", round(mean_cv_accuracy,4))
print("Precision:", round(mean_cv_precision,4))
print("Recall:", round(mean_cv_recall,4))
print("F1:", round(mean_cv_f1,4))
print("False Negative Rate:", round(false_negative_rate,4))

Confusion Matrix:
 [[ 71  29]
 [ 21 113]]
 
5-fold CV:
Accuracy: 0.7966
Precision: 0.7966
Recall: 0.869
F1: 0.8358
False Negative Rate: 0.1567
