# Libraries

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import joblib
import pandas as pd
from xgboost import XGBClassifier


# Load Data

In [22]:
# Non PCA Data
# X_train = joblib.load("../data/X_train.pkl")
# X_test = joblib.load("../data/X_test.pkl")
# y_train = joblib.load("../data/y_train.pkl")
# y_test = joblib.load("../data/y_test.pkl")

# PCA Data
train_df = pd.read_csv("../data/pca_train.csv")
test_df = pd.read_csv("../data/pca_test.csv")

X_train = train_df.drop(columns=["has_disease"])
y_train = train_df["has_disease"]

X_test = test_df.drop(columns=["has_disease"])
y_test = test_df["has_disease"]

# Logistic Regression Hyperparameter Tuning

In [23]:
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"]
}

grid_lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid_lr,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

grid_lr.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_lr.best_params_)
print(classification_report(y_test, grid_lr.best_estimator_.predict(X_test)))

Best Logistic Regression Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.82      0.76      0.78        82
           1       0.81      0.86      0.84       102

    accuracy                           0.82       184
   macro avg       0.82      0.81      0.81       184
weighted avg       0.82      0.82      0.81       184



# Random Forest Hyperparameter Tuning

In [24]:
param_dist_rf = {
    "n_estimators": [50, 100, 200, 300],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

rand_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42
)

rand_rf.fit(X_train, y_train)

print("Best Random Forest Params:", rand_rf.best_params_)
print(classification_report(y_test, rand_rf.best_estimator_.predict(X_test)))

Best Random Forest Params: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}
              precision    recall  f1-score   support

           0       0.83      0.77      0.80        82
           1       0.82      0.87      0.85       102

    accuracy                           0.83       184
   macro avg       0.83      0.82      0.82       184
weighted avg       0.83      0.83      0.83       184



# SVM Hyperparameter Tuning (GridSearchCV)

In [25]:
param_grid_svc = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"]
}

grid_svc = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=param_grid_svc,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

grid_svc.fit(X_train, y_train)

print("Best SVM Params:", grid_svc.best_params_)
print(classification_report(y_test, grid_svc.best_estimator_.predict(X_test)))

Best SVM Params: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.89      0.78      0.83        82
           1       0.84      0.92      0.88       102

    accuracy                           0.86       184
   macro avg       0.86      0.85      0.85       184
weighted avg       0.86      0.86      0.86       184



# Decision Tree Hyperparameter Tuning (GridSearchCV)

In [26]:
param_grid_tree = {
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "criterion": ["gini", "entropy"]
}

grid_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid_tree,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

grid_tree.fit(X_train, y_train)

print("Best Decision Tree Params:", grid_tree.best_params_)
print(classification_report(y_test, grid_tree.best_estimator_.predict(X_test)))


Best Decision Tree Params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 10}
              precision    recall  f1-score   support

           0       0.75      0.76      0.75        82
           1       0.80      0.79      0.80       102

    accuracy                           0.78       184
   macro avg       0.77      0.78      0.77       184
weighted avg       0.78      0.78      0.78       184



# XGBoost Hyperparameter Tuning (RandomizedSearchCV)

In [27]:
param_dist_xgb = {
    "n_estimators": [50, 100, 200, 300],
    "max_depth": [3, 5, 7, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.3, 0.5],
    "reg_alpha": [0, 0.1, 0.5],
    "reg_lambda": [1, 1.5, 2]
}

rand_xgb = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42),
    param_distributions=param_dist_xgb,
    n_iter=30,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42
)

rand_xgb.fit(X_train, y_train)

print("Best XGBoost Params:", rand_xgb.best_params_)
print(classification_report(y_test, rand_xgb.best_estimator_.predict(X_test)))

Best XGBoost Params: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}
              precision    recall  f1-score   support

           0       0.83      0.76      0.79        82
           1       0.82      0.87      0.84       102

    accuracy                           0.82       184
   macro avg       0.82      0.81      0.82       184
weighted avg       0.82      0.82      0.82       184



# Compare Optimized Models

In [28]:
tuned_models = {
    "Logistic Regression": grid_lr.best_estimator_,
    "Decision Tree": grid_tree.best_estimator_,
    "Random Forest": rand_rf.best_estimator_,
    "SVM": grid_svc.best_estimator_,
    "XGBoost": rand_xgb.best_estimator_
}

accuracies =  []

print("\nTuned Model Performance:\n")
for name, model in sorted(tuned_models.items()):
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    accuracies.append((name, acc))

# print sorted accuracies
accuracies.sort(key=lambda x: x[1], reverse=True)
for name, acc in accuracies:
    print(f"{name}: Accuracy = {acc:.4f}")

# for name, model in tuned_models.items():
#     model.fit(X_train, y_train)
#     joblib.dump(model, f"../models/all_models/{name}_tuned.pkl")
#     joblib.dump(model, f"../models/all_models/{name}_PCA_tuned.pkl")
#     print(f"Saved {name} model.")


Tuned Model Performance:

SVM: Accuracy = 0.8587
Random Forest: Accuracy = 0.8261
XGBoost: Accuracy = 0.8207
Logistic Regression: Accuracy = 0.8152
Decision Tree: Accuracy = 0.7772


# Best Model

In [29]:
# load all models in ../models/all_models/
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score


models = {}

for name in ["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "XGBoost"]:
    models[name] = joblib.load(f"../models/all_models/{name}.pkl")

for name in ["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "XGBoost"]:
    models[name] = joblib.load(f"../models/all_models/{name}_PCA.pkl")

for name in ["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "XGBoost"]:
    models[name] = joblib.load(f"../models/all_models/{name}_tuned.pkl")

for name in ["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "XGBoost"]:
    models[name] = joblib.load(f"../models/all_models/{name}_PCA_tuned.pkl")

best_model_name = None
best_acc = 0
for name, model in models.items():
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    if acc > best_acc:
        best_acc = acc
        best_model_name = name

best_model = models[best_model_name]
y_pred = best_model.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1]) if hasattr(best_model, "predict_proba") else "N/A"

print(f"\nPerformance Metrics for Best Model ({best_model_name}):")
print(f"Accuracy     : {best_acc:.4f}")
print(f"Precision    : {precision:.4f}")
print(f"Recall       : {recall:.4f}")
print(f"F1-Score     : {f1:.4f}")
print(f"ROC-AUC      : {roc_auc if roc_auc != 'N/A' else 'Not supported by model'}")



Performance Metrics for Best Model (SVM):
Accuracy     : 0.8587
Precision    : 0.8393
Recall       : 0.9216
F1-Score     : 0.8785
ROC-AUC      : Not supported by model


# Save the best model for deployment

In [None]:
# joblib.dump(models[best_model_name], f"../models/best_model.pkl")

['../models/best_model.pkl']