## Hyperparameter Tuning & Export Final Model

In [3]:
import pandas as pd
import joblib, json
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score
)

In [4]:

# Load data
data = pd.read_csv('../data/selected_features.csv')
y = data['target']
X = data.drop(columns=['target'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------ Random Forest ------------------
rf = RandomForestClassifier(random_state=42)
rf_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [None, 4, 6, 8],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
rf_gs = RandomizedSearchCV(
    rf, rf_grid, n_iter=12, scoring='roc_auc',
    cv=5, random_state=42, n_jobs=-1
)
rf_gs.fit(X_train, y_train)
rf_best = rf_gs.best_estimator_
rf_probs = rf_best.predict_proba(X_test)
rf_auc = roc_auc_score(y_test, rf_probs, multi_class="ovr")
print("RF best AUC:", rf_auc)
print("RF best params:", rf_gs.best_params_)

# ------------------ SVM ------------------
svm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, random_state=42))
])
svm_grid = {
    "svc__C": [0.1, 1, 10, 50],
    "svc__gamma": ['scale', 0.1, 0.01]
}
svm_gs = GridSearchCV(
    svm_pipe, svm_grid, scoring='roc_auc',
    cv=5, n_jobs=-1
)
svm_gs.fit(X_train, y_train)
svm_best = svm_gs.best_estimator_
svm_probs = svm_best.predict_proba(X_test)
svm_auc = roc_auc_score(y_test, svm_probs, multi_class="ovr")
print("SVM best AUC:", svm_auc)
print("SVM best params:", svm_gs.best_params_)

# ------------------ Choose Best Model ------------------
if svm_auc >= rf_auc:
    best_model = svm_best
    best_name = "SVM"
    best_auc = svm_auc
    best_params = svm_gs.best_params_
else:
    best_model = rf_best
    best_name = "RandomForest"
    best_auc = rf_auc
    best_params = rf_gs.best_params_

print(f"Selected best model: {best_name} (AUC={best_auc:.3f})")

# ------------------ Evaluate with More Metrics ------------------
preds = best_model.predict(X_test)
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds, average="weighted", zero_division=0)
rec = recall_score(y_test, preds, average="weighted", zero_division=0)
f1 = f1_score(y_test, preds, average="weighted", zero_division=0)

print(f"Accuracy: {acc:.3f}\nPrecision: {prec:.3f}\nRecall: {rec:.3f}\nF1: {f1:.3f}")

# ------------------ Save Model & Metrics ------------------
Path("../models").mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, '../models/final_model.pkl')
print("Saved ../models/final_model.pkl")

results = {
    "best_model": best_name,
    "auc": best_auc,
    "accuracy": acc,
    "precision": prec,
    "recall": rec,
    "f1": f1,
    "best_params": best_params
}
with open("../models/final_model_metrics.json", "w") as f:
    json.dump(results, f, indent=4)
print("Saved ../models/final_model_metrics.json")




RF best AUC: 0.7761120233534027
RF best params: {'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 6}
SVM best AUC: 0.7892952845366639
SVM best params: {'svc__C': 0.1, 'svc__gamma': 'scale'}
Selected best model: SVM (AUC=0.789)
Accuracy: 0.541
Precision: 0.293
Recall: 0.541
F1: 0.380
Saved ../models/final_model.pkl
Saved ../models/final_model_metrics.json


