<a href="https://colab.research.google.com/github/20134571/20134571.github.io/blob/main/Ungrouped_Features_Baseline_ML1109_With_SMOTE_5CV_with_repeats_2509AJB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 11 08:05:47 2025
@author: heidi
"""

# ============================ STABILITY ============================
# Pin the OpenMP / MKL / NumExpr thread-count variables to "1". This runs before
# numpy and the other numeric libraries are imported below — presumably so their
# native thread pools pick the limit up when they load.
import os

os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
    }
)

# ============================ Imports ============================
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve, average_precision_score, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception as e:
    HAS_XGB = False
    print("ℹ️ Skipping XGBoost (import failed):", e)

# ============================ Config ============================
# Experiment knobs; each one is consumed further down in this script.
N_JOBS_PAR = 2  # n_jobs handed to RandomForestClassifier and XGBClassifier
PCA_VARIANCE = 0.95  # n_components for PCA; the variance plot at the end marks the 95% line
N_SPLITS = 5  # n_splits for RepeatedStratifiedKFold
N_REPEATS = 3  # n_repeats for RepeatedStratifiedKFold (total folds = N_SPLITS * N_REPEATS)
N_NEIGHBORS = 9  # n_neighbors for both KNeighborsClassifier variants (scaled and PCA)

# ============================ Load Data ============================
# Download the CSV and keep only complete rows, renumbering the index from 0.
url = "https://raw.githubusercontent.com/20134571/AISKILLSET/main/airline_satisfaction_mitigation_arrival_cleaned.csv"
df = pd.read_csv(url).dropna().reset_index(drop=True)

target_col = "satisfaction"
drop_cols = ["Unnamed: 0", "id", "Arrival Delay in Minutes"]

# Features are everything except the target and whichever of `drop_cols`
# actually exist in the frame (absent ones are skipped rather than raising).
X = df.drop(
    columns=[*(c for c in drop_cols if c in df.columns), target_col]
).copy()
y = df[target_col].copy()

# One-hot encode non-numeric columns; drop_first removes one level per feature.
cat_cols = list(X.select_dtypes(include=["object", "category", "bool"]).columns)
if cat_cols:
    X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

def _sanitize(s: str) -> str:
    """Return *s* rewritten as an underscore-separated, identifier-like name.

    The value is first coerced with ``str``. Each run of non-word characters
    becomes a single underscore, repeated underscores are collapsed to one,
    and leading/trailing underscores are removed.
    """
    underscored = re.sub(r"[^\w]+", "_", str(s))
    return re.sub(r"_+", "_", underscored).strip("_")

# Replace every feature name with its sanitized form (no spaces or punctuation).
X.columns = list(map(_sanitize, X.columns))

# ============================ Helper Functions ============================
def summarize(y_true, y_pred, name):
    """Build one results row of classification metrics for a fitted model.

    Args:
        y_true: Ground-truth labels; expected to be the binary classes 0 and 1.
        y_pred: Predicted labels for the same samples.
        name: Value stored under the ``"Model"`` key of the returned row.

    Returns:
        dict with keys ``Model``, ``Accuracy``, ``Prec_0``, ``Rec_0``, ``F1_0``,
        ``Prec_1``, ``Rec_1``, ``F1_1``, ``MacroF1`` and ``WeightedF1``.
    """
    # labels=[0, 1] pins both classes in the report, so the "0"/"1" lookups below
    # cannot raise KeyError when a class is missing from y_true and y_pred alike;
    # zero_division=0 makes the metrics of such an absent class 0.
    rep = classification_report(
        y_true, y_pred, labels=[0, 1], output_dict=True, zero_division=0
    )

    row = {"Model": name, "Accuracy": accuracy_score(y_true, y_pred)}
    for cls in ("0", "1"):
        stats = rep[cls]
        row[f"Prec_{cls}"] = stats["precision"]
        row[f"Rec_{cls}"] = stats["recall"]
        row[f"F1_{cls}"] = stats["f1-score"]

    # Unweighted mean of the two per-class F1 scores.
    row["MacroF1"] = (rep["0"]["f1-score"] + rep["1"]["f1-score"]) / 2
    row["WeightedF1"] = rep["weighted avg"]["f1-score"]
    return row

def _positive_class_scores(model, X_in):
    """Return continuous scores for *X_in*, or ``None`` if *model* offers none.

    Column 1 of ``predict_proba`` is used when the model has that method;
    otherwise the raw ``decision_function`` output is returned.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_in)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X_in)
    return None


def plot_roc_pr(models, y_test, title_suffix=""):
    """Draw ROC and precision-recall curves for several models side by side.

    Args:
        models: Iterable of ``(name, model, X_in)`` tuples, where ``X_in`` is the
            test matrix passed to that model's scoring method. Models with
            neither ``predict_proba`` nor ``decision_function`` are skipped.
        y_test: True labels shared by every model.
        title_suffix: Text appended to both panel titles.
    """
    # Score each model exactly once and reuse the result for both panels. This
    # avoids a second predict_proba/decision_function pass per model and also
    # works when `models` is a one-shot iterable such as a generator.
    scored = []
    for name, model, X_in in models:
        y_score = _positive_class_scores(model, X_in)
        if y_score is not None:
            scored.append((name, y_score))

    plt.figure(figsize=(12, 5))

    # Left panel: ROC curves with the chance diagonal for reference.
    plt.subplot(1, 2, 1)
    for name, y_score in scored:
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC Curve {title_suffix}")
    plt.legend(loc="lower right")

    # Right panel: precision-recall curves labelled with average precision.
    plt.subplot(1, 2, 2)
    for name, y_score in scored:
        precision, recall, _ = precision_recall_curve(y_test, y_score)
        ap = average_precision_score(y_test, y_score)
        plt.plot(recall, precision, label=f"{name} (AP={ap:.3f})")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"Precision-Recall Curve {title_suffix}")
    plt.legend(loc="lower left")

    plt.tight_layout()
    plt.show()

def plot_cm(y_true, y_pred, title):
    """Show an annotated confusion-matrix heatmap for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Figure title.

    The axis tick labels are hard-coded for two classes.
    """
    class_names = ["0 (Dissatisfied)", "1 (Satisfied)"]
    tick_positions = [0.5, 1.5]

    counts = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4.8, 4.2))
    sns.heatmap(counts, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks(tick_positions, class_names)
    plt.yticks(tick_positions, class_names, rotation=0)
    plt.tight_layout()
    plt.show()

# ============================ Cross-Validation with Repeats ============================
# Repeated stratified K-fold evaluation. In every fold the training split is
# oversampled with SMOTE, then standardised, then projected with PCA; the test
# split only goes through transform(), so nothing is fitted on held-out rows.
# Each model appends one metrics row per fold to `rows`.
rows = []
smote = SMOTE(random_state=42, k_neighbors=5)
scaler = StandardScaler()
pca = PCA(n_components=PCA_VARIANCE, random_state=42)

rkf = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=42)
total_folds = N_SPLITS * N_REPEATS

for fold_num, (train_idx, test_idx) in enumerate(rkf.split(X, y), start=1):
    # Recover the 1-based repeat and within-repeat fold from the running counter.
    repeat_idx, split_idx = divmod(fold_num - 1, N_SPLITS)
    repeat_num, split_num = repeat_idx + 1, split_idx + 1
    is_last_fold = fold_num == total_folds
    print(f"\n===== Repeat {repeat_num}, Fold {split_num} =====")

    X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
    y_train, y_test = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()

    # Apply SMOTE (training split only)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

    # Scale (fit on the resampled training data, reuse on the test split)
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # PCA (same fit-on-train / transform-on-test discipline)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # ==================== Models ====================
    # SVC(probability=True) runs an additional internal 5-fold cross-validation
    # inside fit() to calibrate predict_proba. Only predict() is used for the
    # metrics, so calibration is enabled solely on the final fold, whose models
    # are the ones kept after the loop.
    knn_np = KNeighborsClassifier(n_neighbors=N_NEIGHBORS).fit(X_train_scaled, y_train_res)
    rows.append(summarize(y_test, knn_np.predict(X_test_scaled), f"KNN (Scaled, R{repeat_num}F{split_num})"))

    logreg_np = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train_res)
    rows.append(summarize(y_test, logreg_np.predict(X_test_scaled), f"LogReg (Scaled, R{repeat_num}F{split_num})"))

    svm_np = SVC(kernel='rbf', probability=is_last_fold, random_state=42).fit(X_train_scaled, y_train_res)
    rows.append(summarize(y_test, svm_np.predict(X_test_scaled), f"SVM (Scaled, R{repeat_num}F{split_num})"))

    knn_pca = KNeighborsClassifier(n_neighbors=N_NEIGHBORS).fit(X_train_pca, y_train_res)
    rows.append(summarize(y_test, knn_pca.predict(X_test_pca), f"KNN (PCA, R{repeat_num}F{split_num})"))

    logreg_pca = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_pca, y_train_res)
    rows.append(summarize(y_test, logreg_pca.predict(X_test_pca), f"LogReg (PCA, R{repeat_num}F{split_num})"))

    svm_pca = SVC(kernel='rbf', probability=is_last_fold, random_state=42).fit(X_train_pca, y_train_res)
    rows.append(summarize(y_test, svm_pca.predict(X_test_pca), f"SVM (PCA, R{repeat_num}F{split_num})"))

    # Tree ensembles train on the resampled but unscaled features.
    rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=N_JOBS_PAR)
    rf.fit(X_train_res, y_train_res)
    rows.append(summarize(y_test, rf.predict(X_test), f"RF (Tabular, R{repeat_num}F{split_num})"))

    if HAS_XGB:
        xgb_tab = XGBClassifier(
            objective="binary:logistic", eval_metric="logloss", tree_method="hist",
            n_estimators=500, max_depth=6, learning_rate=0.1,
            subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
            scale_pos_weight=1, random_state=42, n_jobs=N_JOBS_PAR
        )
        xgb_tab.fit(X_train_res, y_train_res)
        rows.append(summarize(y_test, xgb_tab.predict(X_test), f"XGB (Tabular, R{repeat_num}F{split_num})"))

    # Save last fold/repeat for visuals
    if is_last_fold:
        last_rf, last_svm, last_knn, last_logreg = rf, svm_np, knn_np, logreg_np
        last_xgb = xgb_tab if HAS_XGB else None
        last_y_test, last_X_test, last_X_test_scaled, last_X_train_res = y_test, X_test, X_test_scaled, X_train_res
        last_scaler, last_pca = scaler, pca

# ============================ Results ============================
# Persist the raw per-fold rows, then average them per model.
df_results = pd.DataFrame(rows)
df_results.to_csv("model_results_rkf.csv", index=False)

# Strip the per-fold tag so every fold of a model shares one group key,
# e.g. "KNN (Scaled, R2F3)" -> "KNN (Scaled)".
model_family = df_results["Model"].str.replace(r", R\d+F\d+\)$", ")", regex=True)
df_results_grouped = df_results.groupby(model_family).mean(numeric_only=True).round(3)
print("\n=== Average Results Across Folds & Repeats ===")
print(df_results_grouped.sort_values(["Accuracy","MacroF1"], ascending=False))

# ============================ Visualizations ============================
# 1) Accuracy / MacroF1 Leaderboard
# Plot the per-model means (indexed by model name) rather than the raw per-fold
# rows, so the chart has one labelled bar per model.
df_plot = df_results_grouped[["Accuracy","MacroF1"]].sort_values("Accuracy", ascending=False)
plt.figure(figsize=(10,5))
df_plot["Accuracy"].plot(kind="bar")
plt.title("Model Accuracy (higher is better, SMOTE applied)")
plt.ylabel("Accuracy")
plt.xticks(rotation=20, ha="right")
plt.tight_layout(); plt.show()

plt.figure(figsize=(10,5))
# Ranked by its own metric so the best Macro-F1 is leftmost.
df_plot["MacroF1"].sort_values(ascending=False).plot(kind="bar")
plt.title("Model Macro-F1 (averaged F1 of both classes, SMOTE applied)")
plt.ylabel("Macro-F1")
plt.xticks(rotation=20, ha="right")
plt.tight_layout(); plt.show()

# 2) Confusion Matrices for last fold/repeat
# RF and XGB predict from the raw test features, the SVM from the standardised ones.
plot_cm(
    last_y_test,
    last_rf.predict(last_X_test),
    "Confusion Matrix — RF (Tabular Non-PCA, SMOTE)",
)
if last_xgb is not None:
    plot_cm(
        last_y_test,
        last_xgb.predict(last_X_test),
        "Confusion Matrix — XGB (Tabular Non-PCA, SMOTE)",
    )
plot_cm(
    last_y_test,
    last_svm.predict(last_X_test_scaled),
    "Confusion Matrix — SVM (Scaled, SMOTE)",
)

# 3) Feature Importances
# Pair each training column with its importance and keep the 15 largest.
rf_imp = pd.DataFrame(
    {"Feature": last_X_train_res.columns, "Importance": last_rf.feature_importances_}
)
rf_imp = rf_imp.sort_values("Importance", ascending=False).head(15)
plt.figure(figsize=(8, 6))
sns.barplot(data=rf_imp, x="Importance", y="Feature")
plt.title("Top 15 RF Feature Importances (SMOTE)")
plt.tight_layout()
plt.show()

if last_xgb is not None:
    xgb_imp = pd.DataFrame(
        {"Feature": last_X_train_res.columns, "Importance": last_xgb.feature_importances_}
    )
    xgb_imp = xgb_imp.sort_values("Importance", ascending=False).head(15)
    plt.figure(figsize=(8, 6))
    sns.barplot(data=xgb_imp, x="Importance", y="Feature")
    plt.title("Top 15 XGB Feature Importances (SMOTE)")
    plt.tight_layout()
    plt.show()

# 4) PCA Variance
# Running total of the variance explained by the PCA fitted on the final fold.
cum = np.cumsum(last_pca.explained_variance_ratio_)
component_counts = range(1, len(cum) + 1)
plt.figure(figsize=(7, 4))
plt.plot(component_counts, cum, marker='o')
plt.axhline(0.90, linestyle='--', label='90%')
plt.axhline(0.95, linestyle='--', label='95%')
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA — Cumulative Explained Variance (SMOTE)")
plt.legend()
plt.tight_layout()
plt.show()
