<a href="https://colab.research.google.com/github/20134571/20134571.github.io/blob/main/Airline_Satisfaction_Mitigation_Hyperparamters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 11 08:05:47 2025

@author: heidi
Using Delay Mitigation + Hyperparameter Tuning
"""

# ============================ STABILITY (Windows/Spyder) ============================
import os

# Cap the OpenMP, MKL and NumExpr thread pools at one thread each. This is
# done here, ahead of the numpy/pandas imports further down the file.
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
    }
)

# ============================ Imports ============================
import re
import numpy as np
import pandas as pd
from google.colab import drive

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# XGBoost (optional)
# HAS_XGB records whether XGBClassifier could be imported; the XGBoost section
# later in the script is guarded by this flag.
try:
    from xgboost import XGBClassifier
except Exception as exc:
    # Any import-time failure just disables the XGBoost section.
    HAS_XGB = False
    print("ℹ️ Skipping XGBoost (import failed):", exc)
else:
    HAS_XGB = True

# ============================ Config ============================
# Value handed to the n_jobs argument of the grid searches and of the
# Random Forest / XGBoost estimators further down.
N_JOBS_PAR = 2          # keep modest to avoid freezes
# Passed to PCA(n_components=...); a float below 1 asks PCA to keep as many
# components as are needed to explain that fraction of the variance.
PCA_VARIANCE = 0.95     # keep ~95% variance for PCA

# ============================ 1) Load & basic clean ============================
url = "https://raw.githubusercontent.com/20134571/AISKILLSET/main/airline_satisfaction_mitigation_arrival_cleaned.csv"
df = pd.read_csv(url)
print(df.head())

# Keep only complete rows, then renumber the index from zero.
df = df.dropna()
df = df.reset_index(drop=True)

target_col = "satisfaction"
drop_cols = ["Unnamed: 0", "id", "Arrival Delay in Minutes"]

# ============================ 2) Build X, y ============================
# Labels first, then the feature frame: the target column is always removed,
# the columns listed in drop_cols only when they are actually present.
y = df[target_col].copy()
X = df.drop(columns=[target_col]).drop(columns=drop_cols, errors="ignore").copy()

# One-hot encode the object / category / bool columns, dropping the first
# level of each so the dummies are not redundant.
cat_cols = list(X.select_dtypes(include=["object", "category", "bool"]).columns)
if len(cat_cols) > 0:
    print(f"One-hot encoding {len(cat_cols)} categorical columns")
    X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Sanitize column names for XGBoost
def _sanitize(s: str) -> str:
    """Return *s* as a string containing only word characters.

    The value is converted with ``str()``. Every run of non-word characters
    and/or underscores is treated as a single separator; the pieces between
    separators are joined with one ``_`` each, so the result has no leading,
    trailing or repeated underscores.
    """
    pieces = re.split(r"[\W_]+", str(s))
    # Splitting leaves empty strings where the text starts or ends with a
    # separator; dropping them is what trims the edge underscores.
    return "_".join(piece for piece in pieces if piece)
# Run every feature name through the sanitizer, then report the final width.
X.columns = list(map(_sanitize, X.columns))
print(f"Features after encoding: {X.shape[1]}")

# ============================ 3) Unified split ============================
# One stratified 80/20 shuffle split; the same row indices are reused for
# every feature representation below.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
[(train_idx, test_idx)] = sss.split(X, y)

X_train_tab, X_test_tab = (X.iloc[idx].copy() for idx in (train_idx, test_idx))
y_train, y_test = (y.iloc[idx].copy() for idx in (train_idx, test_idx))

# Scaled: statistics are learned on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_tab)
X_test_scaled = scaler.transform(X_test_tab)

# PCA: fitted on the scaled training rows, then applied to both parts.
pca = PCA(n_components=PCA_VARIANCE, random_state=42)
pca.fit(X_train_scaled)
X_train_pca, X_test_pca = (
    pca.transform(part) for part in (X_train_scaled, X_test_scaled)
)

print(f"PCA reduced features:    {X_train_pca.shape[1]}")

# ============================ 4) Training + Hypertuning ============================
def summarize(y_true, y_pred, name):
    """Build one results-table row for a model's predictions.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        name: Text stored under the "Model" key.

    Returns:
        dict with "Model", "Accuracy", the precision / recall / F1 of the
        report entries keyed "0" and "1", "MacroF1" (mean of those two F1
        values) and "WeightedF1".

    Raises:
        KeyError: if the classification report has no "0" or "1" entry.
    """
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    summary = {"Model": name, "Accuracy": accuracy_score(y_true, y_pred)}
    for label in ("0", "1"):
        stats = report[label]
        summary[f"Prec_{label}"] = stats["precision"]
        summary[f"Rec_{label}"] = stats["recall"]
        summary[f"F1_{label}"] = stats["f1-score"]

    summary["MacroF1"] = (summary["F1_0"] + summary["F1_1"]) / 2
    summary["WeightedF1"] = report["weighted avg"]["f1-score"]
    return summary

# ---------- Baseline models (no tuning, quick) ----------
# All three are fitted on the scaled (non-PCA) training features.
knn_np = KNeighborsClassifier(n_neighbors=9)
knn_np.fit(X_train_scaled, y_train)

logreg_np = LogisticRegression(max_iter=1000, random_state=42)
logreg_np.fit(X_train_scaled, y_train)

svm_np = SVC(kernel="rbf", random_state=42)
svm_np.fit(X_train_scaled, y_train)

# One summary row per baseline, scored on the scaled test features.
rows = [
    summarize(y_test, fitted.predict(X_test_scaled), label)
    for fitted, label in (
        (knn_np, "KNN (Non-PCA, Scaled)"),
        (logreg_np, "LogReg (Non-PCA, Scaled)"),
        (svm_np, "SVM (Non-PCA, Scaled)"),
    )
]

# ---------- Hypertuned Random Forest ----------
# Grid-search a Random Forest on the unscaled tabular features (5-fold CV),
# then score the refitted best estimator on the held-out test rows.
rf_params = dict(
    n_estimators=[200, 300, 500],
    max_depth=[None, 10, 20, 40],
    min_samples_split=[2, 5, 10],
)
rf = RandomForestClassifier(n_jobs=N_JOBS_PAR, random_state=42)
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=5,
    n_jobs=N_JOBS_PAR,
    verbose=1,
)
rf_grid.fit(X_train_tab, y_train)
best_rf = rf_grid.best_estimator_
rows.append(
    summarize(y_test, best_rf.predict(X_test_tab), "RF (Tuned, Tabular Non-PCA)")
)
print("Best RF params:", rf_grid.best_params_)

# ---------- Hypertuned Logistic Regression (PCA) ----------
# Grid-search the regularisation strength of an L2 logistic regression on the
# PCA features (3-fold CV), then score the best estimator on the test rows.
logreg_params = dict(C=[0.1, 1, 10], penalty=["l2"])
logreg = LogisticRegression(random_state=42, max_iter=2000)
logreg_grid = GridSearchCV(
    estimator=logreg,
    param_grid=logreg_params,
    cv=3,
    n_jobs=N_JOBS_PAR,
    verbose=1,
)
logreg_grid.fit(X_train_pca, y_train)
best_logreg = logreg_grid.best_estimator_
rows.append(
    summarize(y_test, best_logreg.predict(X_test_pca), "LogReg (Tuned, PCA)")
)
print("Best LogReg params:", logreg_grid.best_params_)

# ---------- Hypertuned SVM (PCA) ----------
# Grid-search C and the kernel of an SVM on the PCA features (3-fold CV), then
# score the refitted best estimator on the held-out test rows.
#
# Probability estimates are deliberately left off: probability=True makes
# every fit run an additional internal 5-fold cross-validation, while this
# script only ever calls predict(), whose output does not depend on that
# setting. This also matches the baseline SVC above. Pass probability=True
# again only if predict_proba is needed on best_svm.
svm = SVC(random_state=42)
svm_params = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
svm_grid = GridSearchCV(svm, svm_params, cv=3, n_jobs=N_JOBS_PAR, verbose=1)
svm_grid.fit(X_train_pca, y_train)
best_svm = svm_grid.best_estimator_
rows.append(summarize(y_test, best_svm.predict(X_test_pca), "SVM (Tuned, PCA)"))
print("Best SVM params:", svm_grid.best_params_)

# ---------- Hypertuned KNN (PCA) ----------
# Grid-search the neighbour count of a KNN classifier on the PCA features
# (3-fold CV), then score the best estimator on the test rows.
knn_params = dict(n_neighbors=[3, 5, 9, 15])
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=knn_params,
    cv=3,
    n_jobs=N_JOBS_PAR,
    verbose=1,
)
knn_grid.fit(X_train_pca, y_train)
best_knn = knn_grid.best_estimator_
rows.append(summarize(y_test, best_knn.predict(X_test_pca), "KNN (Tuned, PCA)"))
print("Best KNN params:", knn_grid.best_params_)

# ---------- Hypertuned XGBoost ----------
if HAS_XGB:
    # scale_pos_weight is the ratio of label-0 to label-1 training rows; if
    # there are no label-1 rows the divisor falls back to 1.
    neg = int((y_train == 0).sum())
    pos = int((y_train == 1).sum())
    spw = neg / pos if pos > 0 else float(neg)

    xgb_params = dict(
        n_estimators=[200, 300, 500],
        max_depth=[None, 10, 20, 40],
        learning_rate=[0.01, 0.1],
        subsample=[0.8, 1.0],
    )
    xgb = XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist",
        scale_pos_weight=spw,
        n_jobs=N_JOBS_PAR,
        random_state=42,
    )
    # 3-fold grid search on the unscaled tabular features; the refitted best
    # estimator is then scored on the held-out test rows.
    xgb_grid = GridSearchCV(
        estimator=xgb,
        param_grid=xgb_params,
        cv=3,
        n_jobs=N_JOBS_PAR,
        verbose=1,
    )
    xgb_grid.fit(X_train_tab, y_train)
    best_xgb = xgb_grid.best_estimator_
    rows.append(
        summarize(y_test, best_xgb.predict(X_test_tab), "XGB (Tuned, Tabular Non-PCA)")
    )
    print("Best XGB params:", xgb_grid.best_params_)

# ============================ 5) Results ============================
# Collect the per-model rows into one table indexed by model name, rounded to
# three decimals.
df_results = pd.DataFrame(rows)
df_results = df_results.set_index("Model").round(3)

# Show the best models first (accuracy, then macro F1 as tie-breaker).
print("\n=== Results (with Hypertuning) ===")
print(df_results.sort_values(by=["Accuracy", "MacroF1"], ascending=[False, False]))

# Persist the table (in its original, unsorted row order) as CSV.
df_results.to_csv("results_hypertuned.csv")
