<a href="https://colab.research.google.com/github/20134571/20134571.github.io/blob/main/Airline_Satisfaction_ML_Mitigation_with_RF_optimiser_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
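"""
Created on Thu Sep 11 08:05:47 2025

@author: heidi

Airline passenger satisfaction classification using the delay-mitigation
dataset: compares KNN, logistic regression, SVM, random forest (baseline and
tuned) and, when available, XGBoost on scaled and PCA-reduced features.
"""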

# ============================ STABILITY (Windows/Spyder) ============================
# Set the OpenMP / MKL / NumExpr thread-count variables to "1" in one call.
# This runs before numpy and scikit-learn are imported further down.
import os

os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
    }
)

# ============================ Imports ============================
import re
import numpy as np
import pandas as pd
from google.colab import drive

from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# XGBoost (optional)
# HAS_XGB records whether XGBClassifier could be imported; the XGBoost section
# of the script is guarded by this flag.
try:
    from xgboost import XGBClassifier
except Exception as exc:
    # Any failure while importing simply disables the XGBoost runs.
    HAS_XGB = False
    print("ℹ️ Skipping XGBoost (import failed):", exc)
else:
    HAS_XGB = True

# ============================ Config ============================
# Passed as n_jobs to the random forests, the randomized search and XGBoost.
N_JOBS_PAR = 2          # keep modest to avoid freezes
# Passed to PCA as n_components (a fraction, not a component count).
PCA_VARIANCE = 0.95     # keep ~95% variance for PCA

# ============================ 1) Load & basic clean ============================
# NOTE(review): pandas is already imported in the imports section above, and
# `files` is not referenced anywhere in the visible script — confirm before removing.
import pandas as pd
from google.colab import files
# The cleaned CSV is read straight from GitHub over HTTPS (needs network access).
url = "https://raw.githubusercontent.com/20134571/AISKILLSET/main/airline_satisfaction_mitigation_arrival_cleaned.csv"
df = pd.read_csv(url)
print(df.head())

# Discard every row containing a missing value, then renumber the rows from 0.
df = df.dropna().reset_index(drop=True)

# Label column, and the columns that are excluded from the feature matrix in section 2.
target_col = "satisfaction"
drop_cols = ["Unnamed: 0", "id", "Arrival Delay in Minutes"]

# ============================ 2) Build X, y ============================
# Features are every column except the target and whichever drop_cols exist in df.
excluded = [col for col in drop_cols if col in df.columns]
excluded.append(target_col)
X = df.drop(columns=excluded).copy()
y = df[target_col].copy()

# ---- One-hot encode categorical ----
# object / category / bool columns are expanded into dummies (first level dropped).
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
if not cat_cols:
    print("No categorical columns detected — skipping one-hot encoding.")
else:
    # Only the first ten names are echoed to keep the log line short.
    preview = cat_cols[:10]
    print(f"One-hot encoding {len(cat_cols)} categorical columns: {preview}{' ...' if len(cat_cols)>10 else ''}")
    X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# ---- Sanitize column names ----
def _sanitize(s: str) -> str:
    s = str(s)
    s = re.sub(r"[^\w]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s

# Rename every feature column through _sanitize.
# NOTE(review): two different source names can sanitize to the same string;
# nothing here checks the result for duplicates.
X.columns = list(map(_sanitize, X.columns))
print(f"Features after encoding: {X.shape[1]}")

# ============================ 3) Unified split ============================
# One stratified 80/20 shuffle split; the resulting row indices are reused for
# every feature view below so all models see the same train/test rows.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, test_idx = next(sss.split(X, y))

# Unscaled tabular view and the matching labels.
X_train_tab = X.iloc[train_idx].copy()
X_test_tab  = X.iloc[test_idx].copy()
y_train = y.iloc[train_idx].copy()
y_test  = y.iloc[test_idx].copy()

# Non-PCA (scaled) view
# The scaler is fitted on the training rows only, then applied to both splits.
scaler = StandardScaler().fit(X_train_tab)
X_train_scaled = scaler.transform(X_train_tab)
X_test_scaled  = scaler.transform(X_test_tab)

# PCA view
# PCA is likewise fitted on the scaled training rows only. PCA_VARIANCE is a
# fraction, so the number of components is chosen by PCA rather than fixed here.
pca = PCA(n_components=PCA_VARIANCE, random_state=42).fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_test_pca  = pca.transform(X_test_scaled)

print(f"PCA reduced features: {X_train_pca.shape[1]}")

# ============================ 4) Train models ============================
def summarize(y_true, y_pred, name):
    """Build one results-table row for a fitted model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels for the same rows.
        name: Display name stored under the ``"Model"`` key.

    Returns:
        dict with the model name, accuracy, per-class precision/recall/F1 for
        the report entries ``"0"`` and ``"1"``, the mean of the two class F1
        scores, and the weighted-average F1.

    Raises:
        KeyError: If the classification report has no ``"0"`` or ``"1"`` entry.
    """
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)

    class_0 = report["0"]
    class_1 = report["1"]
    f1_class_0 = class_0["f1-score"]
    f1_class_1 = class_1["f1-score"]

    return {
        "Model": name,
        "Accuracy": accuracy,
        "Prec_0": class_0["precision"],
        "Rec_0": class_0["recall"],
        "F1_0": f1_class_0,
        "Prec_1": class_1["precision"],
        "Rec_1": class_1["recall"],
        "F1_1": f1_class_1,
        # Unweighted mean of the two per-class F1 scores.
        "MacroF1": (f1_class_0 + f1_class_1) / 2,
        "WeightedF1": report["weighted avg"]["f1-score"],
    }

rows = []


def _fit_and_record(model, X_fit, X_eval, label):
    """Fit ``model`` on ``X_fit``/``y_train``, append its test summary to ``rows``, return it."""
    model.fit(X_fit, y_train)
    rows.append(summarize(y_test, model.predict(X_eval), label))
    return model


# ---------- Non-PCA ----------
knn_np = _fit_and_record(
    KNeighborsClassifier(n_neighbors=9),
    X_train_scaled, X_test_scaled, "KNN (Non-PCA, Scaled)",
)
logreg_np = _fit_and_record(
    LogisticRegression(max_iter=1000, random_state=42),
    X_train_scaled, X_test_scaled, "LogReg (Non-PCA, Scaled)",
)
svm_np = _fit_and_record(
    SVC(kernel='rbf', random_state=42),
    X_train_scaled, X_test_scaled, "SVM (Non-PCA, Scaled)",
)

# ---------- PCA ----------
knn_pca = _fit_and_record(
    KNeighborsClassifier(n_neighbors=9),
    X_train_pca, X_test_pca, "KNN (PCA)",
)
logreg_pca = _fit_and_record(
    LogisticRegression(max_iter=1000, random_state=42),
    X_train_pca, X_test_pca, "LogReg (PCA)",
)
svm_pca = _fit_and_record(
    SVC(kernel='rbf', random_state=42),
    X_train_pca, X_test_pca, "SVM (PCA)",
)

# ---------- Random Forest (baseline) ----------
# Trained on the unscaled tabular features.
rf = _fit_and_record(
    RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=N_JOBS_PAR),
    X_train_tab, X_test_tab, "RF (Baseline, Tabular Non-PCA)",
)

# ---------- 🔹 Random Forest (Hyperparameter Tuning) ----------
# Candidate values for the randomized search: 3*4*3*3*3*2 = 648 combinations,
# of which n_iter=20 are sampled below.
rf_param_grid = {
    "n_estimators": [100, 200, 400],
    "max_depth": [None, 10, 20, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False]
}

# NOTE(review): both the forest and the search receive n_jobs=N_JOBS_PAR, so
# the two levels of parallelism can multiply — confirm this is intended given
# the "keep modest" note on N_JOBS_PAR.
rf_base = RandomForestClassifier(random_state=42, n_jobs=N_JOBS_PAR)

# 20 sampled candidates x 3-fold CV, ranked by accuracy on the training split only.
rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_param_grid,
    n_iter=20,
    cv=3,
    scoring="accuracy",
    random_state=42,
    n_jobs=N_JOBS_PAR,
    verbose=2
)

rf_search.fit(X_train_tab, y_train)
print("Best RF parameters:", rf_search.best_params_)
print("Best RF CV score:", rf_search.best_score_)

# The best candidate is scored once on the held-out test rows.
rf_tuned = rf_search.best_estimator_
rows.append(summarize(y_test, rf_tuned.predict(X_test_tab), "RF (Tuned, Tabular Non-PCA)"))

# ---------- XGBoost ----------
if HAS_XGB:
    # Ratio of label-0 to label-1 training rows, used as scale_pos_weight;
    # max(pos, 1) avoids division by zero when there are no label-1 rows.
    neg = int((y_train == 0).sum())
    pos = int((y_train == 1).sum())
    spw = neg / max(pos, 1)

    # Both XGBoost models share exactly the same hyperparameters.
    xgb_settings = dict(
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist",
        n_estimators=500,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        scale_pos_weight=spw,
        random_state=42,
        n_jobs=N_JOBS_PAR,
    )

    # Unscaled tabular features.
    xgb_tab = XGBClassifier(**xgb_settings)
    xgb_tab.fit(X_train_tab, y_train)
    rows.append(summarize(y_test, xgb_tab.predict(X_test_tab), "XGB (Tabular Non-PCA)"))

    # PCA-reduced features.
    xgb_pca = XGBClassifier(**xgb_settings)
    xgb_pca.fit(X_train_pca, y_train)
    rows.append(summarize(y_test, xgb_pca.predict(X_test_pca), "XGB (PCA)"))

# ============================ 5) Results table ============================
# One row per model, indexed by model name, metrics rounded to three decimals.
df_results = pd.DataFrame(rows).set_index("Model").round(3)

# Best models first: accuracy, then macro F1 as the tie-breaker.
ranked_results = df_results.sort_values(by=["Accuracy", "MacroF1"], ascending=False)
print("\n=== Results (Both PCA & Non-PCA; RF Baseline & Tuned) ===")
print(ranked_results)


   Gender      Customer Type  Age   Type of Travel     Class  Flight Distance  \
0    male     loyal customer   13  personal travel  eco plus              460   
1    male  disloyal customer   25  business travel  business              235   
2  female     loyal customer   26  business travel  business             1142   
3  female     loyal customer   25  business travel  business              562   
4    male     loyal customer   61  business travel  business              214   

   Inflight wifi service  Departure/Arrival time convenient  \
0                      3                                  4   
1                      3                                  2   
2                      2                                  2   
3                      2                                  5   
4                      3                                  3   

   Ease of Online booking  Gate location  ...  Leg room service  \
0                       3              1  ...                 3   




Best RF parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40, 'bootstrap': False}
Best RF CV score: 0.9619360357326862

=== Results (Both PCA & Non-PCA; RF Baseline & Tuned) ===
                                Accuracy  Prec_0  Rec_0   F1_0  Prec_1  Rec_1  \
Model                                                                           
XGB (Tabular Non-PCA)              0.963   0.962  0.974  0.968   0.965  0.949   
RF (Baseline, Tabular Non-PCA)     0.962   0.955  0.979  0.966   0.971  0.939   
RF (Tuned, Tabular Non-PCA)        0.962   0.955  0.979  0.967   0.972  0.940   
SVM (Non-PCA, Scaled)              0.951   0.949  0.966  0.957   0.955  0.932   
SVM (PCA)                          0.943   0.940  0.960  0.950   0.947  0.921   
XGB (PCA)                          0.941   0.943  0.953  0.948   0.938  0.925   
KNN (Non-PCA, Scaled)              0.926   0.912  0.961  0.936   0.946  0.879   
KNN (PCA)                