# 02 ‚Äî Load Data (SQL JOIN) + EDA + Train/Test Split

This notebook:
- loads the normalized DB using a JOIN query
- runs basic EDA (class balance, missing values, correlations)
- decides whether to stratify (we stratify by the label)
- trains a baseline model pipeline (Logistic Regression)
- evaluates with **3-fold CV F1** and test-set F1
- saves the model to `models/global_best_model.pkl`


In [12]:
# 02 ‚Äî Train Models (No Optuna) + Log 8 Experiments to DagsHub/MLflow

from pathlib import Path
import json
import sqlite3
from datetime import datetime

import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
import joblib

# DagsHub + MLflow
import dagshub
dagshub.init(repo_owner='Aayushnepal09', repo_name='my-first-repo', mlflow=True)

import mlflow

SEED = 42
np.random.seed(SEED)

PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent

import sys
sys.path.append(str(PROJECT_ROOT))

DB_PATH = PROJECT_ROOT / "data" / "airline.db"
MODELS_DIR = PROJECT_ROOT / "models"
METRICS_DIR = PROJECT_ROOT / "models" / "metrics"

MODELS_DIR.mkdir(parents=True, exist_ok=True)
METRICS_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DB_PATH:", DB_PATH, "exists=", DB_PATH.exists())

PROJECT_ROOT: c:\Users\nepal\OneDrive\Desktop\airline_satisfaction_appp
DB_PATH: c:\Users\nepal\OneDrive\Desktop\airline_satisfaction_appp\data\airline.db exists= True


In [13]:
print("sys.path:", sys.path)

sys.path: ['C:\\Users\\nepal\\AppData\\Local\\Programs\\Python\\Python312\\python312.zip', 'C:\\Users\\nepal\\AppData\\Local\\Programs\\Python\\Python312\\DLLs', 'C:\\Users\\nepal\\AppData\\Local\\Programs\\Python\\Python312\\Lib', 'C:\\Users\\nepal\\AppData\\Local\\Programs\\Python\\Python312', 'c:\\Users\\nepal\\OneDrive\\Desktop\\airline_satisfaction_appp\\.venv', '', 'c:\\Users\\nepal\\OneDrive\\Desktop\\airline_satisfaction_appp\\.venv\\Lib\\site-packages', 'c:\\Users\\nepal\\OneDrive\\Desktop\\airline_satisfaction_appp', 'c:\\Users\\nepal\\OneDrive\\Desktop\\airline_satisfaction_appp']


## Class balance
We stratify the train/test split by `satisfaction_binary` to preserve this distribution.


In [14]:
# Load training dataframe from the normalized DB
if not DB_PATH.exists():
    raise FileNotFoundError(
        f"Database not found at {DB_PATH}. Run 01_create_database.ipynb first."
    )

conn = sqlite3.connect(DB_PATH)

query = '''
SELECT
    p.gender,
    p.customer_type,
    p.age,
    t.type_of_travel,
    t.travel_class,
    t.flight_distance,
    s.inflight_wifi_service,
    s.departure_arrival_time_convenient,
    s.ease_of_online_booking,
    s.gate_location,
    s.food_and_drink,
    s.online_boarding,
    s.seat_comfort,
    s.inflight_entertainment,
    s.on_board_service,
    s.leg_room_service,
    s.baggage_handling,
    s.checkin_service,
    s.inflight_service,
    s.cleanliness,
    d.departure_delay_minutes,
    d.arrival_delay_minutes,
    sat.satisfaction_binary
FROM trip t
JOIN passenger p ON t.passenger_id = p.passenger_id
JOIN service_rating s ON s.trip_id = t.trip_id
JOIN delay d ON d.trip_id = t.trip_id
JOIN satisfaction sat ON sat.trip_id = t.trip_id
;
'''
df = pd.read_sql_query(query, conn)
conn.close()

display(df.head())
display(df["satisfaction_binary"].value_counts())


Unnamed: 0,gender,customer_type,age,type_of_travel,travel_class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,...,inflight_entertainment,on_board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_minutes,arrival_delay_minutes,satisfaction_binary
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18,0
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6,0
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0,1
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9,0
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0,1


satisfaction_binary
0    58879
1    45025
Name: count, dtype: int64

## Missing values & basic checks

In [15]:
# Split
TARGET = "satisfaction_binary"
X = df.drop(columns=[TARGET])
y = df[TARGET].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=SEED,
    stratify=y
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)


Train size: (83123, 22) Test size: (20781, 22)


## Correlation matrix (numerical only)
This helps spot redundant features and strong relationships.


In [16]:
# Preprocessing (shared)
from housing_pipeline import build_preprocessing

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

def make_estimator(model_name: str, params: dict | None = None):
    params = params or {}
    if model_name == "logreg":
        return LogisticRegression(max_iter=2000, random_state=SEED, **params)
    if model_name == "ridge":
        return RidgeClassifier(random_state=SEED, **params)
    if model_name == "hgb":
        return HistGradientBoostingClassifier(random_state=SEED, **params)
    if model_name == "rf":
        return RandomForestClassifier(
            random_state=SEED,
            n_jobs=-1,
            **params
        )
    raise ValueError(f"Unknown model_name: {model_name}")

def build_model_pipeline(model_name: str, use_pca: bool):
    preprocess = build_preprocessing(use_pca=use_pca)
    est = make_estimator(model_name)
    return Pipeline([("preprocess", preprocess), ("model", est)])


## Data-profiling (optional)
If you have `ydata-profiling` installed, you can run a full report.


In [17]:
# Run 8 experiments:
# 4 algorithms √ó (PCA on/off) √ó (no tuning)

EXPERIMENTS = []
for model_name in ["logreg", "ridge", "hgb", "rf"]:
    for use_pca in [False, True]:
        EXPERIMENTS.append(
            dict(model_name=model_name, use_pca=use_pca, tuned=False)
        )

mlflow.set_experiment("airline_satisfaction_no_optuna")

results = []
best = None  # track best by test_f1

for exp in EXPERIMENTS:
    model_name = exp["model_name"]
    use_pca = exp["use_pca"]

    run_name = f"{model_name}__pca_{int(use_pca)}__notuned"
    pipe = build_model_pipeline(model_name, use_pca)

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model_name": model_name,
                "use_pca": int(use_pca),
                "tuned": 0,
                "cv_folds": cv.get_n_splits(),
                "seed": SEED,
            }
        )

        # CV F1
        cv_scores = cross_val_score(
            pipe, X_train, y_train,
            cv=cv,
            scoring="f1",
            n_jobs=-1
        )
        cv_f1_mean = float(np.mean(cv_scores))
        cv_f1_std = float(np.std(cv_scores))

        # Fit + test
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        test_f1 = float(f1_score(y_test, y_pred))

        mlflow.log_metrics({"cv_f1_mean": cv_f1_mean, "cv_f1_std": cv_f1_std, "test_f1": test_f1})

        # Save artifacts
        stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        model_path = MODELS_DIR / f"{model_name}__pca_{int(use_pca)}__notuned.pkl"
        metrics_path = METRICS_DIR / f"{model_name}__pca_{int(use_pca)}__notuned.metrics.json"

        joblib.dump(pipe, model_path)

        metrics = {
            "model_name": model_name,
            "use_pca": bool(use_pca),
            "tuned": False,
            "cv_f1_mean": cv_f1_mean,
            "cv_f1_std": cv_f1_std,
            "test_f1": test_f1,
            "model_path": str(model_path),
            "created_utc": stamp,
        }
        with open(metrics_path, "w", encoding="utf-8") as f:
            json.dump(metrics, f, indent=2)

        mlflow.log_artifact(str(model_path))
        mlflow.log_artifact(str(metrics_path))

        row = {
            "model_name": model_name,
            "use_pca": use_pca,
            "tuned": False,
            "cv_f1_mean": cv_f1_mean,
            "cv_f1_std": cv_f1_std,
            "test_f1": test_f1,
            "model_path": str(model_path),
        }
        results.append(row)

        if best is None or test_f1 > best["test_f1"]:
            best = row

results_df = pd.DataFrame(results).sort_values(["test_f1"], ascending=False)
display(results_df)
print("Best (no optuna):", best)


2025/12/19 20:45:52 INFO mlflow.tracking.fluent: Experiment with name 'airline_satisfaction_no_optuna' does not exist. Creating a new experiment.
  stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


üèÉ View run logreg__pca_0__notuned at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0/runs/1fc1dc72f69a40c58f7a9f12f155682a
üß™ View experiment at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0


  stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


üèÉ View run logreg__pca_1__notuned at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0/runs/49e3a1af303645aeb34a2e9ac063ba63
üß™ View experiment at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0


  stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


üèÉ View run ridge__pca_0__notuned at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0/runs/cf3f76c212b94176b7b879cd265a5832
üß™ View experiment at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0


  stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


üèÉ View run ridge__pca_1__notuned at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0/runs/8fe71a98228d4ca19fefe21b9f631f0b
üß™ View experiment at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0


  stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


üèÉ View run hgb__pca_0__notuned at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0/runs/8bc5748ee9b04592b0e96e635d19724c
üß™ View experiment at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0


  stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


üèÉ View run hgb__pca_1__notuned at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0/runs/b7bb8040694a46e29826b288be98eaa2
üß™ View experiment at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0


  stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


üèÉ View run rf__pca_0__notuned at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0/runs/652f04788d1e4bc9a4af12a9dee0fff6
üß™ View experiment at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0


  stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


üèÉ View run rf__pca_1__notuned at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0/runs/0547e42c7e5c49eb9e163602b176c055
üß™ View experiment at: https://dagshub.com/Aayushnepal09/my-first-repo.mlflow/#/experiments/0


Unnamed: 0,model_name,use_pca,tuned,cv_f1_mean,cv_f1_std,test_f1,model_path
4,hgb,False,False,0.957226,0.002441,0.95923,c:\Users\nepal\OneDrive\Desktop\airline_satisf...
6,rf,False,False,0.955692,0.002696,0.957724,c:\Users\nepal\OneDrive\Desktop\airline_satisf...
5,hgb,True,False,0.924573,0.004511,0.926849,c:\Users\nepal\OneDrive\Desktop\airline_satisf...
7,rf,True,False,0.920246,0.003845,0.92243,c:\Users\nepal\OneDrive\Desktop\airline_satisf...
0,logreg,False,False,0.853054,0.003637,0.855388,c:\Users\nepal\OneDrive\Desktop\airline_satisf...
2,ridge,False,False,0.849578,0.004225,0.852873,c:\Users\nepal\OneDrive\Desktop\airline_satisf...
1,logreg,True,False,0.850743,0.004345,0.852756,c:\Users\nepal\OneDrive\Desktop\airline_satisf...
3,ridge,True,False,0.846922,0.005103,0.849952,c:\Users\nepal\OneDrive\Desktop\airline_satisf...


Best (no optuna): {'model_name': 'hgb', 'use_pca': False, 'tuned': False, 'cv_f1_mean': 0.9572262900714119, 'cv_f1_std': 0.002441258394213917, 'test_f1': 0.9592296429778128, 'model_path': 'c:\\Users\\nepal\\OneDrive\\Desktop\\airline_satisfaction_appp\\models\\hgb__pca_0__notuned.pkl'}


### Data-cleanup task list (example)
- Impute missing `arrival_delay_minutes` (median)
- Ensure delay minutes are non-negative (clip at 0)
- Standardize categorical values (strip whitespace)
- Validate service score ranges (1‚Äì5)


In [18]:
# Save the best no-optuna model in the same filename used by the API
best_model_path = Path(best["model_path"])
best_pipe = joblib.load(best_model_path)

GLOBAL_BEST_PATH = MODELS_DIR / "global_best_model.pkl"
joblib.dump(best_pipe, GLOBAL_BEST_PATH)

print("‚úÖ Saved global best (no optuna) to:", GLOBAL_BEST_PATH)


‚úÖ Saved global best (no optuna) to: c:\Users\nepal\OneDrive\Desktop\airline_satisfaction_appp\models\global_best_model.pkl


In [19]:
# Optional: also save a small summary file
summary_path = METRICS_DIR / "summary_no_optuna.json"
summary = {
    "best_model_path": str(GLOBAL_BEST_PATH),
    "best_test_f1": float(best["test_f1"]),
    "n_experiments": len(results_df),
}
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

mlflow.end_run()
print("Saved summary:", summary_path)


Saved summary: c:\Users\nepal\OneDrive\Desktop\airline_satisfaction_appp\models\metrics\summary_no_optuna.json


In [20]:
# Quick check: load the global model and score it
pipe = joblib.load(GLOBAL_BEST_PATH)
pred = pipe.predict(X_test)
print("Global best test F1:", f1_score(y_test, pred))


Global best test F1: 0.9592296429778128


In [21]:
print('‚úÖ Done')


‚úÖ Done
