In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
import sys
import joblib
import optuna
import mlflow
import mlflow.sklearn

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score, recall_score, roc_auc_score, confusion_matrix,
    mean_absolute_error, mean_squared_error, r2_score
)

from xgboost import XGBClassifier, XGBRegressor

# Access to src/
ROOT = Path("..").resolve()
sys.path.append(str(ROOT))

from src.mlflow_utils import setup_mlflow
from src.config import (
    DATA_PROCESSED_DIR, MODELS_DIR, THRESHOLD_HIGH_DELAY, RANDOM_STATE
)

setup_mlflow()

sns.set_theme(style="whitegrid")
pd.set_option("display.max_columns", None)

print("Notebook READY")

[MLflow] Creating experiment: us_flights_delay
──────────────────────────
[MLflow] Tracking URI : file:/app/mlruns
[MLflow] Experiment : us_flights_delay (ID=740368804610069892)
──────────────────────────
Notebook READY


In [2]:
train = pd.read_csv(DATA_PROCESSED_DIR / "train.csv")
val   = pd.read_csv(DATA_PROCESSED_DIR / "val.csv")
test  = pd.read_csv(DATA_PROCESSED_DIR / "test.csv")

train.shape, val.shape, test.shape

((119998, 31), (25714, 31), (25714, 31))

In [3]:
CATEGORICAL_COLS = ["carrier", "carrier_name", "airport", "airport_name"]

NUMERIC_COLS_CLF = [
    "year", "month", "arr_flights",
    "delay_rate", "avg_delay_per_flight",
    "cancel_rate", "divert_rate",
    "carrier_delay_per_flight",
    "weather_delay_per_flight",
    "nas_delay_per_flight",
    "security_delay_per_flight",
    "late_aircraft_delay_per_flight"
]

TARGET_CLF = "high_delay_risk"

NUMERIC_COLS_REG = [
    "year", "month", "arr_flights",
    "delay_rate",
    "cancel_rate", "divert_rate",
    "carrier_delay_per_flight",
    "weather_delay_per_flight",
    "nas_delay_per_flight",
    "security_delay_per_flight",
    "late_aircraft_delay_per_flight"
]

TARGET_REG = "avg_delay_per_flight"

In [4]:
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor_clf = ColumnTransformer([
    ("cat", cat_pipeline, CATEGORICAL_COLS),
    ("num", num_pipeline, NUMERIC_COLS_CLF)
])

preprocessor_reg = ColumnTransformer([
    ("cat", cat_pipeline, CATEGORICAL_COLS),
    ("num", num_pipeline, NUMERIC_COLS_REG)
])

In [5]:
def get_clf_data():
    return (
        train[CATEGORICAL_COLS + NUMERIC_COLS_CLF], train[TARGET_CLF],
        val[CATEGORICAL_COLS + NUMERIC_COLS_CLF],   val[TARGET_CLF],
        test[CATEGORICAL_COLS + NUMERIC_COLS_CLF],  test[TARGET_CLF]
    )

def get_reg_data():
    return (
        train[CATEGORICAL_COLS + NUMERIC_COLS_REG], train[TARGET_REG],
        val[CATEGORICAL_COLS + NUMERIC_COLS_REG],   val[TARGET_REG],
        test[CATEGORICAL_COLS + NUMERIC_COLS_REG],  test[TARGET_REG]
    )

def eval_reg(y_true, y_pred):
    return {
        "mae":  mean_absolute_error(y_true, y_pred),
        "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
        "r2":   r2_score(y_true, y_pred)
    }

In [6]:
X_train, y_train, X_val, y_val, X_test, y_test = get_clf_data()

log_reg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=RANDOM_STATE
)

pipe_logreg = Pipeline([
    ("preprocessor", preprocessor_clf),
    ("classifier", log_reg)
])

with mlflow.start_run(run_name="baseline_logreg"):
    pipe_logreg.fit(X_train, y_train)

    y_val_pred  = pipe_logreg.predict(X_val)
    y_val_proba = pipe_logreg.predict_proba(X_val)[:, 1]

    acc = accuracy_score(y_val, y_val_pred)
    rec = recall_score(y_val, y_val_pred)
    auc = roc_auc_score(y_val, y_val_proba)

    mlflow.log_params({"model":"logreg"})
    mlflow.log_metrics({"val_acc":acc,"val_rec":rec,"val_auc":auc})

    MODELS_DIR.mkdir(exist_ok=True)
    path_lr = MODELS_DIR/"logreg_baseline.joblib"
    joblib.dump(pipe_logreg, path_lr)
    mlflow.log_artifact(str(path_lr))

acc, rec, auc

(0.9979388659873999, 0.9973352464896997, 0.9999807826261374)

In [7]:
X_train_r, y_train_r, X_val_r, y_val_r, X_test_r, y_test_r = get_reg_data()

lin_reg = LinearRegression()

pipe_linreg = Pipeline([
    ("preprocessor", preprocessor_reg),
    ("regressor", lin_reg)
])

with mlflow.start_run(run_name="baseline_linreg"):
    pipe_linreg.fit(X_train_r, y_train_r)

    y_pred_val = pipe_linreg.predict(X_val_r)
    metrics = eval_reg(y_val_r, y_pred_val)

    mlflow.log_metrics(metrics)

    path_lr_reg = MODELS_DIR/"linreg_baseline.joblib"
    joblib.dump(pipe_linreg, path_lr_reg)
    mlflow.log_artifact(str(path_lr_reg))

metrics

{'mae': 2.283080104724543e-05,
 'rmse': 0.00030452921911005497,
 'r2': 0.9999999992069446}

In [8]:
XGB_CLF_SPACE = {
    "n_estimators": (300, 800),
    "max_depth": (3, 9),
    "learning_rate": (0.01, 0.2),
    "subsample": (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0),
    "min_child_weight": (1, 12),
    "gamma": (0.0, 5.0)
}

In [9]:
def objective_xgb_clf(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", *XGB_CLF_SPACE["n_estimators"]),
        "max_depth": trial.suggest_int("max_depth", *XGB_CLF_SPACE["max_depth"]),
        "learning_rate": trial.suggest_float("learning_rate", *XGB_CLF_SPACE["learning_rate"], log=True),
        "subsample": trial.suggest_float("subsample", *XGB_CLF_SPACE["subsample"]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", *XGB_CLF_SPACE["colsample_bytree"]),
        "min_child_weight": trial.suggest_int("min_child_weight", *XGB_CLF_SPACE["min_child_weight"]),
        "gamma": trial.suggest_float("gamma", *XGB_CLF_SPACE["gamma"]),
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        "eval_metric": "logloss"
    }

    model = XGBClassifier(**params)
    pipe = Pipeline([
        ("preprocessor", preprocessor_clf),
        ("classifier", model)
    ])

    pipe.fit(X_train, y_train)
    proba = pipe.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, proba)

In [10]:
study_clf = optuna.create_study(direction="maximize")
study_clf.optimize(objective_xgb_clf, n_trials=25)

study_clf.best_params, study_clf.best_value

[I 2025-12-08 18:59:00,367] A new study created in memory with name: no-name-4a69cf96-911f-4775-b321-c7fabbb57405
[I 2025-12-08 18:59:10,807] Trial 0 finished with value: 0.9999942257957546 and parameters: {'n_estimators': 714, 'max_depth': 3, 'learning_rate': 0.0173782807779963, 'subsample': 0.9359530623065841, 'colsample_bytree': 0.8496887646602237, 'min_child_weight': 8, 'gamma': 0.0075674527033159356}. Best is trial 0 with value: 0.9999942257957546.
[I 2025-12-08 18:59:17,232] Trial 1 finished with value: 0.9999937633455814 and parameters: {'n_estimators': 434, 'max_depth': 9, 'learning_rate': 0.03956816555640904, 'subsample': 0.674202972408012, 'colsample_bytree': 0.6187126691505483, 'min_child_weight': 6, 'gamma': 4.02248632385139}. Best is trial 0 with value: 0.9999942257957546.
[I 2025-12-08 18:59:24,932] Trial 2 finished with value: 0.9999940844915349 and parameters: {'n_estimators': 539, 'max_depth': 4, 'learning_rate': 0.07073493876122235, 'subsample': 0.8489160835264126, 'c

({'n_estimators': 797,
  'max_depth': 6,
  'learning_rate': 0.057353619802128775,
  'subsample': 0.8332287058274315,
  'colsample_bytree': 0.7412708773687853,
  'min_child_weight': 1,
  'gamma': 3.472562210524953},
 0.9999947781667947)

In [11]:
best_clf_params = study_clf.best_params.copy()
best_clf_params.update({
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
    "eval_metric": "logloss"
})

xgb_clf = XGBClassifier(**best_clf_params)

pipe_xgb_clf = Pipeline([
    ("preprocessor", preprocessor_clf),
    ("classifier", xgb_clf)
])

with mlflow.start_run(run_name="xgb_classifier_optuna"):
    pipe_xgb_clf.fit(X_train, y_train)

    val_proba = pipe_xgb_clf.predict_proba(X_val)[:, 1]
    val_pred  = (val_proba >= 0.5).astype(int)

    metrics = {
        "acc": accuracy_score(y_val, val_pred),
        "rec": recall_score(y_val, val_pred),
        "auc": roc_auc_score(y_val, val_proba),
    }

    mlflow.log_params(best_clf_params)
    mlflow.log_metrics(metrics)

    path_clf = MODELS_DIR/"xgb_classifier_optuna.joblib"
    joblib.dump(pipe_xgb_clf, path_clf)
    mlflow.log_artifact(str(path_clf))

metrics

{'acc': 0.9987166524072489,
 'rec': 0.998462642205596,
 'auc': 0.9999947781667947}

In [12]:
XGB_REG_SPACE = XGB_CLF_SPACE  # même search space

In [13]:
def objective_xgb_reg(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", *XGB_REG_SPACE["n_estimators"]),
        "max_depth": trial.suggest_int("max_depth", *XGB_REG_SPACE["max_depth"]),
        "learning_rate": trial.suggest_float("learning_rate", *XGB_REG_SPACE["learning_rate"], log=True),
        "subsample": trial.suggest_float("subsample", *XGB_REG_SPACE["subsample"]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", *XGB_REG_SPACE["colsample_bytree"]),
        "min_child_weight": trial.suggest_int("min_child_weight", *XGB_REG_SPACE["min_child_weight"]),
        "gamma": trial.suggest_float("gamma", *XGB_REG_SPACE["gamma"]),
        "random_state": RANDOM_STATE,
        "n_jobs": -1
    }

    model = XGBRegressor(**params)
    pipe = Pipeline([
        ("preprocessor", preprocessor_reg),
        ("regressor", model)
    ])

    pipe.fit(X_train_r, y_train_r)
    pred = pipe.predict(X_val_r)

    rmse = np.sqrt(mean_squared_error(y_val_r, pred))
    return -rmse  # minimisation

In [14]:
study_reg = optuna.create_study(direction="maximize")
study_reg.optimize(objective_xgb_reg, n_trials=25)

study_reg.best_params, study_reg.best_value

[I 2025-12-08 19:01:49,614] A new study created in memory with name: no-name-443a3424-92d5-441b-9252-1cc959d3a84a
[I 2025-12-08 19:01:57,437] Trial 0 finished with value: -4.96137997691394 and parameters: {'n_estimators': 519, 'max_depth': 3, 'learning_rate': 0.020175292951666028, 'subsample': 0.9311059489476211, 'colsample_bytree': 0.7098096058189693, 'min_child_weight': 8, 'gamma': 2.886257968862437}. Best is trial 0 with value: -4.96137997691394.
[I 2025-12-08 19:02:05,307] Trial 1 finished with value: -4.984388851248342 and parameters: {'n_estimators': 350, 'max_depth': 3, 'learning_rate': 0.027591805490018127, 'subsample': 0.8481882021081653, 'colsample_bytree': 0.6120942491941856, 'min_child_weight': 11, 'gamma': 0.39965313208868647}. Best is trial 0 with value: -4.96137997691394.
[I 2025-12-08 19:02:30,971] Trial 2 finished with value: -5.036027133735411 and parameters: {'n_estimators': 609, 'max_depth': 9, 'learning_rate': 0.022399459460005718, 'subsample': 0.818177815027454, '

({'n_estimators': 665,
  'max_depth': 5,
  'learning_rate': 0.1993307949631522,
  'subsample': 0.7373637391705539,
  'colsample_bytree': 0.6443353874455423,
  'min_child_weight': 1,
  'gamma': 1.6937325634106157},
 -4.633253365325228)

In [15]:
best_reg_params = study_reg.best_params.copy()
best_reg_params.update({
    "random_state": RANDOM_STATE,
    "n_jobs": -1
})

xgb_reg = XGBRegressor(**best_reg_params)

pipe_xgb_reg = Pipeline([
    ("preprocessor", preprocessor_reg),
    ("regressor", xgb_reg)
])

with mlflow.start_run(run_name="xgb_regressor_optuna"):
    pipe_xgb_reg.fit(X_train_r, y_train_r)

    y_pred_val = pipe_xgb_reg.predict(X_val_r)
    metrics = eval_reg(y_val_r, y_pred_val)

    mlflow.log_params(best_reg_params)
    mlflow.log_metrics(metrics)

    path_reg = MODELS_DIR/"xgb_regressor_optuna.joblib"
    joblib.dump(pipe_xgb_reg, path_reg)
    mlflow.log_artifact(str(path_reg))

metrics

{'mae': 0.5081514188601832,
 'rmse': 4.633253365325228,
 'r2': 0.8164232415805031}

In [16]:
print("TRAINING PIPELINE COMPLETE")

TRAINING PIPELINE COMPLETE
