In [8]:
# ===============================
# ✈️ Flight Delay Prediction Pipeline (with Optuna + LightGBM/XGBoost)
# ===============================

import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

# -------------------------------
# 1. Load Data
# -------------------------------
df = pd.read_csv(r"D:\Big data analysis\splited dataset\dataset_part_1.csv")

print("Available columns:", df.columns.tolist())
print(df.head())

# -------------------------------
# 2. Target column
# -------------------------------
target_col = "ARR_DEL15"
assert target_col in df.columns, f"Target column '{target_col}' not found!"

# Drop rows with missing target
df = df.dropna(subset=[target_col])

X = df.drop(columns=[target_col])
y = df[target_col]

# -------------------------------
# 3. Identify Problem Type
# -------------------------------
is_numeric_target = np.issubdtype(y.dtype, np.number)
problem_type = "regression" if is_numeric_target and y.nunique() > 20 else "classification"
print("Problem type detected:", problem_type)

# -------------------------------
# 4. Preprocessing
# -------------------------------
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    sparse_threshold=0.3,
)

# -------------------------------
# 5. Train-Test Split
# -------------------------------
if problem_type == "classification":
    # Drop rare classes (<2 samples)
    class_counts = y.value_counts()
    valid_classes = class_counts[class_counts > 1].index
    X = X[y.isin(valid_classes)]
    y = y[y.isin(valid_classes)]

    y = y.astype(str) if not is_numeric_target else y
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

# -------------------------------
# 6. Define Models
# -------------------------------
if problem_type == "classification":
    base_model_lgb = LGBMClassifier(random_state=42)
    base_model_xgb = XGBClassifier(
        random_state=42, eval_metric="logloss", use_label_encoder=False
    )
else:
    base_model_lgb = LGBMRegressor(random_state=42)
    base_model_xgb = XGBRegressor(random_state=42)

# -------------------------------
# 7. Optuna Optimization
# -------------------------------
def objective_lgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    model = (LGBMClassifier if problem_type == "classification" else LGBMRegressor)(**params, random_state=42)
    pipeline = Pipeline([("pre", preprocessor), ("model", model)])
    score = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1_macro" if problem_type=="classification" else "r2").mean()
    return score

def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    model = (XGBClassifier if problem_type == "classification" else XGBRegressor)(**params, random_state=42)
    pipeline = Pipeline([("pre", preprocessor), ("model", model)])
    score = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1_macro" if problem_type=="classification" else "r2").mean()
    return score

print("\n🔎 Optimizing LightGBM...")
study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(objective_lgb, n_trials=20)
print("Best LightGBM params:", study_lgb.best_params)

print("\n🔎 Optimizing XGBoost...")
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=20)
print("Best XGBoost params:", study_xgb.best_params)

# -------------------------------
# 8. Final Training & Evaluation
# -------------------------------
best_lgb = (LGBMClassifier if problem_type == "classification" else LGBMRegressor)(**study_lgb.best_params, random_state=42)
pipeline_lgb = Pipeline([("pre", preprocessor), ("model", best_lgb)])
pipeline_lgb.fit(X_train, y_train)

best_xgb = (XGBClassifier if problem_type == "classification" else XGBRegressor)(**study_xgb.best_params, random_state=42)
pipeline_xgb = Pipeline([("pre", preprocessor), ("model", best_xgb)])
pipeline_xgb.fit(X_train, y_train)

# -------------------------------
# 9. Evaluation
# -------------------------------
y_pred_lgb = pipeline_lgb.predict(X_test)
y_pred_xgb = pipeline_xgb.predict(X_test)

if problem_type == "classification":
    print("\n📊 LightGBM Accuracy:", accuracy_score(y_test, y_pred_lgb))
    print("LightGBM F1:", f1_score(y_test, y_pred_lgb, average="macro"))
    print("LightGBM ROC-AUC:", roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred_lgb), multi_class="ovr"))

    print("\n📊 XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
    print("XGBoost F1:", f1_score(y_test, y_pred_xgb, average="macro"))
    print("XGBoost ROC-AUC:", roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred_xgb), multi_class="ovr"))

else:
    print("\n📊 LightGBM RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lgb)))
    print("📊 XGBoost RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))


Available columns: ['MONTH', 'DAY_OF_MONTH', 'FL_DATE', 'MKT_CARRIER', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_DELAY', 'DEP_DELAY_NEW', 'DEP_DEL15', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'CRS_DEP_1hrpre', 'CRS_DEP_1hrpost', 'DEP_1hrpre_num', 'DEP_1hrpost_num', 'Arr_1hrpre_num', 'Arr_1hrpost_num', 'max_temp_f', 'min_temp_f', 'max_dewpoint_f', 'min_dewpoint_f', 'precip_in', 'avg_wind_speed_kts', 'snow_in', 'avg_feel', 'FAA_class', 'scheduled_Turnarnd', 'Actual_Turnarnd', 'Diff_in_turnarnd', 'longTurnaround', 'Scheduled_DEP', 'Scheduled_ARR_Ori', 'Actual_ARR_dt_Ori', 'Scheduled_DEP_EST', 'Actual_DEP_dt_EST', 'Scheduled_ARR_EST', 'Actual_ARR_dt_EST', 'Scheduled_ARR_Local', 'Actual_ARR_dt_Local', 'late_airjet_when_turnaround_within_180', 'affected_turnaround_lessthan120', 'affected_turnaround_lessthan90', 'affected_turnaround_lessthan60

[I 2025-08-22 17:35:43,399] A new study created in memory with name: no-name-1a5c90e3-e3c3-4ff4-8e4f-b6a1dae74846



🔎 Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.360731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.234758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=-1.310012
[LightGBM] [Info] Start trai

[I 2025-08-22 17:38:00,315] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 438, 'learning_rate': 0.06768864974136239, 'max_depth': 3, 'num_leaves': 34, 'subsample': 0.869272799826666, 'colsample_bytree': 0.6370456926406775}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.243215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.243182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=-1.310012
[LightGBM] [Info] Start training from score -1.310012


[I 2025-08-22 17:39:54,001] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 426, 'learning_rate': 0.25139262230763043, 'max_depth': 6, 'num_leaves': 83, 'subsample': 0.7811441345306431, 'colsample_bytree': 0.6675335979265411}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.191664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.176643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=-1.310012
[LightGBM] [Info] Start training from score -1.310012


[I 2025-08-22 17:41:35,751] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 296, 'learning_rate': 0.10827728921392896, 'max_depth': 5, 'num_leaves': 124, 'subsample': 0.7534398315400146, 'colsample_bytree': 0.8762724436195863}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.158675 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.159456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=-1.310012
[LightGBM] [Info] Start training from score -1.310012


[I 2025-08-22 17:43:24,656] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 477, 'learning_rate': 0.2152301662916704, 'max_depth': 9, 'num_leaves': 74, 'subsample': 0.8632195519790224, 'colsample_bytree': 0.7190053306506845}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.325170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.140343 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=

[I 2025-08-22 17:45:07,747] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 456, 'learning_rate': 0.19634432343861646, 'max_depth': 4, 'num_leaves': 109, 'subsample': 0.6662377864437261, 'colsample_bytree': 0.8750430949378758}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.145186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.347646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=

[I 2025-08-22 17:47:52,527] Trial 5 finished with value: 1.0 and parameters: {'n_estimators': 201, 'learning_rate': 0.07851130760471856, 'max_depth': 7, 'num_leaves': 143, 'subsample': 0.5990589810926948, 'colsample_bytree': 0.6632277402246229}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.281859 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.258824 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=-1.310012
[LightGBM] [Info] Start training from score -1.310012


[I 2025-08-22 17:49:40,048] Trial 6 finished with value: 1.0 and parameters: {'n_estimators': 410, 'learning_rate': 0.10402557406516663, 'max_depth': 4, 'num_leaves': 135, 'subsample': 0.9909619543050044, 'colsample_bytree': 0.758581668887707}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.287187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.213208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=-1.310012
[LightGBM] [Info] Start training from score -1.310012


[I 2025-08-22 17:51:34,806] Trial 7 finished with value: 1.0 and parameters: {'n_estimators': 347, 'learning_rate': 0.11578841863546467, 'max_depth': 9, 'num_leaves': 63, 'subsample': 0.580965993995026, 'colsample_bytree': 0.6784410002328249}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 61715, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.207427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28721
[LightGBM] [Info] Number of data points in the train set: 290448, number of used features: 12100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212482 -> initscore=-1.310028
[LightGBM] [Info] Start training from score -1.310028
[LightGBM] [Info] Number of positive: 61716, number of negative: 228733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.501307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28687
[LightGBM] [Info] Number of data points in the train set: 290449, number of used features: 12081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212485 -> initscore=-1.310012
[LightGBM] [Info] Start training from score -1.310012


[I 2025-08-22 17:54:11,764] Trial 8 finished with value: 1.0 and parameters: {'n_estimators': 241, 'learning_rate': 0.10412160677493651, 'max_depth': 6, 'num_leaves': 56, 'subsample': 0.6361254711045747, 'colsample_bytree': 0.7443053592549143}. Best is trial 0 with value: 1.0.
[W 2025-08-22 17:54:33,896] Trial 9 failed with parameters: {'n_estimators': 238, 'learning_rate': 0.08949764710119973, 'max_depth': 8, 'num_leaves': 97, 'subsample': 0.8329739980533335, 'colsample_bytree': 0.6703632128946557} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Local\Temp\ipykernel_7576\1147980643.py", line 116, in objective_lgb
    score = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1_macro" if problem_type=="classification" else

KeyboardInterrupt: 

In [11]:
# ===============================
# ✈️ Flight Delay Prediction Pipeline (GPU + Memory-Efficient)
# ===============================

import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# -------------------------------
# 1. Load Data
# -------------------------------
df = pd.read_csv(r"D:\Big data analysis\splited dataset\dataset_part_1.csv")
print("Available columns:", df.columns.tolist())
print(df.head())

# -------------------------------
# 2. Target Column
# -------------------------------
target_col = "ARR_DEL15"
assert target_col in df.columns, f"Target column '{target_col}' not found!"

df = df.dropna(subset=[target_col])
X = df.drop(columns=[target_col])
y = df[target_col]

# -------------------------------
# 3. Identify Problem Type
# -------------------------------
is_numeric_target = np.issubdtype(y.dtype, np.number)
problem_type = "regression" if is_numeric_target and y.nunique() > 20 else "classification"
print("Problem type detected:", problem_type)

# -------------------------------
# 4. Preprocessing
# -------------------------------
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True))  # fixed argument
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
], sparse_threshold=0.3)

# -------------------------------
# 5. Train-Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Further split for Optuna holdout validation
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# -------------------------------
# 6. Optuna Objectives
# -------------------------------
def objective_lgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    model = LGBMRegressor(**params, random_state=42, device='gpu')
    pipeline = Pipeline([("pre", preprocessor), ("model", model)])
    pipeline.fit(X_train_sub, y_train_sub)
    y_pred = pipeline.predict(X_val)
    return r2_score(y_val, y_pred)

def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    model = XGBRegressor(**params, random_state=42, tree_method='gpu_hist', gpu_id=0)
    pipeline = Pipeline([("pre", preprocessor), ("model", model)])
    pipeline.fit(X_train_sub, y_train_sub)
    y_pred = pipeline.predict(X_val)
    return r2_score(y_val, y_pred)

# -------------------------------
# 7. Optuna Hyperparameter Tuning
# -------------------------------
print("\n🔎 Optimizing LightGBM...")
study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(objective_lgb, n_trials=10)
print("Best LightGBM params:", study_lgb.best_params)

print("\n🔎 Optimizing XGBoost...")
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=10)
print("Best XGBoost params:", study_xgb.best_params)

# -------------------------------
# 8. Final Training
# -------------------------------
best_lgb = LGBMRegressor(**study_lgb.best_params, random_state=42, device='gpu')
pipeline_lgb = Pipeline([("pre", preprocessor), ("model", best_lgb)])
pipeline_lgb.fit(X_train, y_train)

best_xgb = XGBRegressor(**study_xgb.best_params, random_state=42, tree_method='gpu_hist', gpu_id=0)
pipeline_xgb = Pipeline([("pre", preprocessor), ("model", best_xgb)])
pipeline_xgb.fit(X_train, y_train)

# -------------------------------
# 9. Predictions & Evaluation
# -------------------------------
y_pred_lgb = pipeline_lgb.predict(X_test)
y_pred_xgb = pipeline_xgb.predict(X_test)

print("\n📊 LightGBM Metrics:")
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lgb)))
print("MAE:", mean_absolute_error(y_test, y_pred_lgb))
print("R2:", r2_score(y_test, y_pred_lgb))

print("\n📊 XGBoost Metrics:")
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))
print("MAE:", mean_absolute_error(y_test, y_pred_xgb))
print("R2:", r2_score(y_test, y_pred_xgb))

# -------------------------------
# Train vs Test RMSE to check overfitting
# -------------------------------
train_pred_lgb = pipeline_lgb.predict(X_train)
train_pred_xgb = pipeline_xgb.predict(X_train)
print("\nTrain RMSE LightGBM:", np.sqrt(mean_squared_error(y_train, train_pred_lgb)))
print("Train RMSE XGBoost:", np.sqrt(mean_squared_error(y_train, train_pred_xgb)))

# -------------------------------
# 10. Feature Importance (LightGBM)
# -------------------------------
feature_names = num_cols + preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out().tolist()
importances = best_lgb.feature_importances_

plt.figure(figsize=(10,12))
plt.barh(feature_names, importances)
plt.title("LightGBM Feature Importances")
plt.show()

# -------------------------------
# 11. Save Models
# -------------------------------
joblib.dump(pipeline_lgb, "lgb_pipeline_gpu.pkl")
joblib.dump(pipeline_xgb, "xgb_pipeline_gpu.pkl")


Available columns: ['MONTH', 'DAY_OF_MONTH', 'FL_DATE', 'MKT_CARRIER', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_DELAY', 'DEP_DELAY_NEW', 'DEP_DEL15', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'CRS_DEP_1hrpre', 'CRS_DEP_1hrpost', 'DEP_1hrpre_num', 'DEP_1hrpost_num', 'Arr_1hrpre_num', 'Arr_1hrpost_num', 'max_temp_f', 'min_temp_f', 'max_dewpoint_f', 'min_dewpoint_f', 'precip_in', 'avg_wind_speed_kts', 'snow_in', 'avg_feel', 'FAA_class', 'scheduled_Turnarnd', 'Actual_Turnarnd', 'Diff_in_turnarnd', 'longTurnaround', 'Scheduled_DEP', 'Scheduled_ARR_Ori', 'Actual_ARR_dt_Ori', 'Scheduled_DEP_EST', 'Actual_DEP_dt_EST', 'Scheduled_ARR_EST', 'Actual_ARR_dt_EST', 'Scheduled_ARR_Local', 'Actual_ARR_dt_Local', 'late_airjet_when_turnaround_within_180', 'affected_turnaround_lessthan120', 'affected_turnaround_lessthan90', 'affected_turnaround_lessthan60

[I 2025-08-22 18:02:39,088] A new study created in memory with name: no-name-3cba2693-8abf-4c83-b84b-c7482de1995b



🔎 Optimizing LightGBM...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.021327 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:03:46,531] Trial 0 finished with value: 0.9999759109583661 and parameters: {'n_estimators': 418, 'learning_rate': 0.08081786565086824, 'max_depth': 10, 'num_leaves': 77, 'subsample': 0.5374225823538743, 'colsample_bytree': 0.7371387974401205}. Best is trial 0 with value: 0.9999759109583661.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.016190 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:04:25,630] Trial 1 finished with value: 0.9999999999972491 and parameters: {'n_estimators': 258, 'learning_rate': 0.10349854771023366, 'max_depth': 3, 'num_leaves': 70, 'subsample': 0.7739793053599122, 'colsample_bytree': 0.8852195435981047}. Best is trial 1 with value: 0.9999999999972491.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.016226 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:05:21,109] Trial 2 finished with value: 0.9999301518982041 and parameters: {'n_estimators': 272, 'learning_rate': 0.10991432391137075, 'max_depth': 12, 'num_leaves': 64, 'subsample': 0.5451884016551192, 'colsample_bytree': 0.6479145389138165}. Best is trial 1 with value: 0.9999999999972491.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.013969 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:06:06,380] Trial 3 finished with value: 0.9999213415875825 and parameters: {'n_estimators': 156, 'learning_rate': 0.0494572727794512, 'max_depth': 4, 'num_leaves': 88, 'subsample': 0.7567609954418939, 'colsample_bytree': 0.5974545378154269}. Best is trial 1 with value: 0.9999999999972491.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.015049 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:06:47,869] Trial 4 finished with value: 0.9995956986235597 and parameters: {'n_estimators': 119, 'learning_rate': 0.03232758569548363, 'max_depth': 10, 'num_leaves': 78, 'subsample': 0.610067672542209, 'colsample_bytree': 0.8692524281808771}. Best is trial 1 with value: 0.9999999999972491.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.015221 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:07:48,479] Trial 5 finished with value: 0.9998272509059123 and parameters: {'n_estimators': 288, 'learning_rate': 0.24698173413284852, 'max_depth': 9, 'num_leaves': 108, 'subsample': 0.5290849679620627, 'colsample_bytree': 0.5944350306469077}. Best is trial 1 with value: 0.9999999999972491.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.013595 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:08:36,113] Trial 6 finished with value: 0.9999603012598909 and parameters: {'n_estimators': 345, 'learning_rate': 0.14860900629239557, 'max_depth': 6, 'num_leaves': 42, 'subsample': 0.8258895418841569, 'colsample_bytree': 0.674276215920136}. Best is trial 1 with value: 0.9999999999972491.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.019256 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:09:19,924] Trial 7 finished with value: 0.9999853624275276 and parameters: {'n_estimators': 273, 'learning_rate': 0.1852357962421503, 'max_depth': 5, 'num_leaves': 135, 'subsample': 0.8527504165907394, 'colsample_bytree': 0.7089450671166493}. Best is trial 1 with value: 0.9999999999972491.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.016521 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:10:47,180] Trial 8 finished with value: 0.9999947257392074 and parameters: {'n_estimators': 159, 'learning_rate': 0.21846787312324026, 'max_depth': 12, 'num_leaves': 44, 'subsample': 0.7077331698390275, 'colsample_bytree': 0.8259677272981025}. Best is trial 1 with value: 0.9999999999972491.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 29862
[LightGBM] [Info] Number of data points in the train set: 348538, number of used features: 12622
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 31 dense feature groups (10.64 MB) transferred to GPU in 0.020453 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.211624


[I 2025-08-22 18:12:50,263] Trial 9 finished with value: 0.9998677485001323 and parameters: {'n_estimators': 297, 'learning_rate': 0.1921461571529116, 'max_depth': 12, 'num_leaves': 45, 'subsample': 0.7651390413541457, 'colsample_bytree': 0.5149055228839193}. Best is trial 1 with value: 0.9999999999972491.
[I 2025-08-22 18:12:50,273] A new study created in memory with name: no-name-360db70e-983f-46b7-a888-224d6fd460b5


Best LightGBM params: {'n_estimators': 258, 'learning_rate': 0.10349854771023366, 'max_depth': 3, 'num_leaves': 70, 'subsample': 0.7739793053599122, 'colsample_bytree': 0.8852195435981047}

🔎 Optimizing XGBoost...


[I 2025-08-22 18:18:13,794] Trial 0 finished with value: 0.9999572869716301 and parameters: {'n_estimators': 368, 'learning_rate': 0.06110247124609191, 'max_depth': 4, 'subsample': 0.7192732939668373, 'colsample_bytree': 0.6055046726650617}. Best is trial 0 with value: 0.9999572869716301.
[W 2025-08-22 18:19:03,205] Trial 1 failed with parameters: {'n_estimators': 293, 'learning_rate': 0.19306533696912237, 'max_depth': 11, 'subsample': 0.769121025794684, 'colsample_bytree': 0.7985550737659419} because of the following error: XGBoostError('[18:19:03] C:\\actions-runner\\_work\\xgboost\\xgboost\\src\\common\\device_vector.cu:23: Memory allocation error on worker 0: bad allocation: cudaErrorMemoryAllocation: out of memory\n- Free memory: 0B\n- Requested memory: 6.43097GB\n').
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^

XGBoostError: [18:19:03] C:\actions-runner\_work\xgboost\xgboost\src\common\device_vector.cu:23: Memory allocation error on worker 0: bad allocation: cudaErrorMemoryAllocation: out of memory
- Free memory: 0B
- Requested memory: 6.43097GB
