In [9]:
import json
import os
import warnings
from datetime import datetime
from pathlib import Path

# Silence FutureWarning noise before the third-party imports below.
warnings.filterwarnings("ignore", category=FutureWarning)

import joblib
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error
from tqdm.auto import tqdm

In [10]:
# Project locations. Each one can be redirected through an environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset the original hard-coded path is used.
DATA_DIR = Path(os.environ.get(
    "FLIGHT_DATA_DIR",
    "/Users/aayush2683/Projects/Solvus AI Intern/Data/ML Data",
))
MODEL_DIR = Path(os.environ.get(
    "FLIGHT_MODEL_DIR",
    "/Users/aayush2683/Projects/Solvus AI Intern/Prediction/Models",
))
# Create the output folder up front (including parents); a no-op if it exists.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [11]:
# Load the engineered training table.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(DATA_DIR / "train_features.pkl")

# Separate the target from the predictors and discard the row identifier.
y_raw = df["Price"]
df = df.drop(columns=["Price", "RowID"])

# Columns listed here are one-hot encoded; every remaining column is scaled.
cat_cols = ["Airline", "Source", "Destination", "Additional_Info"]
num_cols = [name for name in df.columns if name not in cat_cols]

# Unfitted preprocessing template used by the modelling pipelines below.
# handle_unknown="ignore" lets categories unseen during fit be encoded
# instead of raising at predict time.
pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

In [12]:
def safe_split(X, y, strat, seed=42):
    """Split ``X``/``y`` 80/20, stratifying on ``strat`` when that is possible.

    Parameters
    ----------
    X, y :
        Features and target, forwarded unchanged to ``train_test_split``.
    strat : pandas.Series
        Labels to stratify on. Stratification is used only if the rarest
        label occurs at least twice; otherwise a plain random split is made.
    seed : int, default 42
        ``random_state`` passed to ``train_test_split``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    split_kwargs = {"test_size": 0.2, "random_state": seed}
    rarest_count = strat.value_counts().min()
    if rarest_count >= 2:
        # Every label has at least two rows, so stratification is requested.
        split_kwargs["stratify"] = strat
    return train_test_split(X, y, **split_kwargs)

# 80/20 hold-out split, stratified by airline when every airline has >= 2 rows.
X_tr, X_val, y_tr, y_val = safe_split(
    X=df,
    y=y_raw,
    strat=df["Airline"],
)

In [13]:
# ── Optuna objective: maximise R² on hold-out -------------------
def objective(trial):
    """Score one Optuna trial: fit on the training split, return hold-out R².

    Reads the notebook globals ``pre``, ``X_tr``, ``y_tr``, ``X_val`` and
    ``y_val``. The study is created with ``direction="maximize"``, so a larger
    return value is better.
    """
    # Hyper-parameters sampled by Optuna (suggest calls kept in this order).
    sampled = dict(
        n_estimators=trial.suggest_int("n_estimators", 600, 2000),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 4, 12),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        gamma=trial.suggest_float("gamma", 0.0, 5.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0.1, 10.0, log=True),
    )
    # Settings held constant across all trials.
    fixed = dict(random_state=42, n_jobs=-1, tree_method="hist")

    regressor = xgb.XGBRegressor(**sampled, **fixed)
    candidate = Pipeline(steps=[("prep", pre), ("xgb", regressor)])
    candidate.fit(X_tr, y_tr)

    holdout_pred = candidate.predict(X_val)
    return r2_score(y_val, holdout_pred)

N_TRIALS = 100
optuna.logging.set_verbosity(optuna.logging.WARNING)


def cb(study, trial):
    """Optuna callback: show the best score so far and advance the bar by one."""
    pbar.set_postfix(best=f"{study.best_value:.3f}")
    pbar.update(1)


# Seed the sampler so a fresh "Restart & Run All" proposes the same sequence
# of hyper-parameters instead of a different search on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# The context manager closes the progress bar even if a trial raises.
with tqdm(total=N_TRIALS, desc="Optuna tuning", ncols=80) as pbar:
    study.optimize(objective, n_trials=N_TRIALS, callbacks=[cb])

# best_params contains only the values suggested inside objective(); the
# constants fixed there (random_state, n_jobs, tree_method) are not included.
best = study.best_params
print("🏆  Best params:", best)

Optuna tuning:   0%|                                    | 0/100 [00:00<?, ?it/s]

🏆  Best params: {'n_estimators': 1430, 'learning_rate': 0.05249726204105458, 'max_depth': 7, 'subsample': 0.9635178567146764, 'colsample_bytree': 0.6464568339269249, 'min_child_weight': 2, 'gamma': 3.5958136431836976, 'reg_lambda': 5.845374141273329}


In [14]:
def make_pipe(title):
    """Build an unfitted preprocessing + XGBoost pipeline from the tuned params.

    Parameters
    ----------
    title : str
        Label shown on the tqdm training progress bar.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``"prep"`` (a fresh clone of the global ``pre``) and ``"xgb"``
        (an ``XGBRegressor`` carrying a ``Bar`` progress callback).
    """
    # `best` only holds the values Optuna sampled, so the constants used while
    # tuning in objective() are re-applied here; without them the refit would
    # fall back to XGBoost's default seed / tree method.
    params = {**best, "random_state": 42, "n_jobs": -1, "tree_method": "hist"}
    reg = xgb.XGBRegressor(**params, callbacks=[Bar(best["n_estimators"], title)])
    # Clone so each pipeline owns its preprocessor; sharing the single global
    # instance would let a later fit overwrite an earlier pipeline's fitted
    # encoder/scaler.
    return Pipeline([("prep", clone(pre)), ("xgb", reg)])

class Bar(xgb.callback.TrainingCallback):
    """XGBoost training callback that drives a tqdm bar, one tick per round.

    Parameters
    ----------
    total : int
        Expected number of boosting rounds (the bar's length).
    title : str
        Description shown next to the bar.
    """

    def __init__(self, total, title):
        super().__init__()
        self.total = total
        self.title = title
        self.pbar = None  # created lazily on the first boosting round

    def after_iteration(self, model, epoch, ev):
        """Advance the bar by one; return False so training is never stopped."""
        if self.pbar is None:
            self.pbar = tqdm(total=self.total, desc=self.title,
                             unit="tree", leave=False, ncols=80)
        self.pbar.update(1)
        return False

    def after_training(self, model):
        """Close the bar (if one was opened) and hand the model back.

        The model is returned unconditionally: previously the ``return`` sat
        inside the ``if``, so ``None`` was returned whenever no bar existed or
        the bar evaluated as falsy.
        """
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None  # lets the callback be reused for another fit
        return model

print("🔧  Training best model on 80 % split …")
pipe = make_pipe("Fit 80 %").fit(X_tr, y_tr)
# Remove the progress-bar callback from the fitted regressor
# (presumably so the tqdm object is not carried around with the estimator).
pipe.named_steps["xgb"].callbacks.clear()

# Evaluate on the hold-out split. This is the same split objective() scored
# during tuning, so these numbers are likely optimistic for unseen data.
val_pred = pipe.predict(X_val)
r2, mae = r2_score(y_val, val_pred), mean_absolute_error(y_val, val_pred)
print(f"Hold-out  R²={r2:.3f}  MAE=₹{mae:,.0f}")

🔧  Training best model on 80 % split …


Fit 80 %:   0%|                                      | 0/1430 [00:00<?, ?tree/s]

Hold-out  R²=0.919  MAE=₹674


In [15]:
print("Re-training on 100 % …")
# Final model: same tuned configuration, fitted on all rows (train + hold-out).
pipe_full = make_pipe("Full fit").fit(df, y_raw)
# Remove the progress-bar callback before the pipeline is persisted.
pipe_full.named_steps["xgb"].callbacks.clear()

Re-training on 100 % …


Full fit:   0%|                                      | 0/1430 [00:00<?, ?tree/s]

In [16]:
# Persist the pipeline fitted on 100 % of the data.
joblib.dump(pipe_full, MODEL_DIR / "flight_price_xgb.joblib")

# The stored metrics come from the 80 % model's hold-out evaluation, not from
# the full-data pipeline saved above. The `with` block guarantees the file is
# flushed and closed (the bare open() previously left that to the interpreter).
with open(MODEL_DIR / "metrics.json", "w", encoding="utf-8") as fh:
    json.dump(
        {"r2_val": float(r2), "mae_val": float(mae),
         "best_params": best, "trained": datetime.now().isoformat()},
        fh, indent=2,
    )
print("Model & metrics saved →", MODEL_DIR)

Model & metrics saved → /Users/aayush2683/Projects/Solvus AI Intern/Prediction/Models
