# Training

In [1]:
import pandas as pd

df = pd.read_csv("data/preprocessed_dataset.csv")
df.head()

Unnamed: 0,searchDate,flightDate,totalFare,days_to_departure,search_dow,flight_dow,search_month,flight_month,search_week,flight_week,is_weekend_flight,days_into_summer,is_peak_travel_period
0,2022-04-17,2022-06-01,272.6,45,6,2,4,6,15,22,0,0,0
1,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
2,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
3,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
4,2022-04-17,2022-06-01,328.6,45,6,2,4,6,15,22,0,0,0


In [2]:
feature_cols = [
    "days_to_departure",
    "flight_dow",
    "search_dow",
    "is_weekend_flight",
    "days_into_summer",
    "is_peak_travel_period",
]

target_col = "totalFare"


## Split dataset based on late and early summer days

In [12]:
cutoff_date = pd.to_datetime("2022-07-20")

train = df[pd.to_datetime(df["searchDate"]) <= cutoff_date]
test = df[pd.to_datetime(df["searchDate"]) > cutoff_date]

X_train = train[feature_cols]
y_train = train[target_col]

X_test = test[feature_cols]
y_test = test[target_col]

X_train.shape, X_test.shape

((103854, 6), (26596, 6))

# Train and evaluate

In [16]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

param_grid = {
    "max_depth": [3, 4, 5],
    "learning_rate": [0.05, 0.1],
    "n_estimators": [200, 400, 600],
}

best_mae = float("inf")
best_params = None

for md in param_grid["max_depth"]:
    for lr in param_grid["learning_rate"]:
        for ne in param_grid["n_estimators"]:
            model = XGBRegressor(
                max_depth=md,
                learning_rate=lr,
                n_estimators=ne,
                subsample=0.8,
                colsample_bytree=0.8,
                objective="reg:squarederror",
                random_state=42,
                n_jobs=-1,
            )

            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            mae = mean_absolute_error(y_test, preds)

            print(f"depth={md}, lr={lr}, est={ne} → MAE={mae:.2f}")

            if mae < best_mae:
                best_mae = mae
                best_params = (md, lr, ne)

print(best_mae, best_params)


depth=3, lr=0.05, est=200 → MAE=246.56
depth=3, lr=0.05, est=400 → MAE=236.45
depth=3, lr=0.05, est=600 → MAE=227.70
depth=3, lr=0.1, est=200 → MAE=240.77
depth=3, lr=0.1, est=400 → MAE=230.25
depth=3, lr=0.1, est=600 → MAE=243.16
depth=4, lr=0.05, est=200 → MAE=259.23
depth=4, lr=0.05, est=400 → MAE=250.68
depth=4, lr=0.05, est=600 → MAE=249.22
depth=4, lr=0.1, est=200 → MAE=252.43
depth=4, lr=0.1, est=400 → MAE=273.33
depth=4, lr=0.1, est=600 → MAE=278.13
depth=5, lr=0.05, est=200 → MAE=255.03
depth=5, lr=0.05, est=400 → MAE=261.11
depth=5, lr=0.05, est=600 → MAE=271.59
depth=5, lr=0.1, est=200 → MAE=268.02
depth=5, lr=0.1, est=400 → MAE=276.85
depth=5, lr=0.1, est=600 → MAE=280.42
227.69585196825665 (3, 0.05, 600)
