In [2]:
# =======================================================
# ETA Prediction Hyperparameter Tuning with XGBoost
# =======================================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [4]:
seg_df = pd.read_parquet(r"C:\Users\moham\OneDrive\Desktop\Bengaluru Vega\Code\task_1\ref_data\clean_segments.parquet")

In [None]:
# =======================================================
# ETA Prediction Hyperparameter Tuning with XGBoost
# =======================================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_percentage_error
from xgboost import XGBRegressor
import optuna

# ----------------------------
# Preprocessing function
# ----------------------------
def preprocess_seg_df(seg_df):
    df = seg_df.copy()

    # Label encode categorical IDs
    encoders = {}
    for col in ['route_id', 'from_stop', 'to_stop']:
        le = LabelEncoder()
        df[f"{col}_enc"] = le.fit_transform(df[col].astype(str))
        encoders[col] = le

    # Ensure start_hour and day_of_week exist as numeric
    if 'start_hour' not in df.columns:
        df['start_hour'] = pd.to_datetime(df['tA']).dt.hour
    if 'day_of_week' not in df.columns:
        df['day_of_week'] = pd.to_datetime(df['tA']).dt.dayofweek

    # Target
    y = df['travel_time_min']

    # Features
    feature_cols = ['route_id_enc','from_stop_enc','to_stop_enc','distance_m',
                    'avg_speed_m_s','start_hour','day_of_week']
    X = df[feature_cols]

    return X, y, encoders


# ----------------------------
# Objective function for Optuna
# ----------------------------
def objective(trial, seg_df):
    X, y, _ = preprocess_seg_df(seg_df)

    params = {
    "n_estimators": trial.suggest_int("n_estimators", 800, 1400),        # more estimators for lower LR
    "max_depth": trial.suggest_int("max_depth", 3, 6),                   # shallow to prevent overfitting
    "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.02, log=True),  # lower learning rate
    "subsample": trial.suggest_float("subsample", 0.7, 0.9),             # moderate subsample
    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.9), # slightly lower colsample
    "gamma": trial.suggest_float("gamma", 0, 2),                         # mild regularization
    "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 1.0, log=True),    # stronger L1 regularization
    "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 5.0, log=True),  # stronger L2 regularization
    "min_child_weight": trial.suggest_int("min_child_weight", 4, 8),     # moderate to avoid overfitting small nodes
    "tree_method": "hist",
    "random_state": 42,
    "n_jobs": 14
}
    model = XGBRegressor(**params)

    tscv = TimeSeriesSplit(n_splits=3)
    rmses = []

    for train_idx, valid_idx in tscv.split(X):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

        preds = model.predict(X_valid)
        rmse = np.sqrt(mean_squared_error(y_valid, preds))  # <-- fixed here
        rmses.append(rmse)

    return np.mean(rmses)


# ----------------------------
# Hyperparameter tuning
# ----------------------------
def tune_hyperparams(seg_df, n_trials=50):
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, seg_df), n_trials=n_trials)

    print("\nBest Trial:")
    print("  RMSE:", study.best_value)
    print("  Params:", study.best_params)
    return study.best_params


# ----------------------------
# Train final model
# ----------------------------
def train_final_model(seg_df, best_params):
    X, y, encoders = preprocess_seg_df(seg_df)
    model = XGBRegressor(**best_params)
    model.fit(X, y)
    return model, encoders


# =======================================================
# Example usage
# =======================================================
if __name__ == "__main__":
    # seg_df = pd.read_csv("your_cleaned_seg_df.csv")  # load your dataframe
    best_params = tune_hyperparams(seg_df, n_trials=75)
    final_model, encoders = train_final_model(seg_df, best_params)
    print("\nFinal model trained with best parameters!")


These were the best weights : Best Trial:
  RMSE: 2.990990335020095
  Params: {'n_estimators': 843, 'max_depth': 4, 'learning_rate': 0.0070734825829311945, 'subsample': 0.7770865778757821, 'colsample_bytree': 0.8959516946395695, 'gamma': 1.8847572995409152, 'reg_alpha': 0.8692519934006419, 'reg_lambda': 1.005542707759953, 'min_child_weight': 6}