In [28]:
# !pip install lightgbm

In [29]:
# !pip install mlflow dagshub catboost optuna

In [30]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split
import mlflow
import dagshub
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
import optuna

In [6]:
df = pd.read_csv("/content/food_delivery_interim.csv")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38055 entries, 0 to 38054
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   rider_age            38055 non-null  float64
 1   rider_ratings        38055 non-null  float64
 2   weather              38055 non-null  object 
 3   traffic_density      38055 non-null  object 
 4   vehicle_condition    38055 non-null  int64  
 5   order_type           38055 non-null  object 
 6   vehicle_type         38055 non-null  object 
 7   multiple_deliveries  38055 non-null  float64
 8   festival             38055 non-null  object 
 9   time_taken           38055 non-null  int64  
 10  city_type            38055 non-null  object 
 11  day_name             38055 non-null  object 
 12  time_of_day          38055 non-null  object 
 13  distance             38055 non-null  float64
dtypes: float64(4), int64(2), object(8)
memory usage: 4.1+ MB


In [8]:
df.head()

Unnamed: 0,rider_age,rider_ratings,weather,traffic_density,vehicle_condition,order_type,vehicle_type,multiple_deliveries,festival,time_taken,city_type,day_name,time_of_day,distance
0,36.0,4.2,fog,jam,2,snack,motorcycle,3.0,no,46,metropolitian,saturday,dinner_peak,10.280582
1,21.0,4.7,stormy,high,1,meal,motorcycle,1.0,no,23,metropolitian,sunday,afternoon,6.242319
2,23.0,4.7,sandstorms,medium,1,drinks,scooter,1.0,no,21,metropolitian,friday,evening_snacks,13.78786
3,34.0,4.3,sandstorms,low,0,buffet,motorcycle,0.0,no,20,metropolitian,sunday,breakfast,2.930258
4,24.0,4.7,fog,jam,1,snack,scooter,1.0,no,41,metropolitian,monday,evening_snacks,19.396618


In [9]:
X = df.drop(columns='time_taken')
y = df['time_taken']

In [10]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [11]:
num_cols = ["rider_age","rider_ratings","distance"]

nominal_cat_cols = ["weather","order_type","vehicle_type","festival","city_type","day_name","time_of_day"]

ordinal_cat_cols = ["traffic_density"]

In [12]:
traffic_order = ["low","medium","high","jam"]

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(), num_cols),
        (
            "nominal_encoder",
            OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False),
            nominal_cat_cols
        ),
        (
            "ordinal_encoder",
            OrdinalEncoder(categories=[traffic_order]),
            ordinal_cat_cols
        )
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False
)

preprocessor.set_output(transform="pandas")

In [14]:
pt = PowerTransformer()

y_train_pt = pt.fit_transform(y_train.values.reshape(-1,1))
y_test_pt = pt.transform(y_test.values.reshape(-1,1))

In [15]:
y_train_pt = np.ravel(y_train_pt)
y_test_pt  = np.ravel(y_test_pt)

In [16]:
## pipeline

preprocessing_pipeline = Pipeline(
    steps=[
        ("preprocessor",preprocessor)
    ]
)

preprocessing_pipeline

In [17]:
X_train_trans = preprocessing_pipeline.fit_transform(X_train)
X_test_trans = preprocessing_pipeline.transform(X_test)

X_train_trans

Unnamed: 0,rider_age,rider_ratings,distance,weather_fog,weather_sandstorms,weather_stormy,weather_sunny,weather_windy,order_type_drinks,order_type_meal,...,day_name_tuesday,day_name_wednesday,time_of_day_breakfast,time_of_day_dinner_peak,time_of_day_evening_snacks,time_of_day_late_night,time_of_day_lunch_peak,traffic_density,vehicle_condition,multiple_deliveries
6965,-0.282097,1.164633,-1.211247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2,0.0
14052,1.454428,0.849370,0.717574,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0,3.0
25717,-0.455749,0.218843,0.144911,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2,1.0
35085,1.280776,-1.988001,-0.385292,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2,1.0
5921,0.759818,-0.726947,0.689886,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,-1.671317,1.164633,-1.203960,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1.0
6265,-0.976707,-1.672737,-0.932219,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0
11284,-0.976707,-1.672737,0.653365,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0,1.0
860,1.454428,-1.357474,0.757606,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2,2.0


In [18]:
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
import optuna


In [19]:
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

In [31]:
def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 10.0),
        "random_state": 42,
        "n_jobs": -1,
        "objective": "reg:squarederror",
        "tree_method": "hist"   # CPU optimized
    }

    xgb_model = XGBRegressor(**params)

    model = TransformedTargetRegressor(
        regressor=xgb_model,
        transformer=pt
    )

    cv_score = cross_val_score(
        model,
        X_train_trans,
        y_train,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1
    )

    mean_score = -cv_score.mean()

    return mean_score

In [32]:
# Create Optuna study
study = optuna.create_study(direction="minimize")

study.optimize(
    objective,
    n_trials=50,
    n_jobs=1,
    show_progress_bar=True
)

print("Best Params:", study.best_params)
print("Best CV MAE:", study.best_value)


# Train final best model
best_xgb = XGBRegressor(
    **study.best_params,
    random_state=42,
    n_jobs=-1,
    objective="reg:squarederror",
    tree_method="hist"
)

model = TransformedTargetRegressor(
    regressor=best_xgb,
    transformer=pt
)

model.fit(X_train_trans, y_train)

# Predictions
y_pred_train = model.predict(X_train_trans)
y_pred_test = model.predict(X_test_trans)

# Metrics
print("Training MAE:", mean_absolute_error(y_train, y_pred_train))
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Training R2:", r2_score(y_train, y_pred_train))
print("Test R2:", r2_score(y_test, y_pred_test))

# Cross-validation on best model
scores = cross_val_score(
    model,
    X_train_trans,
    y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

print("Final CV MAE:", -scores.mean())

[I 2026-02-11 13:34:06,217] A new study created in memory with name: no-name-cf401d1d-c59f-4a79-ad5a-aee69eb0a88a


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-02-11 13:34:35,312] Trial 0 finished with value: 3.128642463684082 and parameters: {'n_estimators': 785, 'max_depth': 6, 'learning_rate': 0.23696640461562973, 'subsample': 0.9273595263871525, 'colsample_bytree': 0.7746053179487937, 'reg_alpha': 8.675395548651343, 'reg_lambda': 9.009345385451155}. Best is trial 0 with value: 3.128642463684082.
[I 2026-02-11 13:34:50,985] Trial 1 finished with value: 3.4050153732299804 and parameters: {'n_estimators': 1409, 'max_depth': 3, 'learning_rate': 0.023687324784065555, 'subsample': 0.966801923221785, 'colsample_bytree': 0.8105306250297467, 'reg_alpha': 8.26577036222676, 'reg_lambda': 1.7417127446758547}. Best is trial 0 with value: 3.128642463684082.
[I 2026-02-11 13:35:21,284] Trial 2 finished with value: 3.23438138961792 and parameters: {'n_estimators': 1420, 'max_depth': 9, 'learning_rate': 0.2275059140181324, 'subsample': 0.5401378746190473, 'colsample_bytree': 0.5307865144348058, 'reg_alpha': 8.46692866620971, 'reg_lambda': 7.866943



[I 2026-02-11 13:38:19,297] Trial 10 finished with value: 3.1205246925354 and parameters: {'n_estimators': 1056, 'max_depth': 12, 'learning_rate': 0.04482385919420748, 'subsample': 0.8776186255260466, 'colsample_bytree': 0.9886885271343206, 'reg_alpha': 3.511999765288377, 'reg_lambda': 6.316319477570685}. Best is trial 3 with value: 3.077672243118286.
[I 2026-02-11 13:38:35,284] Trial 11 finished with value: 3.0865196228027343 and parameters: {'n_estimators': 678, 'max_depth': 7, 'learning_rate': 0.011230205249449817, 'subsample': 0.7592876326396052, 'colsample_bytree': 0.9352817213559474, 'reg_alpha': 0.8730174835141729, 'reg_lambda': 6.999881211389126}. Best is trial 3 with value: 3.077672243118286.
[I 2026-02-11 13:38:52,413] Trial 12 finished with value: 3.0605481624603272 and parameters: {'n_estimators': 616, 'max_depth': 8, 'learning_rate': 0.010654894087325532, 'subsample': 0.8061238165489055, 'colsample_bytree': 0.9542373520794004, 'reg_alpha': 2.4709232108365535, 'reg_lambda':

In [33]:
import optuna
from lightgbm import LGBMRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
import lightgbm as lgbm
from lightgbm.sklearn import LGBMRegressor

In [34]:
def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
        "max_depth": trial.suggest_int("max_depth", -1, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
        "random_state": 42,
        "n_jobs": -1
    }

    lgb_model = LGBMRegressor(**params)

    model = TransformedTargetRegressor(
        regressor=lgb_model,
        transformer=pt
    )

    cv_score = cross_val_score(
        model,
        X_train_trans,
        y_train,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1
    )

    mean_score = -cv_score.mean()

    return mean_score

In [35]:
# Create study
study = optuna.create_study(direction="minimize")

study.optimize(
    objective,
    n_trials=50,
    n_jobs=1,
    show_progress_bar=True
)

print("Best Params:", study.best_params)
print("Best CV MAE:", study.best_value)


# Train final best model
best_lgb = LGBMRegressor(
    **study.best_params,
    random_state=42,
    n_jobs=-1
)

model = TransformedTargetRegressor(
    regressor=best_lgb,
    transformer=pt
)

model.fit(X_train_trans, y_train)

# Predictions
y_pred_train = model.predict(X_train_trans)
y_pred_test = model.predict(X_test_trans)

# Metrics
print("Training MAE:", mean_absolute_error(y_train, y_pred_train))
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Training R2:", r2_score(y_train, y_pred_train))
print("Test R2:", r2_score(y_test, y_pred_test))

# Cross-validation on best model
scores = cross_val_score(
    model,
    X_train_trans,
    y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

print("Final CV MAE:", -scores.mean())

[I 2026-02-11 13:50:14,051] A new study created in memory with name: no-name-eff1bed2-324d-45f0-9146-2ae6333ee8ce


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-02-11 13:50:28,871] Trial 0 finished with value: 3.120340928318779 and parameters: {'n_estimators': 333, 'max_depth': 11, 'learning_rate': 0.26333504022022197, 'num_leaves': 33, 'min_child_samples': 46, 'subsample': 0.5647872940690801, 'colsample_bytree': 0.9382100228263095, 'reg_alpha': 1.1221320476012453, 'reg_lambda': 9.850839677974335}. Best is trial 0 with value: 3.120340928318779.
[I 2026-02-11 13:51:00,671] Trial 1 finished with value: 3.10942064319236 and parameters: {'n_estimators': 398, 'max_depth': 12, 'learning_rate': 0.028650704989813394, 'num_leaves': 124, 'min_child_samples': 34, 'subsample': 0.9548787421838402, 'colsample_bytree': 0.6730703017377566, 'reg_alpha': 1.4058714699871822, 'reg_lambda': 2.0635700048469685}. Best is trial 1 with value: 3.10942064319236.
[I 2026-02-11 13:51:16,341] Trial 2 finished with value: 3.1822044521455095 and parameters: {'n_estimators': 717, 'max_depth': 7, 'learning_rate': 0.23939046866154848, 'num_leaves': 196, 'min_child_sampl

In [48]:
study.best_params

{'n_estimators': 959,
 'max_depth': 10,
 'learning_rate': 0.015908986718229125,
 'num_leaves': 221,
 'min_child_samples': 44,
 'subsample': 0.7916185448024788,
 'colsample_bytree': 0.9132622220994379,
 'reg_alpha': 0.08775694417084225,
 'reg_lambda': 4.563333600351618}

In [50]:
study.best_value

3.046137423331966

In [51]:
# optimization history plot

optuna.visualization.plot_optimization_history(study)

In [52]:
# plot hyperparameter importance plot

optuna.visualization.plot_param_importances(study)

In [53]:
# slice plot

optuna.visualization.plot_slice(study)

In [20]:
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
import optuna

In [21]:
# -------------------- Optuna Objective --------------------

def objective(trial):

    params = {
        "iterations": trial.suggest_int("iterations", 500, 2000),
        "depth": trial.suggest_int("depth", 4, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 50),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "loss_function": "MAE",
        "eval_metric": "MAE",
        "random_seed": 42,
        "verbose": 0,
        "task_type": "CPU"
    }

    cat_model = CatBoostRegressor(**params)

    model = TransformedTargetRegressor(
        regressor=cat_model,
        transformer=pt
    )

    cv_scores = cross_val_score(
        model,
        X_train_trans,
        y_train,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1
    )

    return -cv_scores.mean()

In [22]:
# -------------------- Run Study (50 Trials) --------------------

study = optuna.create_study(direction="minimize")

study.optimize(
    objective,
    n_trials=50,
    n_jobs=1,
    show_progress_bar=True
)

print("\nBest Parameters Found:")
print(study.best_params)
print("Best CV MAE:", study.best_value)


# -------------------- Train Final Best Model --------------------

best_cat = CatBoostRegressor(
    **study.best_params,
    loss_function="MAE",
    eval_metric="MAE",
    random_seed=42,
    verbose=0,
    task_type="CPU"
)

final_model = TransformedTargetRegressor(
    regressor=best_cat,
    transformer=pt
)

final_model.fit(X_train_trans, y_train)

[I 2026-02-11 08:59:52,369] A new study created in memory with name: no-name-665eadb5-5463-4315-b30b-8a106ab4ed64


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-02-11 09:02:03,697] Trial 0 finished with value: 3.1500100978417866 and parameters: {'iterations': 1704, 'depth': 6, 'learning_rate': 0.08204401768504865, 'l2_leaf_reg': 40.5760115876046, 'random_strength': 5.073364296954078, 'bagging_temperature': 0.3407445422994696}. Best is trial 0 with value: 3.1500100978417866.
[I 2026-02-11 09:03:31,968] Trial 1 finished with value: 3.1844243164088057 and parameters: {'iterations': 1479, 'depth': 6, 'learning_rate': 0.01534356585539872, 'l2_leaf_reg': 7.212874543251011, 'random_strength': 1.769176653122273, 'bagging_temperature': 0.9440339815421562}. Best is trial 0 with value: 3.1500100978417866.
[I 2026-02-11 09:04:31,175] Trial 2 finished with value: 3.142617539597706 and parameters: {'iterations': 805, 'depth': 6, 'learning_rate': 0.08985834476096127, 'l2_leaf_reg': 13.497114233749079, 'random_strength': 1.4618813781968032, 'bagging_temperature': 0.30847583599346773}. Best is trial 2 with value: 3.142617539597706.




[I 2026-02-11 09:13:30,022] Trial 3 finished with value: 3.164670453720642 and parameters: {'iterations': 1521, 'depth': 11, 'learning_rate': 0.07054690488227315, 'l2_leaf_reg': 9.990630943893624, 'random_strength': 0.5462185381795859, 'bagging_temperature': 0.8687207684412462}. Best is trial 2 with value: 3.142617539597706.
[I 2026-02-11 09:15:26,662] Trial 4 finished with value: 3.247330973510993 and parameters: {'iterations': 1979, 'depth': 5, 'learning_rate': 0.0177769775364222, 'l2_leaf_reg': 37.968064458618, 'random_strength': 6.952490633446356, 'bagging_temperature': 0.8176900243468176}. Best is trial 2 with value: 3.142617539597706.
[I 2026-02-11 09:17:07,348] Trial 5 finished with value: 3.157044754022029 and parameters: {'iterations': 1115, 'depth': 6, 'learning_rate': 0.08844358196067292, 'l2_leaf_reg': 37.23885324690029, 'random_strength': 8.245059849601617, 'bagging_temperature': 0.8619818600191216}. Best is trial 2 with value: 3.142617539597706.
[I 2026-02-11 09:19:18,628

In [23]:
# -------------------- Predictions --------------------

y_pred_train = final_model.predict(X_train_trans)
y_pred_test = final_model.predict(X_test_trans)


# -------------------- Metrics --------------------

train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print("\nFinal Evaluation")
print("Train MAE:", train_mae)
print("Test MAE:", test_mae)
print("Train R2:", train_r2)
print("Test R2:", test_r2)


# -------------------- Cross Validation on Final Model --------------------

cv_scores = cross_val_score(
    final_model,
    X_train_trans,
    y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

print("Final 5-Fold CV MAE:", -cv_scores.mean())


Final Evaluation
Train MAE: 2.2657174673003624
Test MAE: 3.0321254544007674
Train R2: 0.8925214236214369
Test R2: 0.835077352055125
Final 5-Fold CV MAE: 3.017631423827768
