In [118]:
# !pip install mlflow dagshub optuna

In [119]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split
import mlflow
import dagshub
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

In [120]:
dagshub.init(repo_owner='Aryanupadhyay23', repo_name='Zomato-Food-Delivery-Time-prediction', mlflow=True)

In [121]:
# set the tracking uri

mlflow.set_tracking_uri("https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow")

In [122]:
# mlflow experiment

mlflow.set_experiment("RandomForest HP Tuning")

<Experiment: artifact_location='mlflow-artifacts:/62302e1a83834c1ab05979f93f253661', creation_time=1770614035653, experiment_id='5', last_update_time=1770614035653, lifecycle_stage='active', name='RandomForest HP Tuning', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [123]:
df = pd.read_csv("/content/food_delivery_interim.csv")

In [124]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38055 entries, 0 to 38054
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   rider_age            38055 non-null  float64
 1   rider_ratings        38055 non-null  float64
 2   weather              38055 non-null  object 
 3   traffic_density      38055 non-null  object 
 4   vehicle_condition    38055 non-null  int64  
 5   order_type           38055 non-null  object 
 6   vehicle_type         38055 non-null  object 
 7   multiple_deliveries  38055 non-null  float64
 8   festival             38055 non-null  object 
 9   time_taken           38055 non-null  int64  
 10  city_type            38055 non-null  object 
 11  day_name             38055 non-null  object 
 12  time_of_day          38055 non-null  object 
 13  distance             38055 non-null  float64
dtypes: float64(4), int64(2), object(8)
memory usage: 4.1+ MB


In [125]:
df.head()

Unnamed: 0,rider_age,rider_ratings,weather,traffic_density,vehicle_condition,order_type,vehicle_type,multiple_deliveries,festival,time_taken,city_type,day_name,time_of_day,distance
0,36.0,4.2,fog,jam,2,snack,motorcycle,3.0,no,46,metropolitian,saturday,dinner_peak,10.280582
1,21.0,4.7,stormy,high,1,meal,motorcycle,1.0,no,23,metropolitian,sunday,afternoon,6.242319
2,23.0,4.7,sandstorms,medium,1,drinks,scooter,1.0,no,21,metropolitian,friday,evening_snacks,13.78786
3,34.0,4.3,sandstorms,low,0,buffet,motorcycle,0.0,no,20,metropolitian,sunday,breakfast,2.930258
4,24.0,4.7,fog,jam,1,snack,scooter,1.0,no,41,metropolitian,monday,evening_snacks,19.396618


In [126]:
X = df.drop(columns='time_taken')
y = df['time_taken']

In [127]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [128]:
num_cols = ["rider_age","rider_ratings","distance"]

nominal_cat_cols = ["weather","order_type","vehicle_type","festival","city_type","day_name","time_of_day"]

ordinal_cat_cols = ["traffic_density"]

In [129]:
traffic_order = ["low","medium","high","jam"]

In [130]:
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(), num_cols),
        (
            "nominal_encoder",
            OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False),
            nominal_cat_cols
        ),
        (
            "ordinal_encoder",
            OrdinalEncoder(categories=[traffic_order]),
            ordinal_cat_cols
        )
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False
)

preprocessor.set_output(transform="pandas")

In [131]:
pt = PowerTransformer()

y_train_pt = pt.fit_transform(y_train.values.reshape(-1,1))
y_test_pt = pt.transform(y_test.values.reshape(-1,1))

In [132]:
y_train_pt = np.ravel(y_train_pt)
y_test_pt  = np.ravel(y_test_pt)

In [133]:
## pipeline

preprocessing_pipeline = Pipeline(
    steps=[
        ("preprocessor",preprocessor)
    ]
)

preprocessing_pipeline

In [134]:
X_train_trans = preprocessing_pipeline.fit_transform(X_train)
X_test_trans = preprocessing_pipeline.transform(X_test)

X_train_trans

Unnamed: 0,rider_age,rider_ratings,distance,weather_fog,weather_sandstorms,weather_stormy,weather_sunny,weather_windy,order_type_drinks,order_type_meal,...,day_name_tuesday,day_name_wednesday,time_of_day_breakfast,time_of_day_dinner_peak,time_of_day_evening_snacks,time_of_day_late_night,time_of_day_lunch_peak,traffic_density,vehicle_condition,multiple_deliveries
6965,-0.282097,1.164633,-1.211247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2,0.0
14052,1.454428,0.849370,0.717574,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0,3.0
25717,-0.455749,0.218843,0.144911,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2,1.0
35085,1.280776,-1.988001,-0.385292,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2,1.0
5921,0.759818,-0.726947,0.689886,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,-1.671317,1.164633,-1.203960,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1.0
6265,-0.976707,-1.672737,-0.932219,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0
11284,-0.976707,-1.672737,0.653365,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0,1.0
860,1.454428,-1.357474,0.757606,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2,2.0


In [135]:
def objective(trial):
    with mlflow.start_run(nested=True):

        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 500),
            "max_depth": trial.suggest_int("max_depth", 1, 20),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical(
                "max_features", ["sqrt", "log2", None]
            ),
            "bootstrap": trial.suggest_categorical(
                "bootstrap", [True, False]
            ),
            "random_state": 42,
            "n_jobs": -1
        }

        # log parameters
        mlflow.log_params(params)

        rf_reg = RandomForestRegressor(**params)

        model = TransformedTargetRegressor(
            regressor=rf_reg,
            transformer=pt
        )

        # fit model
        model.fit(X_train_trans, y_train)

        # cross validation
        cv_score = cross_val_score(
            model,
            X_train_trans,
            y_train,
            cv=5,
            scoring="neg_mean_absolute_error",
            n_jobs=-1
        )

        mean_score = -cv_score.mean()

        # log metric
        mlflow.log_metric("cross_val_mae", mean_score)

        return mean_score


In [136]:
# create optuna study
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):

    # optimize
    study.optimize(
        objective,
        n_trials=20,
        n_jobs=1,
        show_progress_bar=True
    )

    # log best params and score
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_score", study.best_value)

    # build best Random Forest model
    best_rf = RandomForestRegressor(
        **study.best_params,
        random_state=42,
        n_jobs=-1
    )

    # wrap with TransformedTargetRegressor
    model = TransformedTargetRegressor(
        regressor=best_rf,
        transformer=pt
    )

    # fit model
    model.fit(X_train_trans, y_train)

    # predictions (already in original scale)
    y_pred_train = model.predict(X_train_trans)
    y_pred_test = model.predict(X_test_trans)

    # cross validation
    scores = cross_val_score(
        model,
        X_train_trans,
        y_train,
        scoring="neg_mean_absolute_error",
        cv=5,
        n_jobs=-1
    )

    # log metrics
    mlflow.log_metric(
        "training_mae",
        mean_absolute_error(y_train, y_pred_train)
    )
    mlflow.log_metric(
        "test_mae",
        mean_absolute_error(y_test, y_pred_test)
    )
    mlflow.log_metric(
        "training_r2",
        r2_score(y_train, y_pred_train)
    )
    mlflow.log_metric(
        "test_r2",
        r2_score(y_test, y_pred_test)
    )
    mlflow.log_metric(
        "cross_val_mae",
        -scores.mean()
    )

    # log full pipeline model
    mlflow.sklearn.log_model(
        model,
        artifact_path="model"
    )

[I 2026-02-09 07:25:14,753] A new study created in memory with name: no-name-21292bf2-0042-4073-9398-916d1f40f15c


  0%|          | 0/20 [00:00<?, ?it/s]

🏃 View run nimble-rook-200 at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5/runs/c1fe448cb0c34f688130bb47d2f97372
🧪 View experiment at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5
[I 2026-02-09 07:26:13,107] Trial 0 finished with value: 3.2936952914192856 and parameters: {'n_estimators': 305, 'max_depth': 19, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 3.2936952914192856.
🏃 View run capricious-jay-996 at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5/runs/4bb0d6ae7ba14f03a51f9d4bdcc993a2
🧪 View experiment at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5
[I 2026-02-09 07:26:34,426] Trial 1 finished with value: 3.201470913938111 and parameters: {'n_estimators': 84, 'max_depth': 20, 'min_samples_split': 3, 'min_sa



🏃 View run righteous-gnat-606 at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5/runs/e556ea25ed3b441bac20b3cf8032b171
🧪 View experiment at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5
[I 2026-02-09 07:34:04,108] Trial 5 finished with value: 3.077060811707535 and parameters: {'n_estimators': 460, 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': True}. Best is trial 5 with value: 3.077060811707535.
🏃 View run bittersweet-kite-142 at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5/runs/ac8d06db1973446bbd795f8a9fc64169
🧪 View experiment at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5
[I 2026-02-09 07:34:13,943] Trial 6 finished with value: 3.316374408695778 and parameters: {'n_estimators': 63, 'max_depth': 18, 'min_samples_split': 7, 'min_sa

  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


🏃 View run best_model at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5/runs/4fab3f9f8e064da6be4482e02ee3a6f4
🧪 View experiment at: https://dagshub.com/Aryanupadhyay23/Zomato-Food-Delivery-Time-prediction.mlflow/#/experiments/5


In [137]:
study.best_params

{'n_estimators': 290,
 'max_depth': 14,
 'min_samples_split': 7,
 'min_samples_leaf': 1,
 'max_features': None,
 'bootstrap': True}

In [138]:
study.best_value

3.069350252595574

In [139]:
# optimization history plot

optuna.visualization.plot_optimization_history(study)

In [140]:
# plot hyperparameter importance plot

optuna.visualization.plot_param_importances(study)

In [141]:
# slice plot

optuna.visualization.plot_slice(study)