# Importing Libraries

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder


import warnings
warnings.filterwarnings('ignore')

In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
%cd /content/drive/MyDrive/mlproject

/content/drive/MyDrive/mlproject


# Load Data

In [35]:
data = pd.read_csv("preprocessed_data.csv")

In [36]:
data.head()

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city,time_taken,distance(km),is_weekend,pickup_time_minutes,order_period
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,24,3.03,1,15.0,morning
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,33,20.18,0,5.0,evening
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,26,1.55,1,15.0,morning
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,21,7.79,0,10.0,evening
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,30,6.21,1,15.0,afternoon


In [37]:
preprocessor = joblib.load("preprocessor.joblib")

In [38]:
data.isnull().sum()

Unnamed: 0,0
age,1854
ratings,1908
weather,525
traffic,510
vehicle_condition,0
type_of_order,0
type_of_vehicle,0
multiple_deliveries,993
festival,228
city,1198


# Imputation

**Split of Data**

In [39]:
X = data.drop("time_taken", axis=1)
y = data["time_taken"]

In [40]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [41]:
## Check for same columns in data and preprocessor

expected_cols = set(preprocessor.feature_names_in_)
incoming_cols = set(x_train.columns)

if expected_cols != incoming_cols:
    raise ValueError(
        f"Schema mismatch:\nMissing: {expected_cols - incoming_cols}\n"
        f"Extra: {incoming_cols - expected_cols}"
    )


**Transforming target feature**

In [42]:
y_train_org = y_train.values.reshape(-1, 1)
y_test_org = y_test.values.reshape(-1, 1)

In [44]:
pt = PowerTransformer()

y_train = pt.fit_transform(y_train_org)
y_test = pt.transform(y_test_org)

**Transforming Independendent features**

In [45]:
x_train = preprocessor.transform(x_train)
x_test = preprocessor.transform(x_test)

In [46]:
## Double check for missing values
np.isnan(x_train).sum()

np.int64(0)

In [47]:
!pip install mlflow
!pip install dagshub



# dagshub setup

In [48]:
import dagshub
import mlflow
dagshub.init(repo_owner='Ankitkumar1141', repo_name='Swiggy_delivery_time_prediction', mlflow=True)

In [49]:
mlflow.set_tracking_uri("https://dagshub.com/Ankitkumar1141/Swiggy_delivery_time_prediction.mlflow")

In [50]:
mlflow.set_experiment("Experiment 1: Baseline Model")

<Experiment: artifact_location='mlflow-artifacts:/923fc2cb32384308a10bbe402128e484', creation_time=1769192837209, experiment_id='0', last_update_time=1769192837209, lifecycle_stage='active', name='Experiment 1: Baseline Model', tags={}>

# Modelling

In [51]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [52]:
model.fit(x_train,y_train.ravel())

In [53]:
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

In [54]:
y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1,1))
y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1,1))

In [55]:
from sklearn.metrics import mean_absolute_error, r2_score

print("Train MAE:", mean_absolute_error(y_train_org, y_pred_train_org))
print("Test MAE:", mean_absolute_error(y_test_org, y_pred_test_org))

print("Train R2:", r2_score(y_train_org, y_pred_train_org))
print("Test R2:", r2_score(y_test_org, y_pred_test_org))


Train MAE: 1.2275650762043042
Test MAE: 3.316582744555299
Train R2: 0.9709622928073894
Test R2: 0.7985094531682438


In [56]:
# calculate the cross val score

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

scores = cross_val_score(
    pipe,
    X,
    y,
    cv=5,
    scoring="r2",
    n_jobs=-1
)
scores

array([0.80377893, 0.80361156, 0.80400848, 0.78681074, 0.80061612])

In [57]:
scores.mean()

np.float64(0.799765167743126)

# Experiment tracking

In [59]:
# log experiment
with mlflow.start_run(run_name="Baseline"):
    # log experiment type
    mlflow.log_param("experiment_type", "Baseline")

    # log model params
    mlflow.log_params(model.get_params())

    # log metrics (IMPORTANT: use original-scale y)
    mlflow.log_metric(
        "training_error",
        mean_absolute_error(y_train_org, y_pred_train_org)
    )

    mlflow.log_metric(
        "test_error",
        mean_absolute_error(y_test_org, y_pred_test_org)
    )

    mlflow.log_metric(
        "training_r2",
        r2_score(y_train_org, y_pred_train_org)
    )

    mlflow.log_metric(
        "test_r2",
        r2_score(y_test_org, y_pred_test_org)
    )

    mlflow.log_metric(
        "cross_val_r2",
        scores.mean()
    )


üèÉ View run Baseline at: https://dagshub.com/Ankitkumar1141/Swiggy_delivery_time_prediction.mlflow/#/experiments/0/runs/02d27c4d6f444c0098333b05e7e6beb4
üß™ View experiment at: https://dagshub.com/Ankitkumar1141/Swiggy_delivery_time_prediction.mlflow/#/experiments/0
