In [11]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import joblib


In [12]:
import pandas as pd
df = pd.read_csv(r"C:\Users\sumanth\OneDrive\Documents\swiggy_demographic (2).csv")
df.head()


Unnamed: 0,rider_id,age,ratings,restaurant_latitude,restaurant_longitude,delivery_latitude,delivery_longitude,order_date,weather,traffic,...,time_taken,city_name,order_day,order_month,order_day_of_week,is_weekend,pickup_time_minutes,order_time_hour,order_time_of_day,distance
0,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,sunny,high,...,24,INDO,19,3,saturday,1,15.0,11.0,morning,3.025149
1,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,stormy,jam,...,33,BANG,25,3,friday,0,5.0,19.0,evening,20.18353
2,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,sandstorms,low,...,26,BANG,19,3,saturday,1,15.0,8.0,morning,1.552758
3,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,sunny,medium,...,21,COIMB,5,4,tuesday,0,10.0,18.0,evening,7.790401
4,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,cloudy,high,...,30,CHEN,26,3,saturday,1,15.0,13.0,afternoon,6.210138


In [13]:
df.isnull().sum()


rider_id                   0
age                     1854
ratings                 1908
restaurant_latitude     3630
restaurant_longitude    3630
delivery_latitude       3630
delivery_longitude      3630
order_date                 0
weather                  525
traffic                  510
vehicle_condition          0
type_of_order              0
type_of_vehicle            0
multiple_deliveries      993
festival                 228
city_type               1198
time_taken                 0
city_name                  0
order_day                  0
order_month                0
order_day_of_week          0
is_weekend                 0
pickup_time_minutes     1640
order_time_hour         1640
order_time_of_day          0
distance                3630
dtype: int64

In [14]:
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
)


In [15]:
df["festival"] = (
    df["festival"]
    .str.lower()
    .map({"yes": 1, "no": 0})
    .fillna(0)
)


In [16]:
df["is_peak_hour"] = (
    df["order_time_hour"].between(12, 14) |
    df["order_time_hour"].between(19, 22)
).astype(int)


In [17]:
df["delivery_load"] = df["multiple_deliveries"] * df["distance"]


In [18]:
df["high_demand_day"] = df["is_weekend"] * df["festival"]


In [19]:
# ❌ WRONG — uses target variable
df["avg_speed_kmph"] = df["distance"] / (df["time_taken"] / 60)


In [20]:
print(df[[
    "distance",
    "time_taken",
    "festival",
    "is_weekend",
    "is_peak_hour",
    "delivery_load",
    "high_demand_day"
]].head())


    distance  time_taken  festival  is_weekend  is_peak_hour  delivery_load  \
0   3.025149          24       0.0           1             0       0.000000   
1  20.183530          33       0.0           0             1      20.183530   
2   1.552758          26       0.0           1             0       1.552758   
3   7.790401          21       0.0           0             0       7.790401   
4   6.210138          30       0.0           1             1       6.210138   

   high_demand_day  
0              0.0  
1              0.0  
2              0.0  
3              0.0  
4              0.0  


In [21]:
df["is_weekend"] = df["is_weekend"].fillna(0).astype(int)


In [22]:
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())


In [23]:
cat_cols = df.select_dtypes(include="object").columns
df[cat_cols] = df[cat_cols].fillna("unknown")


In [24]:
df["is_peak_hour"] = (
    df["order_time_hour"].between(12, 14) |
    df["order_time_hour"].between(19, 22)
).astype(int)


In [25]:
df["delivery_load"] = df["multiple_deliveries"] * df["distance"]


In [26]:
df["high_demand_day"] = df["is_weekend"] * df["festival"]


In [27]:
X = df.drop("time_taken", axis=1)
y = df["time_taken"]


In [28]:
num_features = X.select_dtypes(include=np.number).columns
cat_features = X.select_dtypes(include="object").columns


In [29]:
numeric_transformer = Pipeline([
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])


In [30]:
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features)
])


In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [32]:
lr_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

print("Linear Regression")
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print("MAE:", mean_absolute_error(y_test, y_pred_lr))
print("R2 :", r2_score(y_test, y_pred_lr))


Linear Regression
RMSE: 4.2387190491681
MAE: 3.217448348996797
R2 : 0.7957420314867908


In [33]:
rf_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        random_state=42,
        n_jobs=-1
    ))
])

rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest")
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("R2 :", r2_score(y_test, y_pred_rf))
     

Random Forest
RMSE: 1.7801120606087029
MAE: 0.7811328547262476
R2 : 0.9639749634362129


In [34]:
gbr_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        random_state=42
    ))
])

gbr_model.fit(X_train, y_train)
y_pred_gbr = gbr_model.predict(X_test)

print("Gradient Boosting")
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_gbr)))
print("MAE:", mean_absolute_error(y_test, y_pred_gbr))
print("R2 :", r2_score(y_test, y_pred_gbr))


Gradient Boosting
RMSE: 1.9307426852231737
MAE: 1.1246240590572192
R2 : 0.9576202352784959


In [None]:
param_grid = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [4, 6],
    "model__learning_rate": [0.05, 0.1]
}

grid = GridSearchCV(
    gbr_model,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

grid.fit(X_train, y_train)
best_model = grid.best_estimator_


In [None]:
y_pred_best = best_model.predict(X_test)

print("FINAL MODEL")
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_best)))
print("MAE:", mean_absolute_error(y_test, y_pred_best))
print("R2 :", r2_score(y_test, y_pred_best))


FINAL MODEL
RMSE: 1.7299284770653356
MAE: 0.8791368557146113
R2 : 0.9659775142627701


In [None]:
joblib.dump(best_model, "swiggy_eta_model.pkl")


['swiggy_eta_model.pkl']

In [None]:
!pip install streamlit pandas scikit-learn joblib


