In [1]:
!pip install geopy mlflow scikit-learn xgboost




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

import mlflow
import mlflow.sklearn


In [3]:
df = pd.read_csv('/content/amazon_delivery.csv')
df.head()


Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys


In [4]:
# Drop duplicates
df.drop_duplicates(inplace=True)

# Convert datetime fields
df['Order_Date'] = pd.to_datetime(df['Order_Date'], errors='coerce')
df['Order_Time'] = pd.to_datetime(df['Order_Time'], format='%H:%M:%S', errors='coerce').dt.time
df['Pickup_Time'] = pd.to_datetime(df['Pickup_Time'], format='%H:%M:%S', errors='coerce').dt.time

# Drop rows with missing critical values
df.dropna(subset=['Store_Latitude', 'Store_Longitude', 'Drop_Latitude', 'Drop_Longitude'], inplace=True)

# Fill categorical NaNs with mode
for col in ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Fill numerical NaNs with median
df['Agent_Age'].fillna(df['Agent_Age'].median(), inplace=True)
df['Agent_Rating'].fillna(df['Agent_Rating'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Agent_Age'].fillna(df['Agent_Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

In [5]:
df.head()


Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys


In [6]:
# Distance calculation
def calc_distance(row):
    store_coords = (row['Store_Latitude'], row['Store_Longitude'])
    drop_coords = (row['Drop_Latitude'], row['Drop_Longitude'])
    return geodesic(store_coords, drop_coords).km

df['Distance_km'] = df.apply(calc_distance, axis=1)

# Combine order date and time
df['Order_DateTime'] = pd.to_datetime(df['Order_Date'].astype(str) + ' ' + df['Order_Time'].astype(str), errors='coerce')
df['Hour'] = df['Order_DateTime'].dt.hour
df['DayOfWeek'] = df['Order_DateTime'].dt.dayofweek


In [7]:
label_cols = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])


In [8]:
features = ['Agent_Age', 'Agent_Rating', 'Distance_km', 'Weather', 'Traffic',
            'Vehicle', 'Area', 'Category', 'Hour', 'DayOfWeek']
target = 'Delivery_Time'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
def train_and_evaluate(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f'{model_name} Results:')
    print(f'R² Score: {r2:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'RMSE: {rmse:.4f}')

    return model, r2, mae, rmse


In [11]:
# Fill any remaining NaNs in features with median values (for both train and test)
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_test.median())


In [12]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=42)
}

results = {}

for name, model in models.items():
    trained_model, r2, mae, rmse = train_and_evaluate(model, name)
    results[name] = {'R2': r2, 'MAE': mae, 'RMSE': rmse}


Linear Regression Results:
R² Score: 0.2389
MAE: 34.7900
RMSE: 45.0282
Random Forest Results:
R² Score: 0.8011
MAE: 17.7784
RMSE: 23.0186
Gradient Boosting Results:
R² Score: 0.7782
MAE: 19.1739
RMSE: 24.3091
XGBoost Results:
R² Score: 0.8109
MAE: 17.4886
RMSE: 22.4471


In [13]:
results_df = pd.DataFrame(results).T
results_df


Unnamed: 0,R2,MAE,RMSE
Linear Regression,0.238895,34.789984,45.028173
Random Forest,0.801101,17.778387,23.018563
Gradient Boosting,0.778174,19.173928,24.309087
XGBoost,0.810853,17.488585,22.44715


In [14]:
sample_input = X_test.iloc[0:1]
sample_prediction = models['Random Forest'].predict(sample_input)
print("Predicted Delivery Time (hours):", sample_prediction[0])


Predicted Delivery Time (hours): 106.8


In [15]:
mlflow.set_experiment("amazon_delivery_prediction")

with mlflow.start_run():
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_metric("rmse", np.sqrt(mean_squared_error(y_test, pred)))
    mlflow.log_metric("r2", r2_score(y_test, pred))
    mlflow.sklearn.log_model(model, "random_forest_model")


2025/06/01 10:46:01 INFO mlflow.tracking.fluent: Experiment with name 'amazon_delivery_prediction' does not exist. Creating a new experiment.


In [16]:
import ipywidgets as widgets
from IPython.display import display

# Create widgets for each input
agent_age = widgets.FloatText(description="Agent Age:", value=30)
agent_rating = widgets.FloatSlider(description="Rating:", min=1.0, max=5.0, step=0.1, value=4.5)
distance_km = widgets.FloatText(description="Distance (km):", value=5.0)

weather = widgets.Dropdown(description="Weather:", options=[(str(i), i) for i in sorted(df['Weather'].unique())])
traffic = widgets.Dropdown(description="Traffic:", options=[(str(i), i) for i in sorted(df['Traffic'].unique())])
vehicle = widgets.Dropdown(description="Vehicle:", options=[(str(i), i) for i in sorted(df['Vehicle'].unique())])
area = widgets.Dropdown(description="Area:", options=[(str(i), i) for i in sorted(df['Area'].unique())])
category = widgets.Dropdown(description="Category:", options=[(str(i), i) for i in sorted(df['Category'].unique())])

hour = widgets.IntSlider(description="Hour:", min=0, max=23, value=14)
day_of_week = widgets.IntSlider(description="Day:", min=0, max=6, value=2)

run_button = widgets.Button(description="Predict")

# Layout all inputs
input_widgets = widgets.VBox([
    agent_age, agent_rating, distance_km,
    weather, traffic, vehicle, area, category,
    hour, day_of_week, run_button
])
display(input_widgets)

# Handler for prediction
def on_button_clicked(b):
    user_input = np.array([[agent_age.value,
                            agent_rating.value,
                            distance_km.value,
                            weather.value,
                            traffic.value,
                            vehicle.value,
                            area.value,
                            category.value,
                            hour.value,
                            day_of_week.value]])

    print("\n📦 Delivery Time Predictions (in hours):")
    for name, model in models.items():
        try:
            pred = model.predict(user_input)[0]
            print(f"{name}: {pred:.2f} hours")
        except Exception as e:
            print(f"{name}: Error - {e}")

# Link button click to prediction
run_button.on_click(on_button_clicked)


VBox(children=(FloatText(value=30.0, description='Agent Age:'), FloatSlider(value=4.5, description='Rating:', …


📦 Delivery Time Predictions (in hours):
Linear Regression: 143.66 hours
Random Forest: 180.40 hours
Gradient Boosting: 184.99 hours
XGBoost: 183.77 hours


