In [1]:
import pandas as pd

In [2]:
import numpy as np
from geopy.distance import geodesic

In [4]:
df=pd.read_csv('Dataset.csv')

In [5]:
df['Order_Time']=df['Order_Time'].astype(str).str.strip().replace('NaN', '00:00:00')
df['Pickup_Time']=df['Pickup_Time'].astype(str).str.strip().replace('NaN', '00:00:00')

In [None]:

#time order was placed and picked
df['Order_Hour'] = pd.to_datetime(df['Order_Time'], format='%H:%M:%S', errors='coerce').dt.hour
df['Pickup_Hour'] = pd.to_datetime(df['Pickup_Time'], format='%H:%M:%S', errors='coerce').dt.hour

df['Order_Date'] = pd.to_datetime(df['Order_Date'], errors='coerce')
df['Weekday'] = df['Order_Date'].dt.dayofweek  # 0 = Monday, 6 = Sunday

#Calculate Distance Using Geopy
def calculate_distance(row):
    store = (row['Store_Latitude'], row['Store_Longitude'])
    drop = (row['Drop_Latitude'], row['Drop_Longitude'])
    return geodesic(store, drop).km

df['Distance_km'] = df.apply(calculate_distance, axis=1)

#Delivery Speed
df['Delivery_Speed'] = df['Distance_km'] / (df['Delivery_Time'] + 1e-6)

#Delay Flag: 1 if delivery took > 120 mins
df['Delay_Flag'] = df['Delivery_Time'].apply(lambda x: 1 if x > 120 else 0)

# Time of Day
def get_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

df['Time_of_Day'] = df['Order_Hour'].apply(get_time_of_day)

df['Agent_Efficiency'] = df['Agent_Rating'] / (df['Delivery_Time'] + 1e-6)

# Weather & Traffic Mapping
weather_map = {'Sunny': 0, 'Cloudy': 1, 'Stormy': 3, 'Sandstorms': 4}
traffic_map = {'Low': 0, 'Medium': 1, 'High': 2, 'Jam': 3}

df['Weather_Impact'] = df['Weather'].map(weather_map).fillna(0)
df['Traffic_Level'] = df['Traffic'].map(traffic_map).fillna(0)

# Final Feature Columns to use
feature_cols = [
    'Distance_km', 'Delivery_Speed', 'Agent_Rating', 'Agent_Efficiency',
    'Traffic_Level', 'Weather_Impact', 'Weekday', 'Time_of_Day'
]

df[feature_cols + ['Delay_Flag']].head()


Unnamed: 0,Distance_km,Delivery_Speed,Agent_Rating,Agent_Efficiency,Traffic_Level,Weather_Impact,Weekday,Time_of_Day,Delay_Flag
0,3.020737,0.025173,4.9,0.040833,0.0,0.0,5,Morning,0
1,20.143737,0.122083,4.5,0.027273,0.0,3.0,4,Evening,1
2,1.549693,0.011921,4.4,0.033846,0.0,4.0,5,Morning,1
3,7.774497,0.074043,4.7,0.044762,0.0,0.0,1,Evening,0
4,6.197898,0.041319,4.6,0.030667,0.0,1.0,5,Afternoon,1


In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
le = LabelEncoder()
df['Time_of_Day_Encoded'] = le.fit_transform(df['Time_of_Day'])

In [12]:
dict(zip(le.classes_, le.transform(le.classes_)))

{'Afternoon': 0, 'Evening': 1, 'Morning': 2, 'Night': 3}

In [17]:
df.to_csv('final_dataset.csv', index=False)

In [19]:
from sklearn.model_selection import train_test_split

X = df[['Distance_km', 'Delivery_Speed', 'Agent_Rating', 'Agent_Efficiency',
        'Traffic_Level', 'Weather_Impact', 'Weekday', 'Time_of_Day_Encoded']]  # features

y = df['Delay_Flag']  # target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9989711934156379
Precision: 0.9991093297706524
Recall: 0.998886910062333
F1 Score: 0.9989981075364578

Confusion Matrix:
 [[4252    4]
 [   5 4487]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4256
           1       1.00      1.00      1.00      4492

    accuracy                           1.00      8748
   macro avg       1.00      1.00      1.00      8748
weighted avg       1.00      1.00      1.00      8748



In [77]:
import joblib

joblib.dump(model, "delay_model.pkl")
joblib.dump(le, "time_encoder.pkl")  # if you used LabelEncoder


['time_encoder.pkl']