In [647]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import classification_report
import joblib

In [649]:
import numpy as np
from geopy.distance import geodesic

In [651]:
df=pd.read_csv('Dataset.csv')

In [653]:
df['Order_Time']=df['Order_Time'].astype(str).str.strip().replace('NaN', '00:00:00')
df['Pickup_Time']=df['Pickup_Time'].astype(str).str.strip().replace('NaN', '00:00:00')

In [655]:
#time order was placed and picked
df['Order_Hour'] = pd.to_datetime(df['Order_Time'], format='%H:%M:%S', errors='coerce').dt.hour
df['Pickup_Hour'] = pd.to_datetime(df['Pickup_Time'], format='%H:%M:%S', errors='coerce').dt.hour

df['Order_Date'] = pd.to_datetime(df['Order_Date'], errors='coerce')
df['Weekday'] = df['Order_Date'].dt.dayofweek  # 0 = Monday, 6 = Sunday

In [657]:
def calculate_distance(row):
    store = (row['Store_Latitude'], row['Store_Longitude'])
    drop = (row['Drop_Latitude'], row['Drop_Longitude'])
    return geodesic(store, drop).km

df['Distance_km'] = df.apply(calculate_distance, axis=1)

# === Date/Weekday ===
df['Order_Date'] = pd.to_datetime(df['Order_Date'], errors='coerce')
df['Weekday'] = df['Order_Date'].dt.dayofweek

# === Maps ===
traffic_map = {'Low': 0, 'Medium': 1, 'High': 2, 'Jam': 3}
weather_map = {'Sunny': 0, 'Cloudy': 1, 'Stormy': 3, 'Sandstorms': 4}

df['Traffic_Level'] = df['Traffic'].map(traffic_map).fillna(0)
df['Weather_Impact'] = df['Weather'].map(weather_map).fillna(0)

# === Delay Flag ===
df['Delay_Flag'] = df['Delivery_Time'].apply(lambda x: 1 if x > 120 else 0)

# === Final Features ===
feature_cols = [
    'Distance_km',
    'Agent_Rating',
    'Traffic_Level',
    'Weather_Impact',
    'Weekday'
]

In [661]:
df = df.dropna(subset=feature_cols + ['Delay_Flag'])

In [663]:
delayed = df[df['Delay_Flag'] == 1]
ontime = df[df['Delay_Flag'] == 0]

delayed_upsampled = resample(
    delayed,
    replace=True,
    n_samples=len(ontime),
    random_state=42
)

df_balanced = pd.concat([ontime, delayed_upsampled])


In [665]:
#from sklearn.preprocessing import LabelEncoder

In [667]:
#le = LabelEncoder()
#df['Time_of_Day_Encoded'] = le.fit_transform(df['Time_of_Day'])

In [669]:
#dict(zip(le.classes_, le.transform(le.classes_)))

In [671]:
df.to_csv('final_dataset.csv', index=False)

In [672]:
X = df_balanced[feature_cols]
y = df_balanced['Delay_Flag']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [675]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=10,
    class_weight='balanced',
    random_state=23
)


In [679]:

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(model.feature_importances_) 


[0.43838874 0.40931129 0.         0.11406005 0.03823993]


In [680]:
importances = model.feature_importances_
for feature, importance in zip(feature_cols, importances):
    print(f"{feature}: {round(importance, 4)}")

Distance_km: 0.4384
Agent_Rating: 0.4093
Traffic_Level: 0.0
Weather_Impact: 0.1141
Weekday: 0.0382


In [683]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.heatmap(df.corr(), annot=True, cmap='coolwarm')


ValueError: could not convert string to float: 'ialx566343618'

In [685]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6760084427767354
Precision: 0.7005479452054795
Recall: 0.6049207475751124
F1 Score: 0.6492319410943252

Confusion Matrix:
 [[3208 1093]
 [1670 2557]]

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.75      0.70      4301
           1       0.70      0.60      0.65      4227

    accuracy                           0.68      8528
   macro avg       0.68      0.68      0.67      8528
weighted avg       0.68      0.68      0.67      8528



In [687]:
import joblib

joblib.dump(model, "delay_model.pkl")
joblib.dump(le, "time_encoder.pkl") 


['time_encoder.pkl']