### Importing Libraries 

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
# Load the already-cleaned dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')

In [3]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,...,Distance_KM,Pickup_DateTime,Pickup_delay,Distance_KM_ss,Pickup_delay_ss,Agent_Rating_ss,Weather_le,Traffic_enc,Area_le,Vehicle_le
0,ialx566343618,37,4.9,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,...,3.025149,2022-03-19 11:45:00,15.0,-1.189507,1.225636,0.808376,4,3,1,1
1,akqg208421122,34,4.5,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,...,20.18353,2022-03-25 19:50:00,5.0,1.82293,-1.117159,-0.398841,3,4,2,2
2,njpu434582536,23,4.4,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,...,1.552758,2022-03-19 08:45:00,15.0,-1.44801,1.225636,-0.700645,2,1,1,1
3,rjto796129700,38,4.7,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,...,7.790401,2022-04-05 18:10:00,10.0,-0.352889,0.054238,0.204768,4,2,1,1
4,zguw716275638,32,4.6,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,...,6.210138,2022-03-26 13:45:00,15.0,-0.63033,1.225636,-0.097037,0,3,2,2


In [4]:
# Column dtypes and non-null counts; used below to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43686 entries, 0 to 43685
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Order_ID             43686 non-null  object 
 1   Agent_Age            43686 non-null  int64  
 2   Agent_Rating         43686 non-null  float64
 3   Order_Date           43686 non-null  object 
 4   Order_Time           43648 non-null  object 
 5   Pickup_Time          43686 non-null  object 
 6   Weather              43686 non-null  object 
 7   Traffic              43686 non-null  object 
 8   Vehicle              43686 non-null  object 
 9   Area                 43686 non-null  object 
 10  Delivery_Time        43686 non-null  int64  
 11  Category             43686 non-null  object 
 12  Order_DateTime       43648 non-null  object 
 13  Store_Longitude_rad  43686 non-null  float64
 14  Store_Latitude_rad   43686 non-null  float64
 15  Drop_Longitude_rad   43686 non-null 

In [5]:
# Dropping Order_Date, Order_Time and Pickup_Time because the merged Order_DateTime and Pickup_DateTime columns already carry this information.

In [6]:
# `columns=` already names the axis, so no separate `axis` argument is passed.
df = df.drop(
    columns=[
        'Order_Date',
        'Order_Time',
        'Pickup_Time',
    ]
)

In [7]:
# Confirm which columns remain after the drop above.
df.columns 

Index(['Order_ID', 'Agent_Age', 'Agent_Rating', 'Weather', 'Traffic',
       'Vehicle', 'Area', 'Delivery_Time', 'Category', 'Order_DateTime',
       'Store_Longitude_rad', 'Store_Latitude_rad', 'Drop_Longitude_rad',
       'Drop_Latitude_rad', 'Distance_KM', 'Pickup_DateTime', 'Pickup_delay',
       'Distance_KM_ss', 'Pickup_delay_ss', 'Agent_Rating_ss', 'Weather_le',
       'Traffic_enc', 'Area_le', 'Vehicle_le'],
      dtype='object')

In [8]:
# Convert Order_DateTime and Pickup_DateTime to pandas datetime values.

In [9]:
# Parse both timestamp columns; `assign` keeps the column order and returns
# a frame in which the two columns hold parsed datetimes.
df = df.assign(
    Order_DateTime=pd.to_datetime(df['Order_DateTime']),
    Pickup_DateTime=pd.to_datetime(df['Pickup_DateTime']),
)

In [10]:
# Per-column count of missing values (`isna` is the same check as `isnull`).
df.isna().sum()

Order_ID                0
Agent_Age               0
Agent_Rating            0
Weather                 0
Traffic                 0
Vehicle                 0
Area                    0
Delivery_Time           0
Category                0
Order_DateTime         38
Store_Longitude_rad     0
Store_Latitude_rad      0
Drop_Longitude_rad      0
Drop_Latitude_rad       0
Distance_KM             0
Pickup_DateTime         0
Pickup_delay           38
Distance_KM_ss          0
Pickup_delay_ss        38
Agent_Rating_ss         0
Weather_le              0
Traffic_enc             0
Area_le                 0
Vehicle_le              0
dtype: int64

In [11]:
# Remove the rows whose order timestamp is missing.
# The explicit `.copy()` makes the result an independent frame, so the column
# assignment performed on `df` in the following cell does not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.dropna(subset=['Order_DateTime']).copy()

In [12]:
# Replace the Category labels with integer codes via scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# `fit` returns the encoder itself, so it can be bound in the same statement.
le = LabelEncoder().fit(df['Category'])

df['Category'] = le.transform(df['Category'])

### Building Model 

In [14]:
# Predictor columns for the linear baseline (uses the standardised `*_ss`
# versions of rating, distance and pickup delay).
features = [
    'Agent_Age', 'Agent_Rating_ss',
    'Weather_le', 'Traffic_enc',
    'Vehicle_le', 'Area_le',
    'Category',
    'Distance_KM_ss', 'Pickup_delay_ss',
]

# Design matrix and regression target.
X = df.loc[:, features]
y = df.loc[:, 'Delivery_Time']

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=123,
)

#### Linear Regression (baseline)

In [16]:
# Baseline model: plain linear regression on the training split.

from sklearn.linear_model import LinearRegression

# `fit` returns the estimator, so construction and fitting share one statement.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows, scored in the next cell.
y_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    classification_report,
    r2_score,
)

# Score the linear baseline on the held-out split.
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

MAE: 31.92653224380299
R²: 0.3378785825255922


In [18]:
# Summary statistics of the target, giving scale context for the MAE printed above.
df['Delivery_Time'].describe()

count    43648.000000
mean       124.914475
std         51.933163
min         10.000000
25%         90.000000
50%        125.000000
75%        160.000000
max        270.000000
Name: Delivery_Time, dtype: float64

#### Random Forest Regressor 

In [19]:
# Predictor columns for the tree-based models: the raw (unscaled) rating,
# distance and pickup-delay columns replace the `*_ss` versions used by the
# linear baseline.
features_forest = [
    'Agent_Age',
    'Agent_Rating',
    'Weather_le',
    'Traffic_enc',
    'Vehicle_le',
    'Area_le',
    'Category',
    'Distance_KM',
    'Pickup_delay',
]

# Select the forest-specific columns. The previous version indexed with
# `features`, which left `features_forest` unused and silently reused the
# linear model's inputs.
# NOTE(review): assumes `Pickup_delay` has no NaN left once the rows with a
# missing Order_DateTime were dropped (both showed 38 nulls) — confirm.
X_forest = df[features_forest]

In [20]:
# 80/20 split of the forest inputs with a fixed seed for repeatability.
(X_train_forest, X_test_forest,
 y_train_forest, y_test_forest) = train_test_split(
    X_forest, y,
    test_size=0.2,
    random_state=123,
)

In [21]:
from sklearn.ensemble import RandomForestRegressor

In [22]:
# Untuned random forest with hand-picked leaf/split limits and a fixed seed.
rf = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=123,
)

In [23]:
# Train the forest on the forest training split.
rf.fit(X=X_train_forest, y=y_train_forest)

In [24]:
# Forest predictions for the held-out rows.
y_pred_rf = rf.predict(X=X_test_forest)

In [25]:
# Score the forest against the targets from its own split. MAE previously used
# `y_test` from the linear-model split while R² used `y_test_forest`; both
# metrics now reference the same split.
print("Random Forest Results:")
print("MAE:", mean_absolute_error(y_test_forest, y_pred_rf))
print("R²:", r2_score(y_test_forest, y_pred_rf))

Random Forest Results:
MAE: 17.918771599044778
R²: 0.7971601067803586


#### Hyperparameter-tuning for Random forest

In [26]:
# Hyperparameter tuning: candidate values the randomized search will draw from.

from sklearn.model_selection import RandomizedSearchCV

param_grid = dict(
    n_estimators=[100, 200, 300],
    min_samples_leaf=[2, 3, 4],
    min_samples_split=[5, 10, 15],
)

In [27]:
# Randomized search over `param_grid`: 20 sampled settings, 2-fold
# cross-validation, ranked by R², run on a single worker.
rf_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=123),
    param_distributions=param_grid,
    n_iter=20,
    cv=2,
    scoring='r2',
    n_jobs=1,
    random_state=123,
)

In [28]:
# Run the search on the forest training split only; the test split stays unseen.
rf_search.fit(X=X_train_forest, y=y_train_forest)

In [29]:
# Show the parameter combination the search selected.
print("Best RF Parameters: ",rf_search.best_params_)

Best RF Parameters:  {'n_estimators': 300, 'min_samples_split': 15, 'min_samples_leaf': 2}


In [30]:
# Build the tuned forest directly from the search result instead of re-typing
# the printed values, so this cell cannot drift out of sync when the grid,
# seed or data change and the search selects different parameters.
rf_hyper = RandomForestRegressor(random_state=123, **rf_search.best_params_)

In [31]:
# Fit on the targets that belong to the forest split. The previous version
# paired `X_train_forest` with `y_train` from the linear-model split, which
# only lines up while both splits share the same rows and seed.
rf_hyper.fit(X_train_forest, y_train_forest)

In [32]:
# Tuned-forest predictions for the held-out rows.
y_pred_rf_hyper = rf_hyper.predict(X=X_test_forest)

In [33]:
# Metrics for the tuned forest (`rf_hyper`); the printed heading is the same
# as the one used for the untuned forest earlier in the notebook.
print("Random Forest Results:")
print("MAE:", mean_absolute_error(y_test_forest, y_pred_rf_hyper))
print("R²:", r2_score(y_test_forest, y_pred_rf_hyper))

Random Forest Results:
MAE: 17.704299226249084
R²: 0.8036331507066843


In [34]:
# Now let's do the same for a Gradient Boosting Regressor

from sklearn.ensemble import GradientBoostingRegressor

In [35]:
# Gradient boosting trained with the absolute-error loss and a fixed seed.
gb = GradientBoostingRegressor(
    loss='absolute_error',
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    random_state=123,
)

In [36]:
# Train on the same split that the forest models use.
gb.fit(X=X_train_forest, y=y_train_forest)

In [37]:
# Gradient-boosting predictions for the held-out rows.
y_pred_gb = gb.predict(X=X_test_forest)

In [38]:
# Held-out MAE and R² for the gradient-boosting model.
print('Gradient Boosting Results')
print('MAE: ',mean_absolute_error(y_test_forest,y_pred_gb))
print('R2: ',r2_score(y_test_forest,y_pred_gb))

Gradient Boosting Results
MAE:  18.462515771427004
R2:  0.785207011840308


In [39]:
# Persist the fitted estimators to disk with joblib, starting with gradient boosting.

import joblib

joblib.dump(value=gb, filename='gradient_boosting_model.pkl')

['gradient_boosting_model.pkl']

In [40]:
# Persist the linear-regression baseline.
joblib.dump(value=model, filename='linear_regression_model.pkl')


['linear_regression_model.pkl']

In [41]:
# Persist the untuned random forest.
joblib.dump(value=rf, filename='random_forest_model.pkl')

['random_forest_model.pkl']

In [42]:

# Persist the tuned random forest.
joblib.dump(value=rf_hyper, filename='random_forest_tuned_model.pkl')

['random_forest_tuned_model.pkl']