In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statistics

In [57]:
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [37]:
# !pip install geopy

In [38]:
from geopy.distance import geodesic

In [39]:
#Load Dataset

In [40]:
import os
os.chdir('C:\\Users\\navee\\IPBL\\week4')

In [41]:
df_train = pd.read_csv('deliverytime.txt')

In [42]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45501 entries, 0 to 45500
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45501 non-null  object 
 1   Delivery_person_ID           45501 non-null  object 
 2   Delivery_person_Age          45501 non-null  int64  
 3   Delivery_person_Ratings      45501 non-null  float64
 4   Restaurant_latitude          45501 non-null  float64
 5   Restaurant_longitude         45501 non-null  float64
 6   Delivery_location_latitude   45501 non-null  float64
 7   Delivery_location_longitude  45501 non-null  float64
 8   Type_of_order                45501 non-null  object 
 9   Type_of_vehicle              45501 non-null  object 
 10  Time_taken(min)              45501 non-null  int64  
dtypes: float64(5), int64(2), object(4)
memory usage: 3.8+ MB


In [43]:
#Summary Statistics for numerical columns
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Delivery_person_Age,45501.0,29.54412,5.697346,15.0,25.0,29.0,34.0,50.0
Delivery_person_Ratings,45501.0,4.63234,0.327829,1.0,4.6,4.7,4.8,6.0
Restaurant_latitude,45501.0,17.020161,8.184361,-30.905562,12.933284,18.55144,22.728163,30.914057
Restaurant_longitude,45501.0,70.238395,22.870607,-88.366217,73.17,75.897429,78.044095,88.433452
Delivery_location_latitude,45501.0,17.467833,7.333677,0.01,12.989096,18.634382,22.785207,31.054057
Delivery_location_longitude,45501.0,70.850475,21.112215,0.01,73.280283,76.002574,78.107044,88.563452
Time_taken(min),45501.0,26.29395,9.384537,10.0,19.0,26.0,32.0,54.0


In [44]:
#Summary Statistics for non-numerical columns
df_train.describe(exclude=np.number).T

Unnamed: 0,count,unique,top,freq
ID,45501,45359,6E+09,3
Delivery_person_ID,45501,1320,JAPRES11DEL02,67
Type_of_order,45501,4,Snack,11506
Type_of_vehicle,45501,4,motorcycle,26383


#Key Observations
The Time_Ordered column contains NaN values that require handling.
We need to explore other columns for potential null values.
The Time_taken(min) column should be converted to a numerical value instead of an object.
We need to address the object datatypes in the dataset.
Date and time features also require handling and preprocessing.

In [45]:
#Update Data Types
def update_datatype(df):
    #Update datatype from object to float
    df_train['Delivery_person_Age'] = df_train['Delivery_person_Age'].astype('float64')
    df_train['Delivery_person_Ratings'] = df_train['Delivery_person_Ratings'].astype('float64')
   
    
update_datatype(df_train)

In [46]:
#Drop columns
df_train.drop(['ID','Delivery_person_ID'],axis=1,inplace=True)

In [47]:
#Check for duplicate values
if (len(df_train[df_train.duplicated()])>0):
    print("There are Duplicate values present")
else:
    print("There is no duplicate value present")

There are Duplicate values present


In [48]:
#Handle Missing Values
#Replace NaN to np.nan
df_train.replace('NaN', float(np.nan), regex=True,inplace=True)
#Show count of NaN values in data
df_train.isnull().sum().sort_values(ascending=False)

Delivery_person_Age            0
Delivery_person_Ratings        0
Restaurant_latitude            0
Restaurant_longitude           0
Delivery_location_latitude     0
Delivery_location_longitude    0
Type_of_order                  0
Type_of_vehicle                0
Time_taken(min)                0
dtype: int64

In [49]:
#Handle null values
def handle_null_values(df):
    df['Delivery_person_Age'].fillna(np.random.choice(df['Delivery_person_Age']), inplace=True)
    df['Delivery_person_Ratings'].fillna(df['Delivery_person_Ratings'].median(), inplace=True)
    
handle_null_values(df_train)
df_train.isnull().sum()

Delivery_person_Age            0
Delivery_person_Ratings        0
Restaurant_latitude            0
Restaurant_longitude           0
Delivery_location_latitude     0
Delivery_location_longitude    0
Type_of_order                  0
Type_of_vehicle                0
Time_taken(min)                0
dtype: int64

In [50]:
#Calculate distance between restaurant location & delivery location
def calculate_distance(df):
    df['distance']=np.zeros(len(df))
    restaurant_coordinates=df[['Restaurant_latitude','Restaurant_longitude']].to_numpy()
    delivery_location_coordinates=df[['Delivery_location_latitude','Delivery_location_longitude']].to_numpy()
    df['distance'] = np.array([geodesic(restaurant, delivery) for restaurant, delivery in zip(restaurant_coordinates, delivery_location_coordinates)])
    df['distance']= df['distance'].astype("str").str.extract('(\d+)').astype("int64")
    
calculate_distance(df_train)

In [51]:
#Categorical Feature Encoding
def label_encoding(df):
    categorical_columns = df.select_dtypes(include='object').columns
    label_encoder = LabelEncoder()
    df[categorical_columns] = df[categorical_columns].apply(lambda col: label_encoder.fit_transform(col))

label_encoding(df_train)

In [52]:
#Split Training & Test Data
X = df_train.drop('Time_taken(min)', axis=1)  # Features
y = df_train['Time_taken(min)']  # Target variable

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
#Data Standardisation
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Perform standardization on the training data
X_train = scaler.transform(X_train)

# Perform standardization on the testing data
X_test = scaler.transform(X_test)

In [72]:
from sklearn.model_selection import GridSearchCV#Build the Regrssion Model
models = [LinearRegression(),DecisionTreeRegressor(),XGBRegressor()]

param_grid = [
    {},  
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [100, 200, 300]},
    {'n_estimators': [20, 25, 30], 'max_depth': [5, 7, 9]},
]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.18975401022190078

DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.4047349097675633

XGBRegressor:
Best parameters: {'n_estimators': 100}
Best R2 score: 0.3796120216647073



In [77]:
from sklearn.model_selection import GridSearchCV#Build the Regrssion Model
models = [RandomForestRegressor()]

param_grid = [
    {},  
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [100, 200, 300]},
    {'n_estimators': [20, 25, 30], 'max_depth': [5, 7, 9]},
]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

RandomForestRegressor:
Best parameters: {}
Best R2 score: 0.3284873427489584



In [79]:
from sklearn.model_selection import GridSearchCV#Build the Regrssion Model
models = [LinearRegression(),DecisionTreeRegressor(), RandomForestRegressor(), XGBRegressor()]

param_grid = [
    {},  
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [100, 200, 300]},
    {'n_estimators': [20, 25, 30], 'max_depth': [5, 7, 9]},
]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.18975401022190078

DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.4051245167713658

RandomForestRegressor:
Best parameters: {'n_estimators': 300}
Best R2 score: 0.3326701385557938

XGBRegressor:
Best parameters: {'max_depth': 5, 'n_estimators': 20}
Best R2 score: 0.40880892576448974



In [74]:
# Create a XGB regressor model
model = XGBRegressor(n_estimators=20,max_depth=9)

# Fit the model on the training data
model.fit(X_train, y_train)

In [75]:
#Evaluate Model
# Make predictions on the test data
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", round(mae,2))
print("Mean Squared Error (MSE):", round(mse,2))
print("Root Mean Squared Error (RMSE):", round(rmse,2))
print("R-squared (R2) Score:", round(r2,2))

Mean Absolute Error (MAE): 5.77
Mean Squared Error (MSE): 53.69
Root Mean Squared Error (RMSE): 7.33
R-squared (R2) Score: 0.38
