***** importing dependencies *****

In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

***** importing dataset *****

In [27]:
deliv_df = pd.read_csv('deliverytimehistory.txt')
deliv_df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Type_of_order,Type_of_vehicle,Time_taken(min)
0,4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,Snack,motorcycle,24
1,B379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,Snack,scooter,33
2,5D6D,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,Drinks,motorcycle,26
3,7A6A,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,Buffet,motorcycle,21
4,70A2,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,Snack,scooter,30


***** pre-processing dataset *****

* We start by dropping the ID column since it doesn't contain any useful information

In [28]:
# dropping the ID column
deliv_df.drop('ID', axis=1, inplace=True)
deliv_df.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Type_of_order,Type_of_vehicle,Time_taken(min)
0,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,Snack,motorcycle,24
1,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,Snack,scooter,33
2,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,Drinks,motorcycle,26
3,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,Buffet,motorcycle,21
4,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,Snack,scooter,30


* Next we add a column for distance as we did during EDA. We employ the geopy library that can calculate accurate distances given latitudes and longitudes

In [29]:
from geopy.distance import geodesic

# Calculate the distance between restaurant and delivery location
deliv_df['Distance'] = deliv_df.apply(lambda row: geodesic((row['Restaurant_latitude'], row['Restaurant_longitude']),
                                                            (row['Delivery_location_latitude'], row['Delivery_location_longitude'])).kilometers, axis=1)
deliv_df.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Type_of_order,Type_of_vehicle,Time_taken(min),Distance
0,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,Snack,motorcycle,24,3.020737
1,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,Snack,scooter,33,20.143737
2,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,Drinks,motorcycle,26,1.549693
3,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,Buffet,motorcycle,21,7.774497
4,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,Snack,scooter,30,6.197898


* Now we need to handle the non numerical columns
    - Using the categorical features as is would not be suitable since the Deep Learning Model would not be able to interpret them properly
    - We will use one hot encoding for the Type_of_order and Type_of_vehicle because they have low cardinality and are not ordered in any way
    - The Delivery_person_ID is somewhat more complicated since it has high cardinality so one hot encoding would create a highly dimensional and sparse feature space
    - Consequently, we will use feature hashing in this case

In [30]:
# Perform one-hot encoding on categorical columns
deliv_df_encoded = pd.get_dummies(deliv_df, columns=["Type_of_vehicle", "Type_of_order"])

# Display the encoded DataFrame
deliv_df_encoded.head()


Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Time_taken(min),Distance,Type_of_vehicle_bicycle,Type_of_vehicle_electric_scooter,Type_of_vehicle_motorcycle,Type_of_vehicle_scooter,Type_of_order_Buffet,Type_of_order_Drinks,Type_of_order_Meal,Type_of_order_Snack
0,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,24,3.020737,0,0,1,0,0,0,0,1
1,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,33,20.143737,0,0,0,1,0,0,0,1
2,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,26,1.549693,0,0,1,0,0,1,0,0
3,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,21,7.774497,0,0,1,0,1,0,0,0
4,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,30,6.197898,0,0,0,1,0,0,0,1


In [31]:
from sklearn.feature_extraction import FeatureHasher

# Create an instance of FeatureHasher
hasher = FeatureHasher(n_features=10, input_type='string')

# Extract the Delivery_person_ID column as a list of strings
delivery_person_ids = deliv_df_encoded['Delivery_person_ID'].astype(str).tolist()

# Apply feature hashing to the Delivery_person_ID column
hashed_features = hasher.transform(delivery_person_ids)

# Convert the hashed features to a NumPy array
hashed_features_array = hashed_features.toarray()

# Create a new DataFrame with the hashed features
hashed_df = pd.DataFrame(hashed_features_array, columns=['hashed_feature_{}'.format(i) for i in range(10)])

# Drop the original Delivery_person_ID column
deliv_df_encoded.drop('Delivery_person_ID', axis=1, inplace=True)

# Concatenate the hashed DataFrame with the original DataFrame
processed_df = pd.concat([deliv_df_encoded, hashed_df], axis=1)
processed_df.head()


Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Time_taken(min),Distance,Type_of_vehicle_bicycle,Type_of_vehicle_electric_scooter,...,hashed_feature_0,hashed_feature_1,hashed_feature_2,hashed_feature_3,hashed_feature_4,hashed_feature_5,hashed_feature_6,hashed_feature_7,hashed_feature_8,hashed_feature_9
0,37,4.9,22.745049,75.892471,22.765049,75.912471,24,3.020737,0,0,...,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,-1.0,0.0
1,34,4.5,12.913041,77.683237,13.043041,77.813237,33,20.143737,0,0,...,0.0,0.0,0.0,-1.0,0.0,0.0,1.0,0.0,-1.0,1.0
2,23,4.4,12.914264,77.6784,12.924264,77.6884,26,1.549693,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,1.0
3,38,4.7,11.003669,76.976494,11.053669,77.026494,21,7.774497,0,0,...,2.0,-1.0,0.0,0.0,1.0,0.0,1.0,1.0,-1.0,0.0
4,32,4.6,12.972793,80.249982,13.012793,80.289982,30,6.197898,0,0,...,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,1.0


* Finally, we have to split our dataset into the train and test set

In [32]:
from sklearn.model_selection import train_test_split

# Separate the features and target variable
X = np.array(processed_df.drop('Time_taken(min)', axis=1))
y = np.array(processed_df['Time_taken(min)'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

***** training the model *****

In [33]:
from sklearn.model_selection import train_test_split

# Separate the features and target variable
X = processed_df.drop('Time_taken(min)', axis=1)
y = processed_df['Time_taken(min)']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

* We will use Tensorflow to train the model

In [36]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error

# Define the model architecture
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
num_epochs = 100
model.fit(X_train, y_train, epochs=num_epochs, batch_size=32, verbose=0)

# Evaluate the model on the test set
y_pred = model.predict(X_test).squeeze()

# Calculate evaluation metrics (e.g., RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

RMSE: 7.660766887753134


In [37]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Create the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror')

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics (e.g., RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

RMSE: 7.3673536890513684
