In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns

%matplotlib inline

# display all the columns of the dataframe
pd.pandas.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('data/food_delivery.csv')
df.head()

Unnamed: 0,id,delivery_person_id,delivery_person_age,delivery_person_ratings,restaurant_latitude,restaurant_longitude,delivery_location_latitude,delivery_location_longitude,order_date,time_orderd,time_order_picked,weather_conditions,road_traffic_density,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city,preparation_time_min,distance_to_delivery_km,time_taken_min
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,2022-12-02 21:55:00,2022-12-02 22:10:00,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,15.0,10.29,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,2022-02-13 14:55:00,2022-02-13 15:05:00,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,10.0,6.25,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,2022-04-03 17:30:00,2022-04-03 17:40:00,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,10.0,13.8,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,2022-02-13 09:20:00,2022-02-13 09:30:00,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,10.0,2.93,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,2022-02-14 19:50:00,2022-02-14 20:05:00,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,15.0,19.42,41


In [3]:
df.drop(columns=['id','delivery_person_id' ,'restaurant_latitude','restaurant_longitude','delivery_location_latitude','delivery_location_longitude','order_date','time_orderd','time_order_picked','type_of_order'], axis=1, inplace= True)
df.head()

Unnamed: 0,delivery_person_age,delivery_person_ratings,weather_conditions,road_traffic_density,vehicle_condition,type_of_vehicle,multiple_deliveries,festival,city,preparation_time_min,distance_to_delivery_km,time_taken_min
0,36.0,4.2,Fog,Jam,2,motorcycle,3.0,No,Metropolitian,15.0,10.29,46
1,21.0,4.7,Stormy,High,1,motorcycle,1.0,No,Metropolitian,10.0,6.25,23
2,23.0,4.7,Sandstorms,Medium,1,scooter,1.0,No,Metropolitian,10.0,13.8,21
3,34.0,4.3,Sandstorms,Low,0,motorcycle,0.0,No,Metropolitian,10.0,2.93,20
4,24.0,4.7,Fog,Jam,1,scooter,1.0,No,Metropolitian,15.0,19.42,41


In [4]:
df.shape

(35690, 12)

In [5]:
df.isnull().sum()

delivery_person_age        175
delivery_person_ratings    212
weather_conditions           0
road_traffic_density         0
vehicle_condition            0
type_of_vehicle              0
multiple_deliveries        768
festival                   178
city                       936
preparation_time_min         0
distance_to_delivery_km      0
time_taken_min               0
dtype: int64

df = df.dropna()

In [6]:
df.shape

(35690, 12)

In [7]:
## independent and dependent features

X = df.drop(labels=['time_taken_min'], axis=1)
Y = df[['time_taken_min']]

In [8]:
X.head()

Unnamed: 0,delivery_person_age,delivery_person_ratings,weather_conditions,road_traffic_density,vehicle_condition,type_of_vehicle,multiple_deliveries,festival,city,preparation_time_min,distance_to_delivery_km
0,36.0,4.2,Fog,Jam,2,motorcycle,3.0,No,Metropolitian,15.0,10.29
1,21.0,4.7,Stormy,High,1,motorcycle,1.0,No,Metropolitian,10.0,6.25
2,23.0,4.7,Sandstorms,Medium,1,scooter,1.0,No,Metropolitian,10.0,13.8
3,34.0,4.3,Sandstorms,Low,0,motorcycle,0.0,No,Metropolitian,10.0,2.93
4,24.0,4.7,Fog,Jam,1,scooter,1.0,No,Metropolitian,15.0,19.42


In [9]:
Y.head()

Unnamed: 0,time_taken_min
0,46
1,23
2,21
3,20
4,41


In [10]:
# define which columns should be ordinal-encoding and which should be scaled

categorical_columns = X.select_dtypes(include='object').columns
numerical_columns = X.select_dtypes(exclude='object').columns
print(categorical_columns)

Index(['weather_conditions', 'road_traffic_density', 'type_of_vehicle',
       'festival', 'city'],
      dtype='object')


In [11]:
numerical_columns

Index(['delivery_person_age', 'delivery_person_ratings', 'vehicle_condition',
       'multiple_deliveries', 'preparation_time_min',
       'distance_to_delivery_km'],
      dtype='object')

In [12]:
for feature in categorical_columns:
    print(f"{feature} : {df[feature].unique()}")
    print(60*'=')

weather_conditions : ['Fog' 'Stormy' 'Sandstorms' 'Windy' 'Cloudy' 'Sunny']
road_traffic_density : ['Jam' 'High' 'Medium' 'Low']
type_of_vehicle : ['motorcycle' 'scooter' 'electric_scooter']
festival : ['No' 'Yes' nan]
city : ['Metropolitian' 'Urban' 'Semi-Urban' nan]


In [13]:
weather_category = ['Sunny','Fog','Cloudy','Windy','Sandstorms','Stormy']
road_traffic_category = ['Low','Medium','High', 'Jam']
vehilcle_type = ['electric_scooter','scooter','motorcycle']
festival_category = ['No', 'Yes']
city_category = ['Urban','Semi-Urban','Metropolitian']

In [14]:
from sklearn.impute import SimpleImputer # to handle missing values
from sklearn.preprocessing import StandardScaler # to handle feature scaling
from sklearn.preprocessing import OrdinalEncoder # to handle ordinal encoding
# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
## Numerical pipeline
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median'))
           ,('scalar', StandardScaler())
           ]
)
numerical_pipeline

In [16]:
##  categorical_pipeline

categorical_pipeline = Pipeline(
    steps= [
        ('imputer', SimpleImputer(strategy='most_frequent'))
        , ('ordinalencoder', OrdinalEncoder(categories=[weather_category,road_traffic_category,vehilcle_type,festival_category,city_category]))
       , ('scaler', StandardScaler())


    ]



)

categorical_pipeline

In [17]:
preprocessor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, [i for i in numerical_columns])
    ,('categorical_pipeline', categorical_pipeline, [i for i in categorical_columns])


])
preprocessor

In [18]:
# train test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.30, random_state= 30)


In [19]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_train.head()

Unnamed: 0,numerical_pipeline__delivery_person_age,numerical_pipeline__delivery_person_ratings,numerical_pipeline__vehicle_condition,numerical_pipeline__multiple_deliveries,numerical_pipeline__preparation_time_min,numerical_pipeline__distance_to_delivery_km,categorical_pipeline__weather_conditions,categorical_pipeline__road_traffic_density,categorical_pipeline__type_of_vehicle,categorical_pipeline__festival,categorical_pipeline__city
0,0.423585,0.846791,1.227298,0.443958,0.00049,1.302668,-1.471261,-1.107248,-0.783789,-0.141442,-1.869055
1,-0.445037,0.846791,-1.222591,0.443958,-1.223569,-0.376328,0.285351,1.301108,0.770661,-0.141442,0.537759
2,1.639656,0.206989,1.227298,-1.312401,1.224549,-0.102424,-0.300187,-0.304463,0.770661,-0.141442,-1.869055
3,-0.618762,-0.112912,-1.222591,-1.312401,1.224549,-1.468386,-1.471261,-1.107248,0.770661,-0.141442,-1.869055
4,-0.445037,-0.112912,-1.222591,-1.312401,1.224549,0.425819,0.870888,1.301108,0.770661,-0.141442,-1.869055


In [20]:
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())
X_test

Unnamed: 0,numerical_pipeline__delivery_person_age,numerical_pipeline__delivery_person_ratings,numerical_pipeline__vehicle_condition,numerical_pipeline__multiple_deliveries,numerical_pipeline__preparation_time_min,numerical_pipeline__distance_to_delivery_km,categorical_pipeline__weather_conditions,categorical_pipeline__road_traffic_density,categorical_pipeline__type_of_vehicle,categorical_pipeline__festival,categorical_pipeline__city
0,0.423585,-3.311923,1.227298,0.443958,1.224549,0.443605,1.456426,1.301108,-2.338239,7.070058,0.537759
1,0.076136,1.166692,-1.222591,-1.312401,-1.223569,-1.466608,-0.885724,-1.107248,0.770661,-0.141442,0.537759
2,1.292208,0.206989,-1.222591,0.443958,1.224549,1.176387,-0.885724,-0.304463,0.770661,-0.141442,0.537759
3,-0.271313,0.846791,1.227298,0.443958,-1.223569,0.427598,1.456426,-1.107248,-0.783789,-0.141442,0.537759
4,-1.661109,0.206989,-1.222591,-1.312401,1.224549,-0.057959,1.456426,1.301108,0.770661,-0.141442,0.537759
...,...,...,...,...,...,...,...,...,...,...,...
10702,-1.487384,0.206989,-1.222591,-1.312401,0.000490,-0.385221,0.870888,-0.304463,0.770661,-0.141442,0.537759
10703,0.249861,-3.631824,0.002353,0.443958,0.000490,0.383133,1.456426,-0.304463,0.770661,-0.141442,0.537759
10704,-1.313660,-2.032318,-1.222591,-1.312401,-1.223569,-0.931250,0.870888,-1.107248,0.770661,-0.141442,0.537759
10705,-0.618762,-1.072615,1.227298,0.443958,1.224549,-0.929472,-0.300187,1.301108,-0.783789,-0.141442,0.537759


In [21]:
## Model Training

In [22]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [23]:
import numpy as np

def evaluate_model(true, predicted):

    mae = mean_absolute_error(true, predicted)
    mse = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)

    return mae, rmse, r2_square

In [24]:
## train_multiple models

models = {
    'Linearregression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'SupportVectorRegressor': SVR(kernel='linear')
}

trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    # make_predictions

    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print('RMSE', rmse)
    print('MAE', mae)
    print('R2 Score', r2_square*100)

    r2_list.append(r2_square)

    print('='*40)
    print('\n')

Linearregression
Model Training Performance
RMSE 6.417340548904858
MAE 5.073769718428397
R2 Score 53.45972197454223


Lasso
Model Training Performance
RMSE 6.825279534764193
MAE 5.437813007079364
R2 Score 47.354690154586244


Ridge
Model Training Performance
RMSE 6.417339945462555
MAE 5.0737693651164735
R2 Score 53.459730727193254


ElasticNet
Model Training Performance
RMSE 6.911606867810435
MAE 5.52785233702329
R2 Score 46.01453391946785


DecisionTreeRegressor
Model Training Performance
RMSE 5.230401784750799
MAE 4.007471747454936
R2 Score 69.08360103808808




  model.fit(X_train,y_train)


RandomForestRegressor
Model Training Performance
RMSE 3.8745300213910867
MAE 3.0797972132160987
R2 Score 83.03488293264265




  y = column_or_1d(y, warn=True)


SupportVectorRegressor
Model Training Performance
RMSE 6.443342345577655
MAE 5.061890473076858
R2 Score 53.081813885456064


