In [1]:
# for analysis of data importing necessary libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt   
import seaborn as sns
%matplotlib inline
import missingno as msno

from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder # Ordinal Encoding
from sklearn.preprocessing import LabelEncoder
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings    
warnings.filterwarnings("ignore") 

In [2]:
## Data Ingestions step
df=pd.read_csv('data/clean_data.csv')

In [3]:
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance,Prepn_Time
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.271464,15
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.229376,10
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.764306,10
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.927795,10
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.373484,15


In [4]:
df.shape

(45584, 13)

In [5]:
## Independent and dependent features
X = df.drop(labels=['Time_taken (min)'],axis=1)
Y = df[['Time_taken (min)']]

In [6]:
Y

Unnamed: 0,Time_taken (min)
0,46
1,23
2,21
3,20
4,41
...,...
45579,32
45580,36
45581,16
45582,26


In [7]:
num_cols = ['Delivery_person_Age', 'Delivery_person_Ratings','distance']

In [8]:
num_cols1 = ['Vehicle_condition','multiple_deliveries','Prepn_Time']

In [9]:
ordinal_catcols = ['Road_traffic_density']

In [10]:
nominal_catcols = ['Weather_conditions','City','Type_of_order','Type_of_vehicle','Festival']

In [11]:
# Define the custom ranking for each ordinal variable
traffic_density_categories = ['Low', 'Medium', 'High','Jam']

In [12]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)
num_pipeline1=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('scaler',StandardScaler())

    ]

)
# Categorigal Pipeline
ordinalcat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[traffic_density_categories])),
    ('scaler',StandardScaler())
    ]

)
# Categorigal Pipeline
nominalcat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehotencoder',OneHotEncoder(handle_unknown = "ignore")),
    ('scaler',StandardScaler(with_mean=False))
    ]

)
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,num_cols),
('num_pipeline1',num_pipeline1,num_cols1),
('ordinalcat_pipeline',ordinalcat_pipeline,ordinal_catcols),
('nominalcat_pipeline',nominalcat_pipeline,nominal_catcols),
])

In [13]:
## Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [14]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [15]:
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__distance,num_pipeline1__Vehicle_condition,num_pipeline1__multiple_deliveries,num_pipeline1__Prepn_Time,ordinalcat_pipeline__Road_traffic_density,nominalcat_pipeline__Weather_conditions_Cloudy,nominalcat_pipeline__Weather_conditions_Fog,nominalcat_pipeline__Weather_conditions_Sandstorms,...,nominalcat_pipeline__Type_of_order_Buffet,nominalcat_pipeline__Type_of_order_Drinks,nominalcat_pipeline__Type_of_order_Meal,nominalcat_pipeline__Type_of_order_Snack,nominalcat_pipeline__Type_of_vehicle_bicycle,nominalcat_pipeline__Type_of_vehicle_electric_scooter,nominalcat_pipeline__Type_of_vehicle_motorcycle,nominalcat_pipeline__Type_of_vehicle_scooter,nominalcat_pipeline__Festival_No,nominalcat_pipeline__Festival_Yes
0,1.124025,-1.62344,0.155639,-1.223528,-1.318236,0.0874,1.308057,0.0,0.0,2.707135,...,2.320264,0.0,0.0,0.0,0.0,0.0,2.025488,0.0,7.138257,0.0
1,-0.106505,-3.143059,-0.11421,-1.223528,3.943714,-1.041648,1.308057,0.0,0.0,0.0,...,2.320264,0.0,0.0,0.0,0.0,0.0,2.025488,0.0,0.0,7.138257
2,0.245075,0.504025,-1.20154,1.160323,0.435747,0.0874,-1.093916,0.0,0.0,0.0,...,0.0,0.0,2.307572,0.0,0.0,0.0,0.0,2.11791,7.138257,0.0
3,0.596655,-1.319517,1.844238,1.160323,0.435747,1.216448,1.308057,0.0,0.0,0.0,...,0.0,0.0,2.307572,0.0,0.0,0.0,0.0,2.11791,7.138257,0.0
4,0.772445,-1.62344,-1.188474,-1.223528,0.435747,1.216448,-1.093916,0.0,0.0,0.0,...,0.0,0.0,2.307572,0.0,0.0,0.0,2.025488,0.0,7.138257,0.0


In [16]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [17]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [18]:
regression.coef_

array([[ 2.31396406e+00, -2.22469802e+00,  1.80915274e+00,
        -1.69797375e+00,  1.72238722e+00, -1.55805309e-01,
         2.85343698e+00, -3.56019426e+12, -3.68670923e+12,
        -3.53361510e+12, -3.55129350e+12, -3.51103187e+12,
        -3.54098217e+12,  6.36816673e+13,  9.02872098e+12,
         6.33242897e+13, -2.54691903e+13, -2.55654193e+13,
        -2.56092709e+13, -2.57097291e+13, -5.80950384e+12,
        -4.34386815e+13, -7.72916292e+13, -7.39187392e+13,
        -1.04240152e+13, -1.04240152e+13]])

In [19]:
regression.intercept_

array([1.4696191e+14])

In [20]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [21]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}

model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 6.113630379053705
MAE: 4.8799539338988005
R2 score 56.78766086865643


Lasso
Model Training Performance
RMSE: 6.74247896909708
MAE: 5.373010377079405
R2 score 47.44081527867902


Ridge
Model Training Performance
RMSE: 6.112931684610096
MAE: 4.879355633182105
R2 score 56.79753732310985


Elasticnet
Model Training Performance
RMSE: 6.682497481245265
MAE: 5.356315392279718
R2 score 48.371794924156134


