In [1]:
import pandas as pd 
import numpy as np
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import (LinearRegression,Lasso,ElasticNet,Ridge)
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


In [2]:
from xgboost import XGBRegressor

In [3]:
df = pd.read_csv('data/clean_data.csv')
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Order_pickup_time,Year,Month,Day
0,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,0.5,2022,2,13
1,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,0.1,2022,4,3
2,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,0.1,2022,2,13
3,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,0.55,2022,2,14
4,29.0,4.5,Sandstorms,Jam,2,Buffet,electric_scooter,1.0,No,Metropolitian,20,0.1,2022,2,4


In [80]:
df['City'].value_counts()

Metropolitian    23559
Urban             7126
Semi-Urban          59
Name: City, dtype: int64

In [4]:
## Independent and dependent features
X = df.drop(labels=['Time_taken (min)'],axis=1)
Y = df[['Time_taken (min)']]

In [5]:
Y.head()

Unnamed: 0,Time_taken (min)
0,23
1,21
2,20
3,41
4,20


In [6]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include=['object','category']).columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [7]:
categorical_cols

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

In [8]:
weather_col = ['Fog', 'Stormy', 'Sandstorms', 'Windy', 'Cloudy', 'Sunny']
road_trf_col = ['Jam', 'High', 'Medium', 'Low']
type_ord_col = ['Snack', 'Meal', 'Drinks', 'Buffet']
type_vech_col = ['motorcycle', 'scooter', 'electric_scooter', 'bicycle']
festival_col = ['No', 'Yes']
city_col = ['Metropolitian', 'Urban', 'Semi-Urban']

In [9]:
numerical_cols

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition',
       'multiple_deliveries', 'Order_pickup_time', 'Year', 'Month', 'Day'],
      dtype='object')

In [10]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer,SimpleImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler ,PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
from sklearn.preprocessing import PowerTransformer
# scaler = PowerTransformer(method = 'box-cox')
# transformer = FunctionTransformer(np.log2, validate = True)

In [12]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('Standarize',StandardScaler()),


    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('OneHotencoder',OneHotEncoder(drop='first',sparse=False)),
    ('Standarize',StandardScaler()),

    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [13]:
## Train test split

from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test=train_test_split(X,Y,train_size=0.70,random_state=30)

In [14]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())



In [15]:
# No Na value Now 
X_train.isna().sum()

num_pipeline__Delivery_person_Age                 0
num_pipeline__Delivery_person_Ratings             0
num_pipeline__Vehicle_condition                   0
num_pipeline__multiple_deliveries                 0
num_pipeline__Order_pickup_time                   0
num_pipeline__Year                                0
num_pipeline__Month                               0
num_pipeline__Day                                 0
cat_pipeline__Weather_conditions_Fog              0
cat_pipeline__Weather_conditions_Sandstorms       0
cat_pipeline__Weather_conditions_Stormy           0
cat_pipeline__Weather_conditions_Sunny            0
cat_pipeline__Weather_conditions_Windy            0
cat_pipeline__Road_traffic_density_Jam            0
cat_pipeline__Road_traffic_density_Low            0
cat_pipeline__Road_traffic_density_Medium         0
cat_pipeline__Type_of_order_Drinks                0
cat_pipeline__Type_of_order_Meal                  0
cat_pipeline__Type_of_order_Snack                 0
cat_pipeline

In [16]:
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__Order_pickup_time,num_pipeline__Year,num_pipeline__Month,num_pipeline__Day,cat_pipeline__Weather_conditions_Fog,cat_pipeline__Weather_conditions_Sandstorms,...,cat_pipeline__Road_traffic_density_Medium,cat_pipeline__Type_of_order_Drinks,cat_pipeline__Type_of_order_Meal,cat_pipeline__Type_of_order_Snack,cat_pipeline__Type_of_vehicle_electric_scooter,cat_pipeline__Type_of_vehicle_motorcycle,cat_pipeline__Type_of_vehicle_scooter,cat_pipeline__Festival_Yes,cat_pipeline__City_Semi-Urban,cat_pipeline__City_Urban
0,-1.311544,-0.660873,-1.220275,-1.367578,-0.470554,0.0,-0.121201,1.228795,-0.4504,2.269827,...,-0.563656,1.72721,-0.574862,-0.57862,-0.296976,0.843826,-0.70809,-0.12134,-0.04256,1.850089
1,-1.487547,1.346146,-1.220275,0.526678,-0.470554,0.0,-0.908204,-1.195938,-0.4504,-0.440562,...,-0.563656,1.72721,-0.574862,-0.57862,-0.296976,0.843826,-0.70809,-0.12134,-0.04256,1.850089
2,-0.079526,0.543338,-1.220275,0.526678,-0.463156,0.0,-0.121201,1.439641,-0.4504,-0.440562,...,-0.563656,1.72721,-0.574862,-0.57862,-0.296976,0.843826,-0.70809,-0.12134,-0.04256,-0.540514
3,-1.311544,0.543338,1.19171,-1.367578,-0.396573,0.0,-0.121201,-1.195938,-0.4504,2.269827,...,-0.563656,1.72721,-0.574862,-0.57862,-0.296976,-1.185079,1.41225,-0.12134,-0.04256,-0.540514
4,0.096477,-1.062277,1.19171,0.526678,-0.470554,0.0,-0.908204,-1.195938,-0.4504,2.269827,...,-0.563656,-0.578968,-0.574862,1.728249,-0.296976,-1.185079,1.41225,-0.12134,-0.04256,-0.540514


In [17]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [18]:
df.shape

(31605, 15)

In [19]:
## Train multiple models
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(alpha=0.05,max_iter=10000),
    'Ridge':Ridge(max_iter=1000,solver='svd'),
    'Elasticnet':ElasticNet(alpha=0.05,max_iter=5000),
    'RFR': RandomForestRegressor(max_depth=10, n_estimators=1000, random_state=123,verbose=0)
    
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 6.119271975902929
MAE: 4.874120401857369
R2 score 52.068039808756495


Lasso
Model Training Performance
RMSE: 6.123309108947694
MAE: 4.874835766512536
R2 score 52.00477361185993


Ridge
Model Training Performance
RMSE: 6.119316188682635
MAE: 4.874148509251259
R2 score 52.067347173142


Elasticnet
Model Training Performance
RMSE: 6.124388897219548
MAE: 4.877167347078672
R2 score 51.98784510113619




  model.fit(X_train,y_train)


RFR
Model Training Performance
RMSE: 5.121045125259645
MAE: 4.012922512418035
R2 score 66.43065379378139




In [20]:
model_list


['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet', 'RFR']

In [21]:
import seaborn as sns
report = {}

for i in range(len(models)):
    model = list(models.values())[i]
    # Train model
    model.fit(X_train,y_train)
    print(model.score(X_train,y_train))
    print(model.score(X_test,y_test))
    print()

    # Predict Testing data
    y_test_pred =model.predict(X_test)
    
    # Get R2 scores for train and test data
    #train_model_score = r2_score(ytrain,y_train_pred)
    test_model_score = r2_score(y_test,y_test_pred)

    report[list(models.keys())[i]] =  test_model_score


0.5342918099696976
0.5206803980875649

0.5334556339092381
0.5200477361185993

0.5342917490405594
0.52067347173142

0.5333278415234382
0.5198784510113619



  model.fit(X_train,y_train)


0.7078462596579354
0.6643065379378139



In [22]:
report

{'LinearRegression': 0.5206803980875649,
 'Lasso': 0.5200477361185993,
 'Ridge': 0.52067347173142,
 'Elasticnet': 0.5198784510113619,
 'RFR': 0.6643065379378139}

In [23]:
y_test_pred.shape

(9482,)

In [24]:
y_test_pred[0]

20.839392847515356

In [28]:
#  Max R2 
BEST_Model_SCORE  = max(sorted(report.values())) 

# Best R2 Index Get
BEST_model_name = list(report.keys())[list(report.values()).index(BEST_Model_SCORE)]

#
best_model = models[BEST_model_name]



RandomForestRegressor(max_depth=10, n_estimators=1000, random_state=123)


In [40]:
predict_new_data = X_test.head(1)
print(best_model.predict(predict_new_data))


[20.83939285]


In [111]:
df.columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
       'Time_taken (min)', 'Order_pickup_time', 'Year', 'Month', 'Day'],
      dtype='object')

In [109]:
best_model.predict(preprocessor.transform(df.drop(columns="Time_taken (min)")))



array([22.75701709, 24.18300742, 26.92147533, ..., 24.17842488,
       29.14778987, 25.86304492])

In [101]:
best_model.predict(preprocessor.transform(df.head(1)))



array([22.75701709])