In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
data = pd.read_csv('Food_Delivery_Times.csv')

In [8]:
data

Unnamed: 0,Order_ID,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,522,7.93,Windy,Low,Afternoon,Scooter,12,1.0,43
1,738,16.42,Clear,Medium,Evening,Bike,20,2.0,84
2,741,9.52,Foggy,Low,Night,Scooter,28,1.0,59
3,661,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0,37
4,412,19.03,Clear,Low,Morning,Bike,16,5.0,68
...,...,...,...,...,...,...,...,...,...
995,107,8.50,Clear,High,Evening,Car,13,3.0,54
996,271,16.28,Rainy,Low,Morning,Scooter,8,9.0,71
997,861,15.62,Snowy,High,Evening,Scooter,26,2.0,81
998,436,14.17,Clear,Low,Afternoon,Bike,8,0.0,55


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Order_ID                1000 non-null   int64  
 1   Distance_km             1000 non-null   float64
 2   Weather                 970 non-null    object 
 3   Traffic_Level           970 non-null    object 
 4   Time_of_Day             970 non-null    object 
 5   Vehicle_Type            1000 non-null   object 
 6   Preparation_Time_min    1000 non-null   int64  
 7   Courier_Experience_yrs  970 non-null    float64
 8   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 70.4+ KB


In [10]:
data.describe()

Unnamed: 0,Order_ID,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
count,1000.0,1000.0,1000.0,970.0,1000.0
mean,500.5,10.05997,16.982,4.579381,56.732
std,288.819436,5.696656,7.204553,2.914394,22.070915
min,1.0,0.59,5.0,0.0,8.0
25%,250.75,5.105,11.0,2.0,41.0
50%,500.5,10.19,17.0,5.0,55.5
75%,750.25,15.0175,23.0,7.0,71.0
max,1000.0,19.99,29.0,9.0,153.0


In [11]:
# missing value
# scaling
# feature engineering (selection, extraction, ...)
# encoding (Weather, Traffic_Level, Time_of_Day, Vehicle_Type)

In [12]:
data.duplicated().sum()

np.int64(0)

# Drop "Order_ID" column

In [13]:
data.drop('Order_ID', axis = 1, inplace = True)

In [14]:
data

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0,43
1,16.42,Clear,Medium,Evening,Bike,20,2.0,84
2,9.52,Foggy,Low,Night,Scooter,28,1.0,59
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0,37
4,19.03,Clear,Low,Morning,Bike,16,5.0,68
...,...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0,54
996,16.28,Rainy,Low,Morning,Scooter,8,9.0,71
997,15.62,Snowy,High,Evening,Scooter,26,2.0,81
998,14.17,Clear,Low,Afternoon,Bike,8,0.0,55


# Splitting data to Features and Target

In [15]:
data_target = data['Delivery_Time_min']

In [16]:
data.drop('Delivery_Time_min', axis = 1, inplace = True)

In [17]:
data

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0
1,16.42,Clear,Medium,Evening,Bike,20,2.0
2,9.52,Foggy,Low,Night,Scooter,28,1.0
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0
4,19.03,Clear,Low,Morning,Bike,16,5.0
...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0
996,16.28,Rainy,Low,Morning,Scooter,8,9.0
997,15.62,Snowy,High,Evening,Scooter,26,2.0
998,14.17,Clear,Low,Afternoon,Bike,8,0.0


# Handling missing values

In [18]:
data1 = data.copy()

In [19]:
data.isna().sum()

Distance_km                0
Weather                   30
Traffic_Level             30
Time_of_Day               30
Vehicle_Type               0
Preparation_Time_min       0
Courier_Experience_yrs    30
dtype: int64

In [20]:
data['Weather'].value_counts()

Weather
Clear    470
Rainy    204
Foggy    103
Snowy     97
Windy     96
Name: count, dtype: int64

In [21]:
data1

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0
1,16.42,Clear,Medium,Evening,Bike,20,2.0
2,9.52,Foggy,Low,Night,Scooter,28,1.0
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0
4,19.03,Clear,Low,Morning,Bike,16,5.0
...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0
996,16.28,Rainy,Low,Morning,Scooter,8,9.0
997,15.62,Snowy,High,Evening,Scooter,26,2.0
998,14.17,Clear,Low,Afternoon,Bike,8,0.0


In [22]:
data1['Weather'].fillna(value = 'Clear', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['Weather'].fillna(value = 'Clear', inplace = True)


In [23]:
data1.isna().sum()

Distance_km                0
Weather                    0
Traffic_Level             30
Time_of_Day               30
Vehicle_Type               0
Preparation_Time_min       0
Courier_Experience_yrs    30
dtype: int64

In [24]:
print(data1['Traffic_Level'].mode()[0])
print(data1['Time_of_Day'].mode()[0])

Medium
Morning


In [25]:
data1['Traffic_Level'].fillna(value = 'Medium', inplace = True)
data1['Time_of_Day'].fillna(value = 'Morning', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['Traffic_Level'].fillna(value = 'Medium', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['Time_of_Day'].fillna(value = 'Morning', inplace = True)


In [26]:
data1.isna().sum()

Distance_km                0
Weather                    0
Traffic_Level              0
Time_of_Day                0
Vehicle_Type               0
Preparation_Time_min       0
Courier_Experience_yrs    30
dtype: int64

# Handling Courier_Experience_yrs missing with KNNImputer

KNNimputer is a scikit-learn class used to fill out or predict the missing values in a dataset. It is a more useful method that works on the basic approach of the KNN algorithm rather than the naive approach of filling all the values with the mean or the median. In this approach, we specify a distance from the missing values which is also known as the K parameter. The missing value will be predicted about the mean of the neighbors.

How Does KNNImputer Work?
The KNNImputer works by finding the k-nearest neighbors (based on a specified distance metric) for the data points with missing values. It then imputes the missing values using the mean or median (depending on the specified strategy) of the neighboring data points. The key advantage of this approach is that it preserves the relationships between features, which can lead to better model performance.

For example, consider a dataset with a missing value in a column representing a student’s math score. Instead of simply filling this missing value with the overall mean or median of the math scores, KNNImputer finds the k-nearest students (based on other features like scores in physics, chemistry, etc.) and imputes the missing value using the mean or median of these neighbors' math scores.

In [27]:
from sklearn.impute import KNNImputer

In [28]:
imputer = KNNImputer()
data_cleared = imputer.fit_transform(data1)

ValueError: could not convert string to float: 'Windy'

Because of the dataset contains object values so we can not calculate the ditance 

In [None]:
# scaling
# feature engineering (selection, extraction, ...)
# encoding (Weather, Traffic_Level, Time_of_Day, Vehicle_Type)

In [35]:
data1

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0
1,16.42,Clear,Medium,Evening,Bike,20,2.0
2,9.52,Foggy,Low,Night,Scooter,28,1.0
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0
4,19.03,Clear,Low,Morning,Bike,16,5.0
...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0
996,16.28,Rainy,Low,Morning,Scooter,8,9.0
997,15.62,Snowy,High,Evening,Scooter,26,2.0
998,14.17,Clear,Low,Afternoon,Bike,8,0.0


In [None]:
data1['Courier_Experience_yrs'].value_counts()

Courier_Experience_yrs
6.0    109
9.0    108
1.0    107
8.0    101
2.0     99
4.0     94
7.0     91
0.0     91
5.0     90
3.0     80
Name: count, dtype: int64

In [37]:
med = data1['Courier_Experience_yrs'].median()

In [38]:
data1['Courier_Experience_yrs'].fillna(value = med, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['Courier_Experience_yrs'].fillna(value = med, inplace = True)


In [39]:
data1

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0
1,16.42,Clear,Medium,Evening,Bike,20,2.0
2,9.52,Foggy,Low,Night,Scooter,28,1.0
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0
4,19.03,Clear,Low,Morning,Bike,16,5.0
...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0
996,16.28,Rainy,Low,Morning,Scooter,8,9.0
997,15.62,Snowy,High,Evening,Scooter,26,2.0
998,14.17,Clear,Low,Afternoon,Bike,8,0.0


In [None]:
data1.isna().sum()

Distance_km               0
Weather                   0
Traffic_Level             0
Time_of_Day               0
Vehicle_Type              0
Preparation_Time_min      0
Courier_Experience_yrs    0
dtype: int64

# Encoding

For (Traffic_Level, Time_of_Day) we use Ordinal encoding, (Weather, Vehicle_Type) we use Label encoding
One-hot-encoding 

In [67]:
from sklearn.preprocessing import OrdinalEncoder
ord = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
encoder_traffic = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
data1['Traffic_encoded'] = encoder_traffic.fit_transform(data1[['Traffic_Level']])

KeyError: "None of [Index(['Traffic_Level'], dtype='object')] are in the [columns]"

In [68]:
joblib.dump(ord, 'ord_final_project.pkl')

['ord_final_project.pkl']

In [69]:
data1

Unnamed: 0,Distance_km,Weather,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos
0,7.93,Windy,Scooter,12,1.0,0.0,1.000000e+00,6.123234e-17
1,16.42,Clear,Bike,20,2.0,1.0,1.224647e-16,-1.000000e+00
2,9.52,Foggy,Scooter,28,1.0,0.0,-1.000000e+00,-1.836970e-16
3,7.44,Rainy,Scooter,5,1.0,1.0,1.000000e+00,6.123234e-17
4,19.03,Clear,Bike,16,5.0,0.0,0.000000e+00,1.000000e+00
...,...,...,...,...,...,...,...,...
995,8.50,Clear,Car,13,3.0,2.0,1.224647e-16,-1.000000e+00
996,16.28,Rainy,Scooter,8,9.0,0.0,0.000000e+00,1.000000e+00
997,15.62,Snowy,Scooter,26,2.0,2.0,1.224647e-16,-1.000000e+00
998,14.17,Clear,Bike,8,0.0,0.0,1.000000e+00,6.123234e-17


In [42]:
data1['Time_of_Day'].value_counts()

Time_of_Day
Morning      338
Evening      293
Afternoon    284
Night         85
Name: count, dtype: int64

In [43]:
time_mapping = {
    'Morning': 0,
    'Afternoon': 1,
    'Evening': 2,
    'Night': 3
}
data1['Time_numeric'] = data1['Time_of_Day'].map(time_mapping)


In [44]:
# Total number of categories
max_val = 4

# Apply sine and cosine transformations
data1['Time_sin'] = np.sin(2 * np.pi * data1['Time_numeric'] / max_val)
data1['Time_cos'] = np.cos(2 * np.pi * data1['Time_numeric'] / max_val)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class TimeOfDayEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.time_mapping = {
            'Morning': 0,
            'Afternoon': 1,
            'Evening': 2,
            'Night': 3
        }
        self.max_val = 4

    def fit(self, X, y=None):
        return self  # No fitting needed

    def transform(self, X):
        X = X.copy()
        X['Time_numeric'] = X['Time_of_Day'].map(self.time_mapping)
        X['Time_sin'] = np.sin(2 * np.pi * X['Time_numeric'] / self.max_val)
        X['Time_cos'] = np.cos(2 * np.pi * X['Time_numeric'] / self.max_val)
        return X

In [71]:
encoder_time = TimeOfDayEncoder()
joblib.dump(encoder_time, 'time_encoder.pkl')


['time_encoder.pkl']

In [45]:
data1.drop(['Time_of_Day', 'Time_numeric', 'Traffic_Level'], axis=1, inplace=True)

In [46]:
data1

Unnamed: 0,Distance_km,Weather,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos
0,7.93,Windy,Scooter,12,1.0,0.0,1.000000e+00,6.123234e-17
1,16.42,Clear,Bike,20,2.0,1.0,1.224647e-16,-1.000000e+00
2,9.52,Foggy,Scooter,28,1.0,0.0,-1.000000e+00,-1.836970e-16
3,7.44,Rainy,Scooter,5,1.0,1.0,1.000000e+00,6.123234e-17
4,19.03,Clear,Bike,16,5.0,0.0,0.000000e+00,1.000000e+00
...,...,...,...,...,...,...,...,...
995,8.50,Clear,Car,13,3.0,2.0,1.224647e-16,-1.000000e+00
996,16.28,Rainy,Scooter,8,9.0,0.0,0.000000e+00,1.000000e+00
997,15.62,Snowy,Scooter,26,2.0,2.0,1.224647e-16,-1.000000e+00
998,14.17,Clear,Bike,8,0.0,0.0,1.000000e+00,6.123234e-17


In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data1['Weather_encoded'] = le.fit_transform(data['Weather'])

In [None]:
data1['Vehicle_Type_encoded'] = le.fit_transform(data['Vehicle_Type'])

In [34]:
data1

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0
1,16.42,Clear,Medium,Evening,Bike,20,2.0
2,9.52,Foggy,Low,Night,Scooter,28,1.0
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0
4,19.03,Clear,Low,Morning,Bike,16,5.0
...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0
996,16.28,Rainy,Low,Morning,Scooter,8,9.0
997,15.62,Snowy,High,Evening,Scooter,26,2.0
998,14.17,Clear,Low,Afternoon,Bike,8,0.0


In [None]:
data1.drop(['Weather', "Vehicle_Type"], inplace = True, axis = 1)

In [33]:
data1

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0
1,16.42,Clear,Medium,Evening,Bike,20,2.0
2,9.52,Foggy,Low,Night,Scooter,28,1.0
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0
4,19.03,Clear,Low,Morning,Bike,16,5.0
...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0
996,16.28,Rainy,Low,Morning,Scooter,8,9.0
997,15.62,Snowy,High,Evening,Scooter,26,2.0
998,14.17,Clear,Low,Afternoon,Bike,8,0.0


In [None]:
data_target

0      43
1      84
2      59
3      37
4      68
       ..
995    54
996    71
997    81
998    55
999    58
Name: Delivery_Time_min, Length: 1000, dtype: int64

# Scaling

Using StandardScaler 

In [None]:
col = data1.columns

In [None]:
col

Index(['Distance_km', 'Preparation_Time_min', 'Courier_Experience_yrs',
       'Traffic_encoded', 'Time_sin', 'Time_cos', 'Weather_encoded',
       'Vehicle_Type_encoded'],
      dtype='object')

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data1)
scaled_data = pd.DataFrame(scaled_data, columns =col)

In [None]:
scaled_data

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos,Weather_encoded,Vehicle_Type_encoded
0,-0.374085,-0.691853,-1.251672,-1.102212,1.395633,-0.056741,1.759229,1.373492
1,1.117008,0.419111,-0.903211,0.251857,-0.346730,-1.317649,-0.882256,-0.913755
2,-0.094835,1.530076,-1.251672,-1.102212,-2.089094,-0.056741,-0.221885,1.373492
3,-0.460144,-1.663947,-1.251672,0.251857,1.395633,-0.056741,0.438486,1.373492
4,1.575401,-0.136371,0.142172,-1.102212,-0.346730,1.204167,-0.882256,-0.913755
...,...,...,...,...,...,...,...,...
995,-0.273977,-0.552983,-0.554750,1.605926,-0.346730,-1.317649,-0.882256,0.229868
996,1.092420,-1.247335,1.536016,-1.102212,-0.346730,1.204167,0.438486,1.373492
997,0.976505,1.252335,-0.903211,1.605926,-0.346730,-1.317649,1.098858,1.373492
998,0.721842,-1.247335,-1.600132,-1.102212,1.395633,-0.056741,-0.882256,-0.913755


In [75]:
data

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0
1,16.42,Clear,Medium,Evening,Bike,20,2.0
2,9.52,Foggy,Low,Night,Scooter,28,1.0
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0
4,19.03,Clear,Low,Morning,Bike,16,5.0
...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0
996,16.28,Rainy,Low,Morning,Scooter,8,9.0
997,15.62,Snowy,High,Evening,Scooter,26,2.0
998,14.17,Clear,Low,Afternoon,Bike,8,0.0


In [47]:
df = data1[["Distance_km", "Preparation_Time_min", "Courier_Experience_yrs", "Traffic_encoded", "Time_sin", "Time_cos"]]

In [48]:
df

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos
0,7.93,12,1.0,0.0,1.000000e+00,6.123234e-17
1,16.42,20,2.0,1.0,1.224647e-16,-1.000000e+00
2,9.52,28,1.0,0.0,-1.000000e+00,-1.836970e-16
3,7.44,5,1.0,1.0,1.000000e+00,6.123234e-17
4,19.03,16,5.0,0.0,0.000000e+00,1.000000e+00
...,...,...,...,...,...,...
995,8.50,13,3.0,2.0,1.224647e-16,-1.000000e+00
996,16.28,8,9.0,0.0,0.000000e+00,1.000000e+00
997,15.62,26,2.0,2.0,1.224647e-16,-1.000000e+00
998,14.17,8,0.0,0.0,1.000000e+00,6.123234e-17


In [62]:
col1 = df.columns

In [63]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns =col1)

In [64]:
joblib.dump(scaler, 'scaler_final_project.pkl')

['scaler_final_project.pkl']

In [None]:
scaled_df

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos
0,-0.374085,-0.691853,-1.251672,-1.102212,1.395633,-0.056741
1,1.117008,0.419111,-0.903211,0.251857,-0.346730,-1.317649
2,-0.094835,1.530076,-1.251672,-1.102212,-2.089094,-0.056741
3,-0.460144,-1.663947,-1.251672,0.251857,1.395633,-0.056741
4,1.575401,-0.136371,0.142172,-1.102212,-0.346730,1.204167
...,...,...,...,...,...,...
995,-0.273977,-0.552983,-0.554750,1.605926,-0.346730,-1.317649
996,1.092420,-1.247335,1.536016,-1.102212,-0.346730,1.204167
997,0.976505,1.252335,-0.903211,1.605926,-0.346730,-1.317649
998,0.721842,-1.247335,-1.600132,-1.102212,1.395633,-0.056741


In [51]:
df_wv = data[["Weather", "Vehicle_Type"]]

In [52]:
df_wv

Unnamed: 0,Weather,Vehicle_Type
0,Windy,Scooter
1,Clear,Bike
2,Foggy,Scooter
3,Rainy,Scooter
4,Clear,Bike
...,...,...
995,Clear,Car
996,Rainy,Scooter
997,Snowy,Scooter
998,Clear,Bike


In [53]:
df_wv['Weather'].fillna(value = 'Clear', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_wv['Weather'].fillna(value = 'Clear', inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wv['Weather'].fillna(value = 'Clear', inplace = True)


In [54]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False).set_output(transform = 'pandas')
df_ = ohe.fit_transform(df_wv[["Weather", "Vehicle_Type"]])

In [59]:
import joblib
joblib.dump(ohe, 'ohe_final_project.pkl')

['ohe_final_project.pkl']

In [55]:
df_

Unnamed: 0,Weather_Clear,Weather_Foggy,Weather_Rainy,Weather_Snowy,Weather_Windy,Vehicle_Type_Bike,Vehicle_Type_Car,Vehicle_Type_Scooter
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
995,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
997,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
998,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
df_final = pd.concat([scaled_df, df_], axis = 1)

In [None]:
df_final

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos,Weather_Clear,Weather_Foggy,Weather_Rainy,Weather_Snowy,Weather_Windy,Vehicle_Type_Bike,Vehicle_Type_Car,Vehicle_Type_Scooter
0,-0.374085,-0.691853,-1.251672,-1.102212,1.395633,-0.056741,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.117008,0.419111,-0.903211,0.251857,-0.346730,-1.317649,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.094835,1.530076,-1.251672,-1.102212,-2.089094,-0.056741,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.460144,-1.663947,-1.251672,0.251857,1.395633,-0.056741,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.575401,-0.136371,0.142172,-1.102212,-0.346730,1.204167,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.273977,-0.552983,-0.554750,1.605926,-0.346730,-1.317649,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
996,1.092420,-1.247335,1.536016,-1.102212,-0.346730,1.204167,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
997,0.976505,1.252335,-0.903211,1.605926,-0.346730,-1.317649,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
998,0.721842,-1.247335,-1.600132,-1.102212,1.395633,-0.056741,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Feature Engineering

->Interaction Features

Effective_Speed = Distance_km / (Preparation_Time_min + Delivery_Time_estimate)

Traffic × Distance → deliveries in traffic are worse if distance is long.

Weather × Vehicle_Type → e.g., rain affects bikes more than cars.

->Polynomial Features (if using linear models)

Distance^2, PrepTime^2, etc. to capture non-linear effects.

->Domain-Specific Features

Is_Long_Distance = Distance_km > 10

Is_Peak_Hour from Time_sin/Time_cos.

Experience_Level = (Courier_Experience_yrs < 1 ? "newbie" : "experienced").

->Ratios

PrepTime / Distance = efficiency of restaurant prep relative to trip length.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# ----------------------------
# Feature Engineering Class
# ----------------------------
class FeatureEngineerLean(BaseEstimator, TransformerMixin):
    def __init__(self, distance_col="Distance_km", prep_col="Preparation_Time_min",
                 exp_col="Courier_Experience_yrs", traffic_col="Traffic_encoded",
                 time_sin="Time_sin", time_cos="Time_cos",
                 weather_cols=None, vehicle_cols=None):
        self.distance_col = distance_col
        self.prep_col = prep_col
        self.exp_col = exp_col
        self.traffic_col = traffic_col
        self.time_sin = time_sin
        self.time_cos = time_cos
        self.weather_cols = weather_cols or [
            "Weather_Clear", "Weather_Foggy", "Weather_Rainy", "Weather_Snowy", "Weather_Windy"
        ]
        self.vehicle_cols = vehicle_cols or [
            "Vehicle_Type_Bike", "Vehicle_Type_Car", "Vehicle_Type_Scooter"
        ]

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()

        # Effective Speed
        X["Effective_Speed"] = X[self.distance_col] / (X[self.prep_col] + 1e-6)

        # Traffic × Distance
        X["Traffic_Distance"] = X[self.distance_col] * X[self.traffic_col]

        # Domain-specific
        X["Is_Long_Distance"] = (X[self.distance_col] > 10).astype(int)
        X["Is_Peak_Hour"] = ((X[self.time_sin] > 0.7) | (X[self.time_cos] < -0.7)).astype(int)

        # Experience buckets
        X["Is_Newbie"] = (X[self.exp_col] < 1).astype(int)
        X["Is_Experienced"] = (X[self.exp_col] >= 1).astype(int)

        # Ratios
        X["PrepTime_per_Distance"] = X[self.prep_col] / (X[self.distance_col] + 1e-6)

        # Compact Weather × Vehicle interactions
        bad_weather = X["Weather_Rainy"] + X["Weather_Snowy"] + X["Weather_Windy"]
        fog_snow = X["Weather_Foggy"] + X["Weather_Snowy"]
        X["Bike_in_BadWeather"] = X["Vehicle_Type_Bike"] * bad_weather
        X["Car_in_BadWeather"] = X["Vehicle_Type_Car"] * fog_snow

        return X

In [None]:
 # Build pipeline
feature_pipeline = Pipeline(steps=[
        ("feature_engineering", FeatureEngineerLean())])

# Transform dataset
transformed = feature_pipeline.fit_transform(df_final)

# Show results
print("Shape before:", df_final.shape)
print("Shape after:", transformed.shape)
print(transformed.head())

Shape before: (1000, 14)
Shape after: (1000, 23)
   Distance_km  Preparation_Time_min  Courier_Experience_yrs  Traffic_encoded  \
0    -0.374085             -0.691853               -1.251672        -1.102212   
1     1.117008              0.419111               -0.903211         0.251857   
2    -0.094835              1.530076               -1.251672        -1.102212   
3    -0.460144             -1.663947               -1.251672         0.251857   
4     1.575401             -0.136371                0.142172        -1.102212   

   Time_sin  Time_cos  Weather_Clear  Weather_Foggy  Weather_Rainy  \
0  1.395633 -0.056741            0.0            0.0            0.0   
1 -0.346730 -1.317649            1.0            0.0            0.0   
2 -2.089094 -0.056741            0.0            1.0            0.0   
3  1.395633 -0.056741            0.0            0.0            1.0   
4 -0.346730  1.204167            1.0            0.0            0.0   

   Weather_Snowy  ...  Vehicle_Type_Scooter

In [None]:
df_final

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos,Weather_Clear,Weather_Foggy,Weather_Rainy,Weather_Snowy,Weather_Windy,Vehicle_Type_Bike,Vehicle_Type_Car,Vehicle_Type_Scooter
0,-0.374085,-0.691853,-1.251672,-1.102212,1.395633,-0.056741,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.117008,0.419111,-0.903211,0.251857,-0.346730,-1.317649,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.094835,1.530076,-1.251672,-1.102212,-2.089094,-0.056741,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.460144,-1.663947,-1.251672,0.251857,1.395633,-0.056741,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.575401,-0.136371,0.142172,-1.102212,-0.346730,1.204167,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.273977,-0.552983,-0.554750,1.605926,-0.346730,-1.317649,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
996,1.092420,-1.247335,1.536016,-1.102212,-0.346730,1.204167,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
997,0.976505,1.252335,-0.903211,1.605926,-0.346730,-1.317649,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
998,0.721842,-1.247335,-1.600132,-1.102212,1.395633,-0.056741,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
transformed

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos,Weather_Clear,Weather_Foggy,Weather_Rainy,Weather_Snowy,...,Vehicle_Type_Scooter,Effective_Speed,Traffic_Distance,Is_Long_Distance,Is_Peak_Hour,Is_Newbie,Is_Experienced,PrepTime_per_Distance,Bike_in_BadWeather,Car_in_BadWeather
0,-0.374085,-0.691853,-1.251672,-1.102212,1.395633,-0.056741,0.0,0.0,0.0,0.0,...,1.0,0.540701,0.412322,0,1,1,0,1.849457,0.0,0.0
1,1.117008,0.419111,-0.903211,0.251857,-0.346730,-1.317649,1.0,0.0,0.0,0.0,...,0.0,2.665176,0.281326,0,1,1,0,0.375208,0.0,0.0
2,-0.094835,1.530076,-1.251672,-1.102212,-2.089094,-0.056741,0.0,1.0,0.0,0.0,...,1.0,-0.061980,0.104528,0,0,1,0,-16.134319,0.0,0.0
3,-0.460144,-1.663947,-1.251672,0.251857,1.395633,-0.056741,0.0,0.0,1.0,0.0,...,1.0,0.276538,-0.115890,0,1,1,0,3.616154,0.0,0.0
4,1.575401,-0.136371,0.142172,-1.102212,-0.346730,1.204167,1.0,0.0,0.0,0.0,...,0.0,-11.552412,-1.736427,0,0,1,0,-0.086563,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.273977,-0.552983,-0.554750,1.605926,-0.346730,-1.317649,1.0,0.0,0.0,0.0,...,0.0,0.495453,-0.439986,0,1,1,0,2.018364,0.0,0.0
996,1.092420,-1.247335,1.536016,-1.102212,-0.346730,1.204167,0.0,0.0,1.0,0.0,...,1.0,-0.875804,-1.204079,0,0,0,1,-1.141808,0.0,0.0
997,0.976505,1.252335,-0.903211,1.605926,-0.346730,-1.317649,0.0,0.0,0.0,1.0,...,1.0,0.779747,1.568195,0,1,1,0,1.282465,0.0,0.0
998,0.721842,-1.247335,-1.600132,-1.102212,1.395633,-0.056741,1.0,0.0,0.0,0.0,...,0.0,-0.578708,-0.795623,0,1,1,0,-1.727987,0.0,0.0


In [None]:
transformed.drop(['Is_Long_Distance', 'Is_Peak_Hour', 'Is_Newbie', 'Is_Experienced', 'Bike_in_BadWeather', 'Car_in_BadWeather'], axis = 1, inplace = True)

In [None]:
transformed

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos,Weather_Clear,Weather_Foggy,Weather_Rainy,Weather_Snowy,Weather_Windy,Vehicle_Type_Bike,Vehicle_Type_Car,Vehicle_Type_Scooter,Effective_Speed,Traffic_Distance,PrepTime_per_Distance
0,-0.374085,-0.691853,-1.251672,-1.102212,1.395633,-0.056741,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.540701,0.412322,1.849457
1,1.117008,0.419111,-0.903211,0.251857,-0.346730,-1.317649,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.665176,0.281326,0.375208
2,-0.094835,1.530076,-1.251672,-1.102212,-2.089094,-0.056741,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.061980,0.104528,-16.134319
3,-0.460144,-1.663947,-1.251672,0.251857,1.395633,-0.056741,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.276538,-0.115890,3.616154
4,1.575401,-0.136371,0.142172,-1.102212,-0.346730,1.204167,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.552412,-1.736427,-0.086563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.273977,-0.552983,-0.554750,1.605926,-0.346730,-1.317649,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.495453,-0.439986,2.018364
996,1.092420,-1.247335,1.536016,-1.102212,-0.346730,1.204167,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.875804,-1.204079,-1.141808
997,0.976505,1.252335,-0.903211,1.605926,-0.346730,-1.317649,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.779747,1.568195,1.282465
998,0.721842,-1.247335,-1.600132,-1.102212,1.395633,-0.056741,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.578708,-0.795623,-1.727987


# Training

In [72]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
X = transformed
y = data_target

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
regr = LinearRegression()

regr.fit(X_train, y_train)
y_val_pred = regr.predict(X_val)
y_test_pred = regr.predict(X_test)

NameError: name 'transformed' is not defined

In [None]:
# Validation metrics
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = val_mse ** 0.5
val_r2 = r2_score(y_val, y_val_pred)

# Test metrics
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = test_mse ** 0.5
test_r2 = r2_score(y_test, y_test_pred)

print(f"Validation RMSE: {val_rmse:.2f}, R²: {val_r2:.2f}")
print(f"Test RMSE: {test_rmse:.2f}, R²: {test_r2:.2f}")


Validation RMSE: 10.25, R²: 0.82
Test RMSE: 7.91, R²: 0.85


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Train the model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict
y_val_dt = dt_model.predict(X_val)
y_test_dt = dt_model.predict(X_test)

# Evaluate
val_rmse_dt = mean_squared_error(y_val, y_val_dt)
val_r2_dt = r2_score(y_val, y_val_dt)

test_rmse_dt = mean_squared_error(y_test, y_test_dt)
test_r2_dt = r2_score(y_test, y_test_dt)

print(f"Decision Tree - Validation RMSE: {val_rmse_dt:.2f}, R²: {val_r2_dt:.2f}")
print(f"Decision Tree - Test RMSE: {test_rmse_dt:.2f}, R²: {test_r2_dt:.2f}")

Decision Tree - Validation RMSE: 312.05, R²: 0.47
Decision Tree - Test RMSE: 155.97, R²: 0.62


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict
y_val_rf = rf_model.predict(X_val)
y_test_rf = rf_model.predict(X_test)

# Evaluate
val_rmse_rf = mean_squared_error(y_val, y_val_rf)
val_r2_rf = r2_score(y_val, y_val_rf)

test_rmse_rf = mean_squared_error(y_test, y_test_rf)
test_r2_rf = r2_score(y_test, y_test_rf)

print(f"Random Forest - Validation RMSE: {val_rmse_rf:.2f}, R²: {val_r2_rf:.2f}")
print(f"Random Forest - Test RMSE: {test_rmse_rf:.2f}, R²: {test_r2_rf:.2f}")

Random Forest - Validation RMSE: 129.84, R²: 0.78
Random Forest - Test RMSE: 74.03, R²: 0.82


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Train the model
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict
y_val_xgb = xgb_model.predict(X_val)
y_test_xgb = xgb_model.predict(X_test)

# Evaluate
val_rmse_xgb = mean_squared_error(y_val, y_val_xgb)
val_r2_xgb = r2_score(y_val, y_val_xgb)

test_rmse_xgb = mean_squared_error(y_test, y_test_xgb)
test_r2_xgb = r2_score(y_test, y_test_xgb)

print(f"XGBoost - Validation RMSE: {val_rmse_xgb:.2f}, R²: {val_r2_xgb:.2f}")
print(f"XGBoost - Test RMSE: {test_rmse_xgb:.2f}, R²: {test_r2_xgb:.2f}")

XGBoost - Validation RMSE: 132.78, R²: 0.77
XGBoost - Test RMSE: 84.74, R²: 0.80


In [None]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 1.5/1.5 MB 6.9 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import lightgbm as lgb

# Train the model
lgb_model = lgb.LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
lgb_model.fit(X_train, y_train)

# Predict
y_val_lgb = lgb_model.predict(X_val)
y_test_lgb = lgb_model.predict(X_test)

# Evaluate
val_rmse_lgb = mean_squared_error(y_val, y_val_lgb)
val_r2_lgb = r2_score(y_val, y_val_lgb)

test_rmse_lgb = mean_squared_error(y_test, y_test_lgb)
test_r2_lgb = r2_score(y_test, y_test_lgb)

print(f"LightGBM - Validation RMSE: {val_rmse_lgb:.2f}, R²: {val_r2_lgb:.2f}")
print(f"LightGBM - Test RMSE: {test_rmse_lgb:.2f}, R²: {test_r2_lgb:.2f}")

AttributeError: module 'pandas.core.strings' has no attribute 'StringMethods'

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, r2_score

# Define the model
mlp_model = Sequential([
    Dense(256, activation='relu', input_shape=(17,)),
   
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
   
    Dense(1)  # Output layer for regression
])

# Compile the model
mlp_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train the model
history = mlp_model.fit(X_train, y_train, epochs=200, batch_size=32,
                        validation_data=(X_val, y_val), verbose=0)

# Predict and evaluate
y_test_mlp = mlp_model.predict(X_test).flatten()
test_rmse = mean_squared_error(y_test, y_test_mlp)
test_r2 = r2_score(y_test, y_test_mlp)

print(f"MLP - Test RMSE: {test_rmse:.2f}, R²: {test_r2:.2f}")

MLP - Test RMSE: 125.03, R²: 0.70


i want to use tree based model without scaling

In [None]:
df_tree = pd.concat([df, df_], axis = 1)

In [74]:
df_tree

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos,Weather_Clear,Weather_Foggy,Weather_Rainy,Weather_Snowy,Weather_Windy,Vehicle_Type_Bike,Vehicle_Type_Car,Vehicle_Type_Scooter
0,7.93,12,1.0,0.0,1.000000e+00,6.123234e-17,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,16.42,20,2.0,1.0,1.224647e-16,-1.000000e+00,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,9.52,28,1.0,0.0,-1.000000e+00,-1.836970e-16,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,7.44,5,1.0,1.0,1.000000e+00,6.123234e-17,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,19.03,16,5.0,0.0,0.000000e+00,1.000000e+00,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,8.50,13,3.0,2.0,1.224647e-16,-1.000000e+00,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
996,16.28,8,9.0,0.0,0.000000e+00,1.000000e+00,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
997,15.62,26,2.0,2.0,1.224647e-16,-1.000000e+00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
998,14.17,8,0.0,0.0,1.000000e+00,6.123234e-17,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [73]:
data_target

0      43
1      84
2      59
3      37
4      68
       ..
995    54
996    71
997    81
998    55
999    58
Name: Delivery_Time_min, Length: 1000, dtype: int64

EDA and Data Engineering for df_tree

In [76]:
data

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,7.93,Windy,Low,Afternoon,Scooter,12,1.0
1,16.42,Clear,Medium,Evening,Bike,20,2.0
2,9.52,Foggy,Low,Night,Scooter,28,1.0
3,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0
4,19.03,Clear,Low,Morning,Bike,16,5.0
...,...,...,...,...,...,...,...
995,8.50,Clear,High,Evening,Car,13,3.0
996,16.28,Rainy,Low,Morning,Scooter,8,9.0
997,15.62,Snowy,High,Evening,Scooter,26,2.0
998,14.17,Clear,Low,Afternoon,Bike,8,0.0


In [77]:
X = pd.concat([
    df_tree[["Distance_km", "Preparation_Time_min",	"Courier_Experience_yrs", "Traffic_encoded", "Time_sin", "Time_cos"]], data[["Weather", "Vehicle_Type"]]], 
    axis=1
)

In [78]:
X

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos,Weather,Vehicle_Type
0,7.93,12,1.0,0.0,1.000000e+00,6.123234e-17,Windy,Scooter
1,16.42,20,2.0,1.0,1.224647e-16,-1.000000e+00,Clear,Bike
2,9.52,28,1.0,0.0,-1.000000e+00,-1.836970e-16,Foggy,Scooter
3,7.44,5,1.0,1.0,1.000000e+00,6.123234e-17,Rainy,Scooter
4,19.03,16,5.0,0.0,0.000000e+00,1.000000e+00,Clear,Bike
...,...,...,...,...,...,...,...,...
995,8.50,13,3.0,2.0,1.224647e-16,-1.000000e+00,Clear,Car
996,16.28,8,9.0,0.0,0.000000e+00,1.000000e+00,Rainy,Scooter
997,15.62,26,2.0,2.0,1.224647e-16,-1.000000e+00,Snowy,Scooter
998,14.17,8,0.0,0.0,1.000000e+00,6.123234e-17,Clear,Bike


In [81]:
from sklearn.preprocessing import LabelEncoder

le_weather = LabelEncoder()
le_vehicle = LabelEncoder()

X["Weather"] = le_weather.fit_transform(X["Weather"])
X["Vehicle_Type"] = le_vehicle.fit_transform(X["Vehicle_Type"])


In [None]:

joblib.dump(le_vehicle, 'le_vehicle_final_project.pkl')

['le_vehicle_final_project.pkl']

In [84]:
joblib.dump(le_weather, 'le_weather_final_project.pkl')

['le_weather_final_project.pkl']

In [82]:
X

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Traffic_encoded,Time_sin,Time_cos,Weather,Vehicle_Type
0,7.93,12,1.0,0.0,1.000000e+00,6.123234e-17,4,2
1,16.42,20,2.0,1.0,1.224647e-16,-1.000000e+00,0,0
2,9.52,28,1.0,0.0,-1.000000e+00,-1.836970e-16,1,2
3,7.44,5,1.0,1.0,1.000000e+00,6.123234e-17,2,2
4,19.03,16,5.0,0.0,0.000000e+00,1.000000e+00,0,0
...,...,...,...,...,...,...,...,...
995,8.50,13,3.0,2.0,1.224647e-16,-1.000000e+00,0,1
996,16.28,8,9.0,0.0,0.000000e+00,1.000000e+00,2,2
997,15.62,26,2.0,2.0,1.224647e-16,-1.000000e+00,3,2
998,14.17,8,0.0,0.0,1.000000e+00,6.123234e-17,0,0


In [90]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor


def train_all_tree_models(df, data_target, save_path_prefix="best_model"):
    """
    Train Decision Tree, Random Forest, and XGBoost regressors with hyperparameter tuning.
    Selects the best model based on RMSE and saves it with joblib.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with features and target.
    target_col : str
        Name of the target column.
    save_path_prefix : str
        Prefix for saving models (e.g., "best_model").
    
    Returns:
    --------
    best_model : trained model with lowest RMSE
    results_summary : dict containing RMSE, R2, best_params for each model
    """
    
    X = df
    y = data_target
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    models = {
        "decision_tree": {
            "estimator": DecisionTreeRegressor(random_state=42),
            "param_grid": {
                "max_depth": [None, 5, 10, 20],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4]
            }
        },
        "random_forest": {
            "estimator": RandomForestRegressor(random_state=42),
            "param_grid": {
                "n_estimators": [100, 200, 300],
                "max_depth": [None, 10, 20, 30],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "max_features": ["auto", "sqrt", "log2"]
            }
        },
        "xgboost": {
            "estimator": XGBRegressor(random_state=42, tree_method="hist"),
            "param_grid": {
                "n_estimators": [200, 400],
                "max_depth": [4, 6, 8],
                "learning_rate": [0.05, 0.1],
                "subsample": [0.8, 1.0],
                "colsample_bytree": [0.8, 1.0]
            }
        }
    }
    
    results_summary = {}
    best_rmse = float("inf")
    best_model = None
    best_model_name = None
    
    for name, info in models.items():
        print(f"\nTraining {name}...")
        grid_search = GridSearchCV(
            estimator=info["estimator"],
            param_grid=info["param_grid"],
            cv=3,
            n_jobs=-1,
            scoring="neg_mean_squared_error",
            verbose=2
        )
        grid_search.fit(X_train, y_train)
        
        model_best = grid_search.best_estimator_
        y_pred = model_best.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        
        results_summary[name] = {
            "best_params": grid_search.best_params_,
            "RMSE": rmse,
            "R2": r2
        }
        
        print(f"{name} RMSE: {rmse:.4f}, R2: {r2:.4f}")
        
        # Save individual model
        joblib.dump(model_best, f"{save_path_prefix}_{name}.pkl")
        
        # Update best model
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model_best
            best_model_name = name
            
    print(f"\nBest Model: {best_model_name} with RMSE={best_rmse:.4f}")
    return best_model, results_summary


In [91]:
best_model, summary = train_all_tree_models(X, data_target, save_path_prefix="best_tree")
print(summary)


Training decision_tree...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
decision_tree RMSE: 11.9044, R2: 0.6838

Training random_forest...
Fitting 3 folds for each of 324 candidates, totalling 972 fits


324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
324 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_paramet

random_forest RMSE: 9.5579, R2: 0.7962

Training xgboost...
Fitting 3 folds for each of 48 candidates, totalling 144 fits
xgboost RMSE: 9.2248, R2: 0.8101

Best Model: xgboost with RMSE=9.2248
{'decision_tree': {'best_params': {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10}, 'RMSE': np.float64(11.90443113034535), 'R2': 0.6838309864391017}, 'random_forest': {'best_params': {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}, 'RMSE': np.float64(9.557947837995775), 'R2': 0.7961872623265298}, 'xgboost': {'best_params': {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.8}, 'RMSE': np.float64(9.224843225981964), 'R2': 0.8101458549499512}}
