In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [7]:
data = pd.read_csv('/content/Food_Delivery_Times.csv')

In [8]:
data.head()

Unnamed: 0,Order_ID,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,522,7.93,Windy,Low,Afternoon,Scooter,12,1.0,43
1,738,16.42,Clear,Medium,Evening,Bike,20,2.0,84
2,741,9.52,Foggy,Low,Night,Scooter,28,1.0,59
3,661,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0,37
4,412,19.03,Clear,Low,Morning,Bike,16,5.0,68


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Order_ID                1000 non-null   int64  
 1   Distance_km             1000 non-null   float64
 2   Weather                 970 non-null    object 
 3   Traffic_Level           970 non-null    object 
 4   Time_of_Day             970 non-null    object 
 5   Vehicle_Type            1000 non-null   object 
 6   Preparation_Time_min    1000 non-null   int64  
 7   Courier_Experience_yrs  970 non-null    float64
 8   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 70.4+ KB


In [10]:
data.drop('Order_ID',axis=1,inplace=True)

In [11]:
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0]

numerical_cols_with_missing = []
categorical_cols_with_missing = []

for col in missing_values.index:
    if data[col].dtype == 'object':
        categorical_cols_with_missing.append(col)
    else:
        numerical_cols_with_missing.append(col)

print("Numerical columns with missing values:", numerical_cols_with_missing)
print("Categorical columns with missing values:", categorical_cols_with_missing)

Numerical columns with missing values: ['Courier_Experience_yrs']
Categorical columns with missing values: ['Weather', 'Traffic_Level', 'Time_of_Day']


In [12]:
for col in categorical_cols_with_missing:
    data[col] = data[col].fillna(data[col].mode()[0])
print("Categorical columns imputed with mode:", categorical_cols_with_missing)

Categorical columns imputed with mode: ['Weather', 'Traffic_Level', 'Time_of_Day']


**Reasoning**:
With categorical columns already imputed, the next step is to impute the numerical columns. As per the task, numerical columns should be imputed using their median values, which is a robust measure against outliers.



In [13]:
for col in numerical_cols_with_missing:
    data[col] = data[col].fillna(data[col].median())
print("Numerical columns imputed with median:", numerical_cols_with_missing)

Numerical columns imputed with median: ['Courier_Experience_yrs']


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Distance_km             1000 non-null   float64
 1   Weather                 1000 non-null   object 
 2   Traffic_Level           1000 non-null   object 
 3   Time_of_Day             1000 non-null   object 
 4   Vehicle_Type            1000 non-null   object 
 5   Preparation_Time_min    1000 non-null   int64  
 6   Courier_Experience_yrs  1000 non-null   float64
 7   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 62.6+ KB


In [15]:
X = data.drop('Delivery_Time_min',axis=1)
y=data['Delivery_Time_min']

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
num_features = ['Distance_km','Preparation_Time_min','Courier_Experience_yrs']
cat_ord_features = ['Traffic_Level']
cat_nominal_features = ['Weather','Time_of_Day','Vehicle_Type']


preprocessor = ColumnTransformer(transformers=[
    ('num',StandardScaler(),num_features),
    ('ord',OrdinalEncoder(categories=[['Low','Medium','High']]),cat_ord_features),
    ('cat_nom',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),cat_nominal_features)
],remainder='passthrough')

In [18]:
from sklearn.tree import DecisionTreeRegressor
pipe = Pipeline(steps=[
    ('preprocess',preprocessor),
    ('model',DecisionTreeRegressor(random_state=42))
])

In [19]:
pipe.fit(X_train,y_train)

In [20]:
from sklearn.metrics import r2_score
y_pred = pipe.predict(X_test)
print(r2_score(y_test,y_pred))

0.4067179914818826


In [28]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()
pipe2 = Pipeline(steps=[
    ('preprocess',preprocessor),
    ('rf_model' ,RandomForestRegressor())
])

In [29]:
pipe2.fit(X_train,y_train)
y_pred_2 = pipe2.predict(X_test)
print(r2_score(y_test,y_pred_2))

0.7929526798336937


In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [24]:
param_dist = {
    'rf_model__n_estimators': randint(100, 500),
    'rf_model__max_features': ['sqrt', 'log2'], # Removed 'auto'
    'rf_model__max_depth': randint(10, 100),
    'rf_model__min_samples_split': randint(2, 20),
    'rf_model__min_samples_leaf': randint(1, 20)
}

In [None]:
random_search = RandomizedSearchCV(estimator=pipe2,
                                   param_distributions=param_dist,
                                   n_iter=100,  # Number of parameter settings that are sampled
                                   cv=5,       # 5-fold cross-validation
                                   verbose=2,  # Display progress
                                   random_state=42,
                                   n_jobs=-1)  # Use all available cores

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [31]:
print("Best parameters found:", random_search.best_params_)
print("Best R2 score found:", random_search.best_score_)

Best parameters found: {'rf_model__max_depth': 68, 'rf_model__max_features': 'sqrt', 'rf_model__min_samples_leaf': 2, 'rf_model__min_samples_split': 3, 'rf_model__n_estimators': 319}
Best R2 score found: 0.6951772614015951
