# Importing libraries

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder

import warnings
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): this also hides library deprecation notices - confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative file names used by later cells are resolved against it.
%cd /content/drive/MyDrive/mlproject

/content/drive/MyDrive/mlproject


# Load Data

In [None]:
# Read the dataset from the current working directory.
data = pd.read_csv("transformed_data.csv")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,rider_id,age,ratings,restaurant_latitude,restaurant_longitude,delivery_latitude,delivery_longitude,weather,traffic,vehicle_condition,...,time_taken,city_code,distance(km),order_day,order_month,order_day_of_week,is_weekend,pickup_time_minutes,order_hour,order_period
0,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,sunny,high,2,...,24,INDO,3.03,19,3,saturday,1,15.0,11.0,morning
1,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,stormy,jam,2,...,33,BANG,20.18,25,3,friday,0,5.0,19.0,evening
2,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,sandstorms,low,0,...,26,BANG,1.55,19,3,saturday,1,15.0,8.0,morning
3,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,sunny,medium,0,...,21,COIMB,7.79,5,4,tuesday,0,10.0,18.0,evening
4,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,cloudy,high,1,...,30,CHEN,6.21,26,3,saturday,1,15.0,13.0,afternoon


In [None]:
# Column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45502 entries, 0 to 45501
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   rider_id              45502 non-null  object 
 1   age                   43648 non-null  float64
 2   ratings               43594 non-null  float64
 3   restaurant_latitude   41872 non-null  float64
 4   restaurant_longitude  41872 non-null  float64
 5   delivery_latitude     41872 non-null  float64
 6   delivery_longitude    41872 non-null  float64
 7   weather               44977 non-null  object 
 8   traffic               44992 non-null  object 
 9   vehicle_condition     45502 non-null  int64  
 10  type_of_order         45502 non-null  object 
 11  type_of_vehicle       45502 non-null  object 
 12  multiple_deliveries   44509 non-null  float64
 13  festival              45274 non-null  object 
 14  city                  44304 non-null  object 
 15  time_taken         

# Data Understanding

In [None]:
# Columns removed by `preprocess_data` when the caller does not supply its own list.
DEFAULT_DROP_COLUMNS = [
    "rider_id",
    "restaurant_latitude",
    "restaurant_longitude",
    "delivery_latitude",
    "delivery_longitude",
    "city_code",
    "order_day",
    "order_month",
    "order_day_of_week",
    "order_hour",
]


def preprocess_data(data: pd.DataFrame, columns_to_drop=None) -> pd.DataFrame:
    """Return a copy of ``data`` without the columns that are not used downstream.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame. It is not modified; ``DataFrame.drop`` returns a new frame.
    columns_to_drop : iterable of str, optional
        Column labels to remove. When omitted, ``DEFAULT_DROP_COLUMNS`` is used,
        which reproduces the original hard-coded behaviour.

    Returns
    -------
    pd.DataFrame
        ``data`` with the requested columns removed.

    Raises
    ------
    KeyError
        If any requested column is absent from ``data`` (pandas' default
        ``errors="raise"`` behaviour is kept on purpose so that schema drift
        is noticed instead of silently ignored).
    """
    if columns_to_drop is None:
        columns_to_drop = DEFAULT_DROP_COLUMNS
    # list(...) so that tuples or other iterables are treated as several labels
    # rather than as one (tuple) label.
    return data.drop(columns=list(columns_to_drop))

In [None]:
# Preview the result of the column drop; the returned frame is not stored here.
preprocess_data(data).head()

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city,time_taken,distance(km),is_weekend,pickup_time_minutes,order_period
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,24,3.03,1,15.0,morning
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,33,20.18,0,5.0,evening
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,26,1.55,1,15.0,morning
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,21,7.79,0,10.0,evening
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,30,6.21,1,15.0,afternoon


In [None]:
# Apply the column drop to the loaded frame and keep the result.
preprocessed_data = preprocess_data(data)

In [None]:
# Write the reduced frame to the working directory, without the index column.
preprocessed_data.to_csv("preprocessed_data.csv",index=False)

In [None]:
# Column dtypes and non-null counts after the column drop.
preprocessed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45502 entries, 0 to 45501
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  43648 non-null  float64
 1   ratings              43594 non-null  float64
 2   weather              44977 non-null  object 
 3   traffic              44992 non-null  object 
 4   vehicle_condition    45502 non-null  int64  
 5   type_of_order        45502 non-null  object 
 6   type_of_vehicle      45502 non-null  object 
 7   multiple_deliveries  44509 non-null  float64
 8   festival             45274 non-null  object 
 9   city                 44304 non-null  object 
 10  time_taken           45502 non-null  int64  
 11  distance(km)         41872 non-null  float64
 12  is_weekend           45502 non-null  int64  
 13  pickup_time_minutes  43031 non-null  float64
 14  order_period         45502 non-null  object 
dtypes: float64(5), int64(3), object(7)
m

In [None]:
# Alias, not a copy: `df` and `preprocessed_data` refer to the same DataFrame object.
df = preprocessed_data

In [None]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city,time_taken,distance(km),is_weekend,pickup_time_minutes,order_period
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,24,3.03,1,15.0,morning
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,33,20.18,0,5.0,evening
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,26,1.55,1,15.0,morning
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,21,7.79,0,10.0,evening
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,30,6.21,1,15.0,afternoon


In [None]:
# Report, for each column, the percentage of rows with a missing value (2 decimals).
n_rows = len(df)
for col_name in df.columns:
    pct_missing = round(df[col_name].isna().sum() / n_rows * 100, 2)
    print(f"{col_name} contains {pct_missing}% of missing values.")

age contains 4.07% of missing values.
ratings contains 4.19% of missing values.
weather contains 1.15% of missing values.
traffic contains 1.12% of missing values.
vehicle_condition contains 0.0% of missing values.
type_of_order contains 0.0% of missing values.
type_of_vehicle contains 0.0% of missing values.
multiple_deliveries contains 2.18% of missing values.
festival contains 0.5% of missing values.
city contains 2.63% of missing values.
time_taken contains 0.0% of missing values.
distance(km) contains 7.98% of missing values.
is_weekend contains 0.0% of missing values.
pickup_time_minutes contains 5.43% of missing values.
order_period contains 0.0% of missing values.


**Train/test split**

In [None]:
# Separate the target column from the predictor columns.
y = df["time_taken"]
X = df.drop(columns=["time_taken"])

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Missing-value count per feature in the training split.
x_train.isnull().sum()

Unnamed: 0,0
age,1470
ratings,1510
weather,421
traffic,407
vehicle_condition,0
type_of_order,0
type_of_vehicle,0
multiple_deliveries,795
festival,188
city,968


**Transforming target feature**

In [None]:
# Power-transform the target: the transformer is fitted on the training target
# only and then applied, unchanged, to the test target.
pt = PowerTransformer()

# The raw targets are re-derived from `y` through the split indices rather than
# read from `y_train` / `y_test`, because this cell overwrites those two names
# with NumPy arrays. Reading them back would make a second run of the cell fail
# on `.values`; selecting from `y` keeps the cell re-runnable and gives the same
# rows, in the same order, as the split produced.
y_train = pt.fit_transform(y.loc[x_train.index].to_numpy().reshape(-1, 1))
y_test = pt.transform(y.loc[x_test.index].to_numpy().reshape(-1, 1))

In [None]:
# Inspect the transformed training target (a single-column 2-D array).
y_train

array([[-0.73874627],
       [-0.6099879 ],
       [ 0.39299837],
       ...,
       [ 0.77215502],
       [ 0.39299837],
       [ 0.49090731]])

In [None]:
# Column dtypes and non-null counts of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36401 entries, 11029 to 15795
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  34931 non-null  float64
 1   ratings              34891 non-null  float64
 2   weather              35980 non-null  object 
 3   traffic              35994 non-null  object 
 4   vehicle_condition    36401 non-null  int64  
 5   type_of_order        36401 non-null  object 
 6   type_of_vehicle      36401 non-null  object 
 7   multiple_deliveries  35606 non-null  float64
 8   festival             36213 non-null  object 
 9   city                 35433 non-null  object 
 10  distance(km)         33470 non-null  float64
 11  is_weekend           36401 non-null  int64  
 12  pickup_time_minutes  34432 non-null  float64
 13  order_period         36401 non-null  object 
dtypes: float64(5), int64(2), object(7)
memory usage: 4.2+ MB


# Imputation

In [None]:
# Columns whose gaps are filled with the most frequent value.
categorical = ["weather", "traffic", "multiple_deliveries", "festival", "city"]

# Numerical columns, grouped by the imputer applied to them.
median = ["age", "ratings"]
knn = ["distance(km)", "pickup_time_minutes"]
numerical = [*median, *knn]

# Unordered categorical columns (one-hot encoded).
nominal_cat_cols = [
    "weather",
    "type_of_order",
    "type_of_vehicle",
    "festival",
    "city",
    "order_period",
]

# Ordered categorical column (ordinal encoded) and its level order.
ordinal_cat_cols = ["traffic"]
traffic_order = ["low", "medium", "high", "jam"]

Mode Imputation

In [None]:
# Most-frequent-value imputation for the columns in `categorical`;
# every other column is passed through unchanged.
mode_imputer = ColumnTransformer(
    transformers=[
        (
            "mode_imputer",
            SimpleImputer(strategy="most_frequent"),
            categorical,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
    n_jobs=-1,
)

In [None]:
# Display the configured (not yet fitted) transformer.
mode_imputer

In [None]:
# Fit the mode imputer on the training split and count the nulls left per column.
(
    pd.DataFrame(
        data=mode_imputer.fit_transform(x_train),
        columns=mode_imputer.get_feature_names_out(),
    )
    .isna()
    .sum()
)

Unnamed: 0,0
weather,0
traffic,0
multiple_deliveries,0
festival,0
city,0
age,1470
ratings,1510
vehicle_condition,0
type_of_order,0
type_of_vehicle,0


Median Imputation

In [None]:
# Median imputation for the columns in `median`;
# every other column is passed through unchanged.
median_imputer = ColumnTransformer(
    transformers=[
        (
            "median_imputer",
            SimpleImputer(strategy="median"),
            median,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
    n_jobs=-1,
)

In [None]:
# Fit the median imputer on the training split and count the nulls left per column.
(
    pd.DataFrame(
        data=median_imputer.fit_transform(x_train),
        columns=median_imputer.get_feature_names_out(),
    )
    .isna()
    .sum()
)

Unnamed: 0,0
age,0
ratings,0
weather,421
traffic,407
vehicle_condition,0
type_of_order,0
type_of_vehicle,0
multiple_deliveries,795
festival,188
city,968


KNN Imputation

In [None]:
# K-nearest-neighbours imputation (5 neighbours) for the columns in `knn`;
# every other column is passed through unchanged.
knn_imputer = ColumnTransformer(
    transformers=[
        (
            "knn_imputer",
            KNNImputer(n_neighbors=5),
            knn,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
    n_jobs=-1,
)

In [None]:
# Fit the KNN imputer on the training split and count the nulls left per column.
(
    pd.DataFrame(
        data=knn_imputer.fit_transform(x_train),
        columns=knn_imputer.get_feature_names_out(),
    )
    .isna()
    .sum()
)

Unnamed: 0,0
distance(km),0
pickup_time_minutes,0
age,1470
ratings,1510
weather,421
traffic,407
vehicle_condition,0
type_of_order,0
type_of_vehicle,0
multiple_deliveries,795


Scaling & Encoding

In [None]:
# Categorical encoding step:
#   * nominal columns -> one-hot, first level dropped;
#   * `traffic`       -> integer codes following `traffic_order`; values not in
#                        that list are encoded as -1.
# Columns that are not listed are forwarded unchanged (remainder="passthrough").
encoder = ColumnTransformer(
    transformers=[
        (
            "nominal_encode",
            OneHotEncoder(handle_unknown="ignore", drop="first"),
            nominal_cat_cols,
        ),
        (
            "ordinal_encode",
            OrdinalEncoder(
                categories=[traffic_order],
                unknown_value=-1,
                handle_unknown="use_encoded_value",
            ),
            ordinal_cat_cols,
        ),
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
    n_jobs=-1,
)

In [None]:
# Min-max scale the columns in `numerical`; all other columns are forwarded unchanged.
scaler = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), numerical)],
    verbose_feature_names_out=False,
    remainder="passthrough",
    n_jobs=-1,
)

# Imputation Pipeline

**Preprocessing pipeline, fitted on the training data and applied to both the train and test sets**

In [None]:
# --- Column groups, named after the treatment each group receives -------------
numerical_median_impute_scale = ["age", "ratings"]
numerical_knn_impute_scale = ["distance(km)", "pickup_time_minutes"]
ordinal_impute_encode = ["traffic"]
nominal_impute_encode = ["weather", "multiple_deliveries", "festival", "city"]
nominal_encode_only = ["type_of_order", "type_of_vehicle", "order_period"]
passthrough_columns = ["vehicle_condition", "is_weekend"]

# --- One small pipeline per treatment ------------------------------------------
# Median fill, then min-max scaling.
numerical_median_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
])

# 5-nearest-neighbour fill, then min-max scaling.
numerical_knn_pipeline = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5)),
    ("scaler", MinMaxScaler()),
])

# Most-frequent fill, then integer codes following `traffic_order`
# (values outside that list are encoded as -1).
ordinal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "encoder",
        OrdinalEncoder(
            categories=[traffic_order],
            unknown_value=-1,
            handle_unknown="use_encoded_value",
        ),
    ),
])

# Most-frequent fill, then one-hot encoding with the first level dropped.
nominal_impute_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first")),
])

# One-hot encoding only, for nominal columns that need no imputation step.
nominal_encode_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first")),
])

# --- Assemble everything --------------------------------------------------------
# Output columns follow the order of this list. No `remainder` is given, so
# columns not named in any group are left out of the result.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_median", numerical_median_pipeline, numerical_median_impute_scale),
        ("num_knn", numerical_knn_pipeline, numerical_knn_impute_scale),
        ("ord_cat", ordinal_pipeline, ordinal_impute_encode),
        ("nom_impute_cat", nominal_impute_pipeline, nominal_impute_encode),
        ("nom_cat", nominal_encode_pipeline, nominal_encode_only),
        ("pass", "passthrough", passthrough_columns),
    ],
    verbose_feature_names_out=False,
    n_jobs=-1,
)

**Apply the final preprocessing pipeline**

In [None]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split so no test-set statistics are learned.
x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)

In [None]:
# Total number of NaN entries left in the processed training matrix.
np.isnan(x_train_processed).sum()

np.int64(0)

In [None]:
# Inspect the processed training matrix.
x_train_processed

array([[0.31578947, 0.88      , 0.39538462, ..., 1.        , 2.        ,
        0.        ],
       [0.05263158, 0.96      , 0.23641026, ..., 0.        , 2.        ,
        0.        ],
       [0.52631579, 0.88      , 0.00410256, ..., 0.        , 2.        ,
        1.        ],
       ...,
       [0.84210526, 0.76      , 0.44717949, ..., 0.        , 1.        ,
        0.        ],
       [0.26315789, 0.96      , 0.3225641 , ..., 0.        , 2.        ,
        1.        ],
       [0.47368421, 0.92      , 0.32051282, ..., 1.        , 2.        ,
        0.        ]])

In [None]:
# Total number of NaN entries left in the processed test matrix.
np.isnan(x_test_processed).sum()

np.int64(0)

In [None]:
# Persist the fitted artifacts so the same preprocessing can be reused elsewhere.
# (`joblib` is imported with the other libraries in the first cell.)
joblib.dump(preprocessor, "preprocessor.joblib")

# The target was transformed with `pt`; save the fitted transformer as well so
# values on the transformed scale can be mapped back with `pt.inverse_transform`.
joblib.dump(pt, "target_transformer.joblib")

# Output column names of the fitted preprocessor. Kept as the last expression so
# the cell still displays the path of the feature-names file.
joblib.dump(
    preprocessor.get_feature_names_out(),
    "preprocessor_feature_names.joblib"
)


['preprocessor_feature_names.joblib']