In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
import numpy as np
import pandas as pd

In [4]:
# Load the mock Delhi cab-fare dataset (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="delhi_cab_fare_mock_dataset.csv")

In [6]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,pickup_location,drop_location,distance_km,duration_min,hour,dayofweek,is_weekend,temperature,weather_condition,fare_amount
0,Saket,Rajouri Garden,6.56,17.2,13,0,0,32.2,Clouds,135.92
1,Connaught Place,Chandni Chowk,19.53,65.7,1,6,1,41.2,Clouds,340.06
2,Pitampura,Saket,18.92,69.4,17,3,0,33.4,Clouds,386.91
3,Dwarka,Saket,17.51,52.3,8,0,0,20.7,Fog,347.78
4,Connaught Place,Chandni Chowk,23.9,86.6,8,4,0,30.5,Thunderstorm,570.49


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Unnamed: 0,distance_km,duration_min,hour,dayofweek,is_weekend,temperature,fare_amount
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,14.10734,42.369,11.658,3.09,0.324,30.2414,278.88726
std,6.352196,21.207491,6.630608,2.038188,0.468469,6.976765,111.582978
min,3.0,6.3,0.0,0.0,0.0,18.0,84.34
25%,8.615,25.15,6.0,1.0,0.0,23.9,189.97
50%,13.98,40.1,12.0,3.0,0.0,30.8,273.34
75%,19.8075,57.1,17.0,5.0,1.0,36.3,363.955
max,24.99,99.6,23.0,6.0,1.0,41.9,581.89


In [18]:
# Count missing values per column (isna is the canonical spelling; isnull is its alias).
df.isna().sum()

pickup_location      0
drop_location        0
distance_km          0
duration_min         0
hour                 0
dayofweek            0
is_weekend           0
temperature          0
weather_condition    0
fare_amount          0
dtype: int64

In [32]:
# List the distinct weather categories in order of first appearance.
pd.unique(df["weather_condition"])

array(['Clouds', 'Fog', 'Thunderstorm', 'Clear', 'Haze', 'Rain'],
      dtype=object)

In [8]:
# Target: the fare to predict.
y = df["fare_amount"]

# Features: every column except the target and the pickup/drop location names.
X = df.drop(
    columns=[
        "pickup_location",
        "drop_location",
        "fare_amount",
    ]
)

In [36]:
# One-hot encode the weather column and pass every other column through
# unchanged. handle_unknown="ignore" makes a weather label that was not seen
# during fit encode as all zeros instead of raising at predict time; the
# default ("error") would make the fitted pipeline crash on any new label.
preprocessor = ColumnTransformer(
    [("weather", OneHotEncoder(handle_unknown="ignore"), ["weather_condition"])],
    remainder="passthrough",
)

In [38]:
# End-to-end estimator: column preprocessing followed by a seeded
# 100-tree random forest regressor.
model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        (
            "regressor",
            RandomForestRegressor(random_state=42, n_estimators=100),
        ),
    ]
)

In [40]:
# Hold out 20% of the rows. The split is seeded (same seed as the regressor's
# random_state) so that Restart & Run All reproduces the same train/test
# partition and therefore the same fitted model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [42]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(value=model, filename="cab_fare_model.pkl")

['cab_fare_model.pkl']