In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the raw taxi-trip pricing dataset (path is relative to the notebook's
# working directory) and display it as the cell output.
df = pd.read_csv(filepath_or_buffer="taxi_trip_pricing.csv")
df

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.80,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.70,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.6180
...,...,...,...,...,...,...,...,...,...,...,...
995,5.49,Afternoon,Weekend,4.0,Medium,Clear,2.39,0.62,0.49,58.39,34.4049
996,45.95,Night,Weekday,4.0,Medium,Clear,3.12,0.61,,61.96,62.1295
997,7.70,Morning,Weekday,3.0,Low,Rain,2.08,1.78,,54.18,33.1236
998,47.56,Morning,Weekday,1.0,Low,Clear,2.67,0.82,0.17,114.94,61.2090


In [3]:
# Print each column's dtype and non-null count to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


### Labels to convert into Integers 
#### --> Time_of_Day ('Afternoon' - 0, 'Evening' - 1, 'Morning' - 2, 'Night' - 3)
#### --> Day_of_Week ('Weekday' - 0, 'Weekend' - 1)
#### --> Traffic_Conditions ('High' - 0, 'Low' - 1, 'Medium' - 2)
#### --> Weather ('Clear' - 0, 'Rain' - 1, 'Snow' - 2)

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
count,950.0,950.0,950.0,950.0,950.0,950.0,951.0
mean,27.070547,2.476842,3.502989,1.233316,0.292916,62.118116,56.874773
std,19.9053,1.102249,0.870162,0.429816,0.115592,32.154406,40.469791
min,1.23,1.0,2.01,0.5,0.1,5.01,6.1269
25%,12.6325,1.25,2.73,0.86,0.19,35.8825,33.74265
50%,25.83,2.0,3.52,1.22,0.29,61.86,50.0745
75%,38.405,3.0,4.26,1.61,0.39,89.055,69.09935
max,146.067047,4.0,5.0,2.0,0.5,119.84,332.043689


In [5]:
# Flag rows that are exact copies of an earlier row, then report how many there are.
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(f"Count of duplicates: {num_duplicates}")

Count of duplicates: 0


In [6]:
# Impute missing numeric values with each column's median.
#
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df[...]: that form is chained assignment on a
# column selection, which raises a FutureWarning in pandas 2.x and no longer
# modifies `df` once Copy-on-Write is the default (pandas 3.0).
#
# NOTE(review): Trip_Price is the regression target. Median-filling it invents
# labels for rows that have no recorded price; dropping those rows with
# df.dropna(subset=['Trip_Price']) is usually preferable. Left unchanged here
# because it would alter the row count seen by every later cell -- confirm.
df['Trip_Distance_km'] = df['Trip_Distance_km'].fillna(df['Trip_Distance_km'].median())
df['Passenger_Count'] = df['Passenger_Count'].fillna(df['Passenger_Count'].median())
df['Trip_Price'] = df['Trip_Price'].fillna(df['Trip_Price'].median())

In [7]:
# Fill gaps in every categorical column with that column's most frequent value
# (mode()[0] picks the first mode when several values tie).
for categorical_column in ('Time_of_Day', 'Day_of_Week', 'Traffic_Conditions', 'Weather'):
    most_frequent_value = df[categorical_column].mode()[0]
    df[categorical_column] = df[categorical_column].fillna(most_frequent_value)

In [8]:
# Encoder used below to turn the categorical string columns into integer codes.
label_encoder = LabelEncoder()

In [9]:
# Encode each categorical column as integer codes, one fitted encoder per column.
#
# Every fit_transform() call refits the encoder, so sharing a single
# LabelEncoder across columns leaves only the last column's mapping available.
# Keeping a dedicated encoder per column means each mapping can be inspected
# (label_encoders[column].classes_), inverted, and reused to encode new inputs
# at prediction time. The integer codes produced are the same as before:
# LabelEncoder numbers the labels in sorted order.
categorical_columns = ["Time_of_Day", "Day_of_Week", "Traffic_Conditions", "Weather"]
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Keep the original name bound to the encoder fitted on the last column
# (Weather), which is the state the shared encoder used to end up in.
label_encoder = label_encoders["Weather"]

In [10]:
# Remove the fare-component and duration columns so they are not used as features.
unused_columns = ["Base_Fare", "Per_Km_Rate", "Per_Minute_Rate", "Trip_Duration_Minutes"]
df = df.drop(columns=unused_columns)

In [11]:
# Separate the target (trip price) from the remaining feature columns.
target_column = 'Trip_Price'
y = df[target_column]
X = df.drop(columns=[target_column])

In [12]:
# Keep the feature column labels (a pandas Index) in the order the model is trained on.
feature_names = X.columns

In [13]:
# Display the target series to confirm it has no remaining gaps after imputation.
y

0      36.2624
1      50.0745
2      52.9032
3      36.4698
4      15.6180
        ...   
995    34.4049
996    62.1295
997    33.1236
998    61.2090
999    45.4437
Name: Trip_Price, Length: 1000, dtype: float64

In [14]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=100,
)

In [15]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
regression_model = LinearRegression()

In [16]:
# Display the feature matrix; the categorical columns now hold integer codes.
X

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather
0,19.35,2,0,3.0,1,0
1,47.59,0,0,1.0,0,0
2,36.87,1,1,1.0,0,0
3,30.33,1,0,4.0,1,0
4,25.83,1,0,3.0,0,0
...,...,...,...,...,...,...
995,5.49,0,1,4.0,2,0
996,45.95,3,0,4.0,2,0
997,7.70,2,0,3.0,1,1
998,47.56,2,0,1.0,1,0


In [17]:
# Train on the training split. fit() returns the estimator itself, so `model`
# refers to the same (now fitted) object as `regression_model`.
model = regression_model.fit(X=X_train, y=y_train)

In [18]:
# Fitted intercept term of the linear model.
model.intercept_

9.147383230200226

In [19]:
# Fitted coefficients, one per feature, in the order given by model.feature_names_in_.
model.coef_

array([ 1.68242578,  1.25752874, -2.46459491,  1.28063718, -1.55741552,
       -0.77459797])

In [20]:
# Column names the model saw during fit; this is the order predict() inputs must follow.
model.feature_names_in_

array(['Trip_Distance_km', 'Time_of_Day', 'Day_of_Week',
       'Passenger_Count', 'Traffic_Conditions', 'Weather'], dtype=object)

In [21]:
# Predict the price of a single example trip.
#
# Values are given in training-column order: Trip_Distance_km, Time_of_Day,
# Day_of_Week, Passenger_Count, Traffic_Conditions, Weather. The categorical
# fields must be the integer codes produced by the label-encoding step.
#
# The row is wrapped in a DataFrame carrying the training column names. The
# model was fitted on a DataFrame, so passing a bare nested list makes
# scikit-learn emit "X does not have valid feature names" and skips its
# check that the columns match the ones used for fitting.
sample_trip = pd.DataFrame([[10, 2, 1, 2, 2, 1]], columns=model.feature_names_in_)
model.predict(sample_trip)



array([24.69394896])

In [22]:
# Persist the fitted model to disk (relative to the working directory);
# joblib.dump returns the list of file names it wrote.
joblib.dump(model, "linearRegression.pkl")

['linearRegression.pkl']

In [23]:
# Reload the model that was just saved. joblib files are pickle-based, so only
# load files from a trusted source.
model_1 = joblib.load("linearRegression.pkl")

In [24]:
# Sanity-check the reloaded model on an example trip; the values are in
# training-column order (Trip_Distance_km, Time_of_Day, Day_of_Week,
# Passenger_Count, Traffic_Conditions, Weather) with categorical fields given
# as their integer codes.
#
# The row is wrapped in a DataFrame carrying the training column names. The
# model was fitted on a DataFrame, so passing a bare nested list makes
# scikit-learn emit "X does not have valid feature names" and skips its
# check that the columns match the ones used for fitting.
sample_trip = pd.DataFrame([[10, 2, 1, 2, 2, 1]], columns=model_1.feature_names_in_)
model_1.predict(sample_trip)



array([24.69394896])