In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_excel("Taxi_cleaned.xlsx")
df

Unnamed: 0,Trip_Distance_km,Passenger_Count,Per_Km_Rate,Trip_Duration_Minutes,Trip_Price,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
0,5.279810,3,0.800000,53.82,36.2624,0,1,0,0,1,0,0,0
1,7.378374,1,1.210000,37.27,52.9032,1,0,0,1,0,0,0,0
2,6.690839,4,0.510000,116.81,36.4698,1,0,0,0,1,0,0,0
3,3.253939,2,1.710000,89.33,60.2028,0,0,0,1,0,1,0,0
4,1.733422,4,1.660000,5.05,11.2645,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,2.347067,4,0.620000,58.39,34.4049,0,0,0,1,0,1,0,0
854,8.213556,4,0.610000,61.96,62.1295,0,0,1,0,0,1,0,0
855,3.009442,3,1.780000,54.18,33.1236,0,1,0,0,1,0,1,0
856,8.350267,1,0.820000,114.94,61.2090,0,1,0,0,1,0,0,0


In [3]:
X = df.drop("Trip_Price",axis = 1)
y = df["Trip_Price"]

- Select the X & y

### Train-test split

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=54)

### Modelling

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

print("Intercept",model.intercept_)
print("Coefficient",model.coef_)

Intercept -60.28115623212574
Coefficient [12.64736968 -0.98792375 27.10868716  0.27989231 -1.37391786  0.2859285
 -2.12818162 -0.31921192 -3.87236056 -4.221999    0.73276762  3.1312817 ]


### Prediction & Evaluation

In [6]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

print("Train r2:",r2_score(y_train,ypred_train))
print("Test R2:",r2_score(y_test,ypred_test))
print("Cross validation score:",cross_val_score(model,X_train,y_train,cv=5).mean())

Train r2: 0.631875821559633
Test R2: 0.6210014107369882
Cross validation score: 0.6151228779206688


### Variable Significance

In [7]:
import statsmodels.formula.api as smf
model = smf.ols("y~X",data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.631
Model:,OLS,Adj. R-squared:,0.625
Method:,Least Squares,F-statistic:,120.3
Date:,"Tue, 12 Aug 2025",Prob (F-statistic):,1.8700000000000002e-173
Time:,04:10:40,Log-Likelihood:,-3993.8
No. Observations:,858,AIC:,8014.0
Df Residuals:,845,BIC:,8075.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-60.5344,4.911,-12.326,0.000,-70.174,-50.895
X[0],12.5189,0.368,34.014,0.000,11.796,13.241
X[1],-0.4538,0.817,-0.556,0.579,-2.057,1.149
X[2],26.5839,2.107,12.618,0.000,22.449,30.719
X[3],0.2960,0.027,10.877,0.000,0.243,0.349
X[4],-1.9553,2.376,-0.823,0.411,-6.619,2.708
X[5],-0.7263,2.148,-0.338,0.735,-4.942,3.489
X[6],-2.5392,3.193,-0.795,0.427,-8.806,3.728
X[7],-0.3081,1.930,-0.160,0.873,-4.096,3.480

0,1,2,3
Omnibus:,732.8,Durbin-Watson:,1.954
Prob(Omnibus):,0.0,Jarque-Bera (JB):,18614.349
Skew:,3.844,Prob(JB):,0.0
Kurtosis:,24.484,Cond. No.,427.0


### Reapplying the algorithm after remove the features

In [8]:
X = df.drop(["Trip_Price","Passenger_Count","Day_of_Week_Weekend","Time_of_Day_Night","Time_of_Day_Morning","Weather_Rain","Traffic_Conditions_Low","Traffic_Conditions_Medium","Time_of_Day_Evening","Weather_Snow"],axis = 1)
y = df["Trip_Price"]

#Train-test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=54)

#Modelling
from sklearn.linear_model import LinearRegression
l_model = LinearRegression()
l_model.fit(X_train,y_train)

print("Intercept",l_model.intercept_)
print("Coefficient",l_model.coef_)

#Prediction & Evaluation
ypred_train = l_model.predict(X_train)
from sklearn.metrics import r2_score
print("Train r2:",r2_score(y_train,ypred_train))

#Prediction on test data
ypred_test = l_model.predict(X_test)
print("Test R2:",r2_score(y_test,ypred_test))

from sklearn.model_selection import cross_val_score
print("Cross validation score:",cross_val_score(l_model,X_train,y_train,cv=5).mean())

Intercept -66.5297797411119
Coefficient [12.72395992 26.95878352  0.28140925]
Train r2: 0.6293989170651184
Test R2: 0.6227142822908913
Cross validation score: 0.6222152578736413
