# Data Proprocessing

In [23]:
# Calling Librarries

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [2]:
df=pd.read_csv("taxi_trip_pricing.csv")

In [3]:
df.head(3)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [4]:
# Missing values

missing_columns=df.isnull().sum()
missing_columns=missing_columns[missing_columns>0]

if not missing_columns.empty:
    print((f"MISSING VALUES:\n{missing_columns}"))
else:
    print("No missining values")

MISSING VALUES:
Trip_Distance_km         50
Time_of_Day              50
Day_of_Week              50
Passenger_Count          50
Traffic_Conditions       50
Weather                  50
Base_Fare                50
Per_Km_Rate              50
Per_Minute_Rate          50
Trip_Duration_Minutes    50
Trip_Price               49
dtype: int64


In [7]:
# Categorical and numerical columns

categorical_columns=df.select_dtypes(include=["object","category"]).columns.tolist()
print(f"CATEGORICAL COLUMNS:\n{categorical_columns}")
print(f"Total number of categorical columns:{len(categorical_columns)}")

numerical_columns=df.select_dtypes(include=["int64","float64"]).columns.tolist()
print(f"\nNUMERICAL COLUMNS:\n{numerical_columns}")
print(f"Total nymber of numerical columns:{len(numerical_columns)}")

CATEGORICAL COLUMNS:
['Time_of_Day', 'Day_of_Week', 'Traffic_Conditions', 'Weather']
Total number of categorical columns:4

NUMERICAL COLUMNS:
['Trip_Distance_km', 'Passenger_Count', 'Base_Fare', 'Per_Km_Rate', 'Per_Minute_Rate', 'Trip_Duration_Minutes', 'Trip_Price']
Total nymber of numerical columns:7


# Handling with Missing values

In [5]:
missing_columns=df.columns[df.isnull().sum()>0]

for col in missing_columns:
    if df[col].dtype=="object":
        df[col].fillna(df[col].mode()[0],inplace=True)
    else:
        df[col].fillna(df[col].mean(),inplace=True)
    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)


In [17]:
df.head(10)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,3.502989,0.62,0.43,40.57,56.874773
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,Clear,3.48,0.51,0.15,116.81,36.4698
4,27.070547,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618
5,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028
6,3.85,Afternoon,Weekday,4.0,High,Rain,3.51,1.66,0.292916,5.05,11.2645
7,43.44,Evening,Weekend,3.0,Low,Clear,2.97,1.87,0.23,62.118116,101.1216
8,30.45,Morning,Weekday,3.0,High,Clear,2.77,1.78,0.34,110.33,56.874773
9,35.7,Afternoon,Weekday,2.0,Low,Rain,3.39,1.52,0.47,62.118116,75.5657


In [19]:
# Missing values

missing_columns=df.isnull().sum()
missing_columns=missing_columns[missing_columns>0]

if not missing_columns.empty:
    print((f"MISSING VALUES:\n{missing_columns}"))
else:
    print("No missining values")

No missining values


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       1000 non-null   float64
 1   Time_of_Day            1000 non-null   object 
 2   Day_of_Week            1000 non-null   object 
 3   Passenger_Count        1000 non-null   float64
 4   Traffic_Conditions     1000 non-null   object 
 5   Weather                1000 non-null   object 
 6   Base_Fare              1000 non-null   float64
 7   Per_Km_Rate            1000 non-null   float64
 8   Per_Minute_Rate        1000 non-null   float64
 9   Trip_Duration_Minutes  1000 non-null   float64
 10  Trip_Price             1000 non-null   float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


# Encoding

In [6]:
# Categorical  columns

categorical_columns=df.select_dtypes(include=["object","category"]).columns.tolist()
print(f"CATEGORICAL COLUMNS:\n{categorical_columns}")
print(f"\nTotal number of categorical columns:{len(categorical_columns)}")

CATEGORICAL COLUMNS:
['Time_of_Day', 'Day_of_Week', 'Traffic_Conditions', 'Weather']

Total number of categorical columns:4


In [7]:
# Finding cardinality

cardinality=df[categorical_columns].nunique()
print(cardinality)

Time_of_Day           4
Day_of_Week           2
Traffic_Conditions    3
Weather               3
dtype: int64


In [11]:
# Encoding One_hot encoding
# get_dummies

one_hot_encoding=pd.get_dummies(df,columns=categorical_columns,drop_first=True)
one_hot_encoding.head(10)

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
0,19.35,3.0,3.56,0.8,0.32,53.82,36.2624,False,True,False,False,True,False,False,False
1,47.59,1.0,3.502989,0.62,0.43,40.57,56.874773,False,False,False,False,False,False,False,False
2,36.87,1.0,2.7,1.21,0.15,37.27,52.9032,True,False,False,True,False,False,False,False
3,30.33,4.0,3.48,0.51,0.15,116.81,36.4698,True,False,False,False,True,False,False,False
4,27.070547,3.0,2.93,0.63,0.32,22.64,15.618,True,False,False,False,False,False,False,False
5,8.64,2.0,2.55,1.71,0.48,89.33,60.2028,False,False,False,True,False,True,False,False
6,3.85,4.0,3.51,1.66,0.292916,5.05,11.2645,False,False,False,False,False,False,True,False
7,43.44,3.0,2.97,1.87,0.23,62.118116,101.1216,True,False,False,True,True,False,False,False
8,30.45,3.0,2.77,1.78,0.34,110.33,56.874773,False,True,False,False,False,False,False,False
9,35.7,2.0,3.39,1.52,0.47,62.118116,75.5657,False,False,False,False,True,False,True,False


In [12]:
df=one_hot_encoding

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Trip_Distance_km           1000 non-null   float64
 1   Passenger_Count            1000 non-null   float64
 2   Base_Fare                  1000 non-null   float64
 3   Per_Km_Rate                1000 non-null   float64
 4   Per_Minute_Rate            1000 non-null   float64
 5   Trip_Duration_Minutes      1000 non-null   float64
 6   Trip_Price                 1000 non-null   float64
 7   Time_of_Day_Evening        1000 non-null   bool   
 8   Time_of_Day_Morning        1000 non-null   bool   
 9   Time_of_Day_Night          1000 non-null   bool   
 10  Day_of_Week_Weekend        1000 non-null   bool   
 11  Traffic_Conditions_Low     1000 non-null   bool   
 12  Traffic_Conditions_Medium  1000 non-null   bool   
 13  Weather_Rain               1000 non-null   bool  

In [14]:
df[df.columns]=df[df.columns].astype(int) # Qo'shilgan yangi ustunlarni booleandan int ga o'tkazish

In [15]:
df.head(1)

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
0,19,3,3,0,0,53,36,0,1,0,0,1,0,0,0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Trip_Distance_km           1000 non-null   int64
 1   Passenger_Count            1000 non-null   int64
 2   Base_Fare                  1000 non-null   int64
 3   Per_Km_Rate                1000 non-null   int64
 4   Per_Minute_Rate            1000 non-null   int64
 5   Trip_Duration_Minutes      1000 non-null   int64
 6   Trip_Price                 1000 non-null   int64
 7   Time_of_Day_Evening        1000 non-null   int64
 8   Time_of_Day_Morning        1000 non-null   int64
 9   Time_of_Day_Night          1000 non-null   int64
 10  Day_of_Week_Weekend        1000 non-null   int64
 11  Traffic_Conditions_Low     1000 non-null   int64
 12  Traffic_Conditions_Medium  1000 non-null   int64
 13  Weather_Rain               1000 non-null   int64
 14  Weather_Snow             

# Scaling


In [17]:
# MinMax scaling

min_max_scaler=MinMaxScaler()
df_min_max_scaled=pd.DataFrame(min_max_scaler.fit_transform(df),columns=df.columns)

df_min_max_scaled

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
0,0.124138,0.666667,0.333333,0.0,0.0,0.421053,0.092025,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.317241,0.000000,0.333333,0.0,0.0,0.307018,0.153374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.241379,0.000000,0.000000,0.5,0.0,0.280702,0.141104,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.200000,1.000000,0.333333,0.0,0.0,0.973684,0.092025,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.179310,0.666667,0.000000,0.0,0.0,0.149123,0.027607,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.027586,1.000000,0.000000,0.0,0.0,0.464912,0.085890,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
996,0.303448,1.000000,0.333333,0.0,0.0,0.491228,0.171779,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
997,0.041379,0.666667,0.000000,0.5,0.0,0.429825,0.082822,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
998,0.317241,0.000000,0.000000,0.0,0.0,0.956140,0.168712,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [35]:
df=df_min_max_scaled

# Train processing

In [39]:
x=df.drop("Trip_Price",axis=1) # Input
y=df["Trip_Price"]# Target variable

In [40]:
x.head(1)

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
0,0.124138,0.666667,0.333333,0.0,0.0,0.421053,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [44]:
print(f"Target variable:\n{y.head()}")

Target variable:
0    0.092025
1    0.153374
2    0.141104
3    0.092025
4    0.027607
Name: Trip_Price, dtype: float64


In [50]:
# data splitting

x_train, x_temp, y_train, y_temp=train_test_split(x,y, test_size=0.3, random_state=42) # 70% data training uchun

x_test, x_val, y_test, y_val=train_test_split(x_temp,y_temp, test_size=0.5, random_state=42) # 15% data validate uchun 15% i test uchun

In [51]:
print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

print(y_test.shape)
print(y_val.shape)

(700, 14)
(150, 14)
(150, 14)
(150,)
(150,)


In [70]:

model=LinearRegression() # selecting model

model.fit(x_train,y_train) # teaching model with 70% data


In [71]:
y.head() # old features

0    0.092025
1    0.153374
2    0.141104
3    0.092025
4    0.027607
Name: Trip_Price, dtype: float64

In [73]:
y_pred=model.predict(x_test)# predicting 
print(f"Predict columns:\n{y_pred[:5]}")

Predict columns:
[0.15304695 0.16625021 0.03845941 0.05057584 0.10908159]


# Model Evoluation

In [74]:
mse=mean_squared_error(y_test,y_pred)
mae=mean_absolute_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

print(f"Mean Squared Error (MSE):\n{mse:.2f}")
print(f"Mean Absolute Error (MAE):\n{mae:.2f}")
print(f"R2 Score (R2):\n{r2:.2f}")

Mean Squared Error (MSE):
0.00
Mean Absolute Error (MAE):
0.04
R2 Score (R2):
0.59


In [75]:
# coss-val-score, Kfold

Kfold=KFold(n_splits=5, shuffle=True,random_state=42)



In [80]:
cv_scores_lr = cross_val_score(model, x_train, y_train, cv=Kfold, scoring='r2')

cv_scores_lr_mean = np.mean(cv_scores_lr)
print(f"Linear Regression Cross-Validation R²: {cv_scores_lr_mean:.2f}")

Linear Regression Cross-Validation R²: 0.75


In [81]:
r2=r2_score(y_test,y_pred)
print(f"R2 Score (R2):\n{r2:.2f}")

R2 Score (R2):
0.59
