# Data preprocessing

In [50]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Location of the raw trip-pricing CSV (the path looks like a Colab Google Drive
# mount; adjust it when running elsewhere).
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Project9/taxi_trip_pricing11.csv'

# Load the raw data and preview the first row.
df = pd.read_csv(DATA_PATH)
df.head(1)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35km,Morning,Weekday,3,Low,Clear,3.56,0.8,0.32,53.82,$36.26


In [3]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    object 
 1   Time_of_Day            948 non-null    object 
 2   Day_of_Week            949 non-null    object 
 3   Passenger_Count        948 non-null    object 
 4   Traffic_Conditions     949 non-null    object 
 5   Weather                949 non-null    object 
 6   Base_Fare              949 non-null    float64
 7   Per_Km_Rate            948 non-null    float64
 8   Per_Minute_Rate        946 non-null    float64
 9   Trip_Duration_Minutes  947 non-null    float64
 10  Trip_Price             952 non-null    object 
dtypes: float64(4), object(7)
memory usage: 86.1+ KB


In [4]:
# Count how often each distinct Trip_Distance_km value occurs.
df['Trip_Distance_km'].value_counts()

Unnamed: 0_level_0,count
Trip_Distance_km,Unnamed: 1_level_1
10.14,3
13.64,3
32.04,3
31.29,3
3.22,3
...,...
32.12,1
19.36,1
31.67,1
25.65,1


In [5]:
# List the distinct raw Passenger_Count values to spot inconsistent encodings.
df['Passenger_Count'].unique()

array(['3', '1', 'four', '2', '4', 'two', nan], dtype=object)

In [6]:
# Normalize spelled-out counts to digit strings so the column can be cast to int.
df['Passenger_Count'] = df['Passenger_Count'].replace({'four': '4', 'two': '2'})

# Impute missing counts with the most frequent value. The result is assigned
# back to the column: calling fillna(..., inplace=True) on df[col] operates on
# an intermediate object and stops updating df in pandas 3.0.
df['Passenger_Count'] = df['Passenger_Count'].fillna(df['Passenger_Count'].mode()[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Passenger_Count'].fillna(df['Passenger_Count'].mode()[0], inplace=True)


In [7]:
# Re-check the distinct values after normalization and imputation.
df['Passenger_Count'].unique()

array(['3', '1', '4', '2'], dtype=object)

In [21]:
# Cast the digit strings to integers. astype(int) fails on NaN or non-numeric
# text, so this has to run after the cleanup of this column.
df['Passenger_Count'] = df['Passenger_Count'].astype(int)

In [22]:
# Confirm the column's dtype and non-null count after the cast.
df['Passenger_Count'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1000 entries, 0 to 999
Series name: Passenger_Count
Non-Null Count  Dtype
--------------  -----
1000 non-null   int64
dtypes: int64(1)
memory usage: 7.9 KB


In [23]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
Trip_Distance_km,50
Time_of_Day,52
Day_of_Week,51
Passenger_Count,0
Traffic_Conditions,51
Weather,51
Base_Fare,51
Per_Km_Rate,52
Per_Minute_Rate,54
Trip_Duration_Minutes,53


In [24]:
# Inspect the raw Trip_Price values (some entries carry a '$' prefix).
df['Trip_Price'].value_counts()

Unnamed: 0_level_0,count
Trip_Price,Unnamed: 1_level_1
$36.26,1
9.9494,1
65.195,1
85.4652,1
69.9148,1
...,...
12.9998,1
46.7471,1
45.8975,1
58.5727,1


In [25]:
# Columns that still contain at least one missing value.
missing_columns = df.columns[df.isnull().sum() > 0]

# Impute each of them: column mean for numeric columns, most frequent value
# (mode) for everything else.
# NOTE(review): Trip_Distance_km and Trip_Price are still text columns at this
# point, so they are mode-imputed as strings rather than numerically; consider
# parsing them to float before imputing.
for col in missing_columns:
    # is_numeric_dtype also recognizes non-object string dtypes as non-numeric,
    # which a plain `dtype == 'object'` comparison would not.
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].mean()
    else:
        fill_value = df[col].mode()[0]
    # Assign back instead of fillna(..., inplace=True) on df[col]: the chained
    # in-place form works on an intermediate object and no longer updates df
    # in pandas 3.0.
    df[col] = df[col].fillna(fill_value)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [26]:
# Verify that no column has missing values left after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       1000 non-null   object 
 1   Time_of_Day            1000 non-null   object 
 2   Day_of_Week            1000 non-null   object 
 3   Passenger_Count        1000 non-null   int64  
 4   Traffic_Conditions     1000 non-null   object 
 5   Weather                1000 non-null   object 
 6   Base_Fare              1000 non-null   float64
 7   Per_Km_Rate            1000 non-null   float64
 8   Per_Minute_Rate        1000 non-null   float64
 9   Trip_Duration_Minutes  1000 non-null   float64
 10  Trip_Price             1000 non-null   object 
dtypes: float64(4), int64(1), object(6)
memory usage: 86.1+ KB


In [27]:
# Collect the columns still stored as object dtype; these are either genuine
# categoricals or numeric values stored as text.
problem_columns = df.select_dtypes(include=['object']).columns
print("Object columns to investigate:", problem_columns)

Object columns to investigate: Index(['Trip_Distance_km', 'Time_of_Day', 'Day_of_Week', 'Traffic_Conditions',
       'Weather', 'Trip_Price'],
      dtype='object')


In [30]:
# Columns whose numeric values are stored as decorated text (e.g. '19.35km',
# '$36.26') and need to be parsed to float.
columns_to_clean = ['Trip_Distance_km', 'Trip_Price']

def clean_numeric_column(column):
    """Parse a column of decorated numeric text (e.g. '19.35km', '$36.26') to float.

    Parameters
    ----------
    column : pandas.Series
        Values to parse. May mix strings, real numbers and missing values.

    Returns
    -------
    pandas.Series
        float64 Series on the same index as ``column``. Entries that contain
        no parsable number (missing values, pure text, malformed residue such
        as '1.2.3') become NaN.
    """
    # Entries that already are numbers, or plain numeric strings, parse
    # directly. Going through the `.str` accessor alone would turn real
    # numbers inside an object column into NaN.
    parsed = pd.to_numeric(column, errors='coerce')

    # For everything else, strip all characters except digits, '.' and '-'
    # (units, currency symbols, whitespace) and parse what is left. The sign
    # is kept so negative values are not silently flipped, and errors='coerce'
    # maps unparsable residue to NaN instead of raising.
    stripped = column.astype(str).str.replace(r'[^\d.\-]', '', regex=True)
    fallback = pd.to_numeric(stripped, errors='coerce')

    return parsed.fillna(fallback).astype(float)

# Parse every listed column that df actually has; names absent from df are skipped.
for col in (name for name in columns_to_clean if name in df.columns):
    df[col] = clean_numeric_column(df[col])

# Show the resulting dtypes to confirm the conversion.
print(df.dtypes)


Trip_Distance_km         float64
Time_of_Day               object
Day_of_Week               object
Passenger_Count            int64
Traffic_Conditions        object
Weather                   object
Base_Fare                float64
Per_Km_Rate              float64
Per_Minute_Rate          float64
Trip_Duration_Minutes    float64
Trip_Price               float64
dtype: object


In [35]:
# Inspect Trip_Price again after parsing it to float.
df['Trip_Price'].value_counts()

Unnamed: 0_level_0,count
Trip_Price,Unnamed: 1_level_1
36.2600,1
50.7890,1
85.4652,1
69.9148,1
72.1607,1
...,...
46.7471,1
45.8975,1
58.5727,1
86.8977,1


In [36]:
# Check dtypes and non-null counts after the numeric parsing step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       1000 non-null   float64
 1   Time_of_Day            1000 non-null   object 
 2   Day_of_Week            1000 non-null   object 
 3   Passenger_Count        1000 non-null   int64  
 4   Traffic_Conditions     1000 non-null   object 
 5   Weather                1000 non-null   object 
 6   Base_Fare              1000 non-null   float64
 7   Per_Km_Rate            1000 non-null   float64
 8   Per_Minute_Rate        1000 non-null   float64
 9   Trip_Duration_Minutes  1000 non-null   float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(6), int64(1), object(4)
memory usage: 86.1+ KB


In [37]:
# Fill the Trip_Price values that are still missing after numeric parsing with
# the column mean. The result is assigned back: fillna(..., inplace=True) on
# df[col] works on an intermediate object and no longer updates df in pandas 3.0.
# NOTE(review): Trip_Price is used as the regression target further down;
# mean-imputing it fabricates labels for training and evaluation. Consider
# dropping those rows instead, e.g. df.dropna(subset=['Trip_Price']).
df['Trip_Price'] = df['Trip_Price'].fillna(df['Trip_Price'].mean())

In [38]:
# Confirm that Trip_Price has no missing values left.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       1000 non-null   float64
 1   Time_of_Day            1000 non-null   object 
 2   Day_of_Week            1000 non-null   object 
 3   Passenger_Count        1000 non-null   int64  
 4   Traffic_Conditions     1000 non-null   object 
 5   Weather                1000 non-null   object 
 6   Base_Fare              1000 non-null   float64
 7   Per_Km_Rate            1000 non-null   float64
 8   Per_Minute_Rate        1000 non-null   float64
 9   Trip_Duration_Minutes  1000 non-null   float64
 10  Trip_Price             1000 non-null   float64
dtypes: float64(6), int64(1), object(4)
memory usage: 86.1+ KB


In [39]:
# Names of the remaining text/categorical columns, which will be encoded below.
categorical_col=df.select_dtypes(include=['object','category']).columns

categorical_col

Index(['Time_of_Day', 'Day_of_Week', 'Traffic_Conditions', 'Weather'], dtype='object')

In [41]:
# Number of distinct categories per categorical column.
cardinality=df[categorical_col].nunique()

cardinality

Unnamed: 0,0
Time_of_Day,4
Day_of_Week,2
Traffic_Conditions,3
Weather,3


In [43]:
# One-hot encode the categorical columns and replace them with their indicator columns.
encoder = OneHotEncoder()
encoded_array = encoder.fit_transform(df[categorical_col])
encoded_df = pd.DataFrame(
    encoded_array.toarray(),
    columns=encoder.get_feature_names_out(categorical_col),
    # Reuse df's row labels: pd.concat(axis=1) aligns on the index, so a fresh
    # 0..n-1 index would misalign rows (and introduce NaN) whenever df's index
    # is not the default range, e.g. after rows were dropped.
    index=df.index,
)
df = df.drop(columns=categorical_col)
df = pd.concat([df, encoded_df], axis=1)

In [44]:
# Inspect the frame after encoding: the categorical columns are replaced by indicator columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Trip_Distance_km           1000 non-null   float64
 1   Passenger_Count            1000 non-null   int64  
 2   Base_Fare                  1000 non-null   float64
 3   Per_Km_Rate                1000 non-null   float64
 4   Per_Minute_Rate            1000 non-null   float64
 5   Trip_Duration_Minutes      1000 non-null   float64
 6   Trip_Price                 1000 non-null   float64
 7   Time_of_Day_Afternoon      1000 non-null   float64
 8   Time_of_Day_Evening        1000 non-null   float64
 9   Time_of_Day_Morning        1000 non-null   float64
 10  Time_of_Day_Night          1000 non-null   float64
 11  Day_of_Week_Weekday        1000 non-null   float64
 12  Day_of_Week_Weekend        1000 non-null   float64
 13  Traffic_Conditions_High    1000 non-null   float6

In [45]:
# Features are every column except Trip_Price; Trip_Price is the regression target.
X = df.drop(columns=['Trip_Price'])
y = df['Trip_Price']

In [46]:
# Keep 70% of the rows for training and hold out 30%; random_state fixes the shuffle.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)


# Split the held-out 30% in half: validation and test each get about 15% of the data.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [47]:
# Fit a decision-tree regressor (only random_state is set) on the unscaled training features.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [49]:
# Score the fitted tree on the test split: mean squared error and R^2.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"mse:{mse}", f"r2:{r2}", sep="\n")

mse:312.6512200536612
r2:0.8445700437634959


In [51]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [53]:
# Refit a fresh tree on the standardized features. This rebinds `model`, so the
# tree fitted on the unscaled features is no longer reachable under that name.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

In [56]:
# Score the tree trained on standardized features on the standardized test split.
y_pred = model.predict(X_test_scaled)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"mse:{mse}", f"r2:{r2}", sep="\n")

mse:302.56305594539447
r2:0.8495852261945303
