# Working with the Trip Dataset

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [24]:
# Read the raw trip records from the CSV export into a DataFrame and display it.
# NOTE(review): the path is an absolute location on a local E: drive, so this cell
# only runs on a machine that has the file at exactly this location.
trip_csv_path = r"E:\Assignment\Assignment_6\Dataset\trip.csv"
trip_df = pd.read_csv(trip_csv_path)
trip_df

Unnamed: 0,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Fare,Tips,Tolls,Extras,Trip Total,Payment Type
0,3/1/2024 0:00,3/1/2024 0:00,15.0,0.09,3.25,0.00,0.0,35.0,38.75,Credit Card
1,3/1/2024 0:00,3/1/2024 0:15,900.0,3.00,12.00,0.00,0.0,0.0,12.00,Cash
2,3/1/2024 0:00,3/1/2024 0:00,711.0,5.84,16.75,4.45,0.0,5.0,26.70,Credit Card
3,3/1/2024 0:00,3/1/2024 0:30,1770.0,13.36,34.75,7.85,0.0,4.0,47.10,Credit Card
4,3/1/2024 0:00,3/1/2024 0:15,849.0,6.13,18.51,4.31,0.0,0.0,22.82,Mobile
...,...,...,...,...,...,...,...,...,...,...
49995,2/27/2024 10:15,2/27/2024 11:00,2580.0,0.00,44.50,0.00,0.0,5.0,49.50,Cash
49996,2/27/2024 10:15,2/27/2024 10:15,360.0,1.30,6.75,0.00,0.0,0.0,6.75,Cash
49997,2/27/2024 10:15,2/27/2024 10:45,1251.0,11.24,29.75,0.00,0.0,0.0,29.75,Prcard
49998,2/27/2024 10:15,2/27/2024 10:15,172.0,1.54,6.75,0.00,0.0,0.0,6.75,Cash


In [25]:
# Count the missing values in each column of the raw data (isna is the same check as isnull).
trip_df.isna().sum()

Trip Start Timestamp      0
Trip End Timestamp        0
Trip Seconds              5
Trip Miles                0
Fare                    145
Tips                    145
Tolls                   145
Extras                  145
Trip Total              145
Payment Type              0
dtype: int64

In [26]:
# Print the column dtypes, non-null counts and memory usage of trip_df.
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Trip Start Timestamp  50000 non-null  object 
 1   Trip End Timestamp    50000 non-null  object 
 2   Trip Seconds          49995 non-null  float64
 3   Trip Miles            50000 non-null  float64
 4   Fare                  49855 non-null  float64
 5   Tips                  49855 non-null  float64
 6   Tolls                 49855 non-null  float64
 7   Extras                49855 non-null  float64
 8   Trip Total            49855 non-null  float64
 9   Payment Type          50000 non-null  object 
dtypes: float64(7), object(3)
memory usage: 3.8+ MB


In [27]:
# Show the column labels of trip_df, as they are used by name in the cells below.
trip_df.columns

Index(['Trip Start Timestamp', 'Trip End Timestamp', 'Trip Seconds',
       'Trip Miles', 'Fare', 'Tips', 'Tolls', 'Extras', 'Trip Total',
       'Payment Type'],
      dtype='object')

# Data Preprocessing: Handle Missing Values in the Trip Dataset

In [31]:
# Impute missing values in trip_df: numeric columns get their median, text (object)
# columns get their most frequent value.
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on trip_df[column]. The inplace form operates on an
# intermediate object (chained assignment): pandas 2.x emits a FutureWarning for it
# and, with Copy-on-Write enabled, trip_df would silently keep its missing values.
#
# NOTE(review): this fills the 'Fare' column (later used as the regression target)
# and uses statistics computed over all rows, before the train/test split further
# down the notebook — confirm that this is intended.
for column in trip_df.select_dtypes(include=np.number).columns:
    trip_df[column] = trip_df[column].fillna(trip_df[column].median())

for column in trip_df.select_dtypes(include=['object']).columns:
    most_frequent = trip_df[column].mode()
    # mode() is empty when a column holds no non-null values; leave such a column
    # untouched rather than failing on the lookup of its first element.
    if not most_frequent.empty:
        trip_df[column] = trip_df[column].fillna(most_frequent.iloc[0])

In [29]:
# Re-count missing values per column to confirm that the imputation left none behind.
trip_df.isna().sum()

Trip Start Timestamp    0
Trip End Timestamp      0
Trip Seconds            0
Trip Miles              0
Fare                    0
Tips                    0
Tolls                   0
Extras                  0
Trip Total              0
Payment Type            0
dtype: int64

# Convert timestamps to datetime

In [33]:
# Parse both timestamp columns from text into pandas datetimes, then display the frame.
for timestamp_column in ('Trip Start Timestamp', 'Trip End Timestamp'):
    trip_df[timestamp_column] = pd.to_datetime(trip_df[timestamp_column])
trip_df

Unnamed: 0,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Fare,Tips,Tolls,Extras,Trip Total,Payment Type
0,2024-03-01 00:00:00,2024-03-01 00:00:00,15.0,0.09,3.25,0.00,0.0,35.0,38.75,Credit Card
1,2024-03-01 00:00:00,2024-03-01 00:15:00,900.0,3.00,12.00,0.00,0.0,0.0,12.00,Cash
2,2024-03-01 00:00:00,2024-03-01 00:00:00,711.0,5.84,16.75,4.45,0.0,5.0,26.70,Credit Card
3,2024-03-01 00:00:00,2024-03-01 00:30:00,1770.0,13.36,34.75,7.85,0.0,4.0,47.10,Credit Card
4,2024-03-01 00:00:00,2024-03-01 00:15:00,849.0,6.13,18.51,4.31,0.0,0.0,22.82,Mobile
...,...,...,...,...,...,...,...,...,...,...
49995,2024-02-27 10:15:00,2024-02-27 11:00:00,2580.0,0.00,44.50,0.00,0.0,5.0,49.50,Cash
49996,2024-02-27 10:15:00,2024-02-27 10:15:00,360.0,1.30,6.75,0.00,0.0,0.0,6.75,Cash
49997,2024-02-27 10:15:00,2024-02-27 10:45:00,1251.0,11.24,29.75,0.00,0.0,0.0,29.75,Prcard
49998,2024-02-27 10:15:00,2024-02-27 10:15:00,172.0,1.54,6.75,0.00,0.0,0.0,6.75,Cash


# Extract time-based features

In [34]:
# Derive two calendar features from the trip start time:
#   Hour - hour of day of the trip start
#   Day  - day of the week as an integer (Monday = 0 ... Sunday = 6)
trip_start = trip_df['Trip Start Timestamp'].dt
trip_df['Hour'] = trip_start.hour
trip_df['Day'] = trip_start.weekday
trip_df

Unnamed: 0,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Fare,Tips,Tolls,Extras,Trip Total,Payment Type,Hour,Day
0,2024-03-01 00:00:00,2024-03-01 00:00:00,15.0,0.09,3.25,0.00,0.0,35.0,38.75,Credit Card,0,4
1,2024-03-01 00:00:00,2024-03-01 00:15:00,900.0,3.00,12.00,0.00,0.0,0.0,12.00,Cash,0,4
2,2024-03-01 00:00:00,2024-03-01 00:00:00,711.0,5.84,16.75,4.45,0.0,5.0,26.70,Credit Card,0,4
3,2024-03-01 00:00:00,2024-03-01 00:30:00,1770.0,13.36,34.75,7.85,0.0,4.0,47.10,Credit Card,0,4
4,2024-03-01 00:00:00,2024-03-01 00:15:00,849.0,6.13,18.51,4.31,0.0,0.0,22.82,Mobile,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2024-02-27 10:15:00,2024-02-27 11:00:00,2580.0,0.00,44.50,0.00,0.0,5.0,49.50,Cash,10,1
49996,2024-02-27 10:15:00,2024-02-27 10:15:00,360.0,1.30,6.75,0.00,0.0,0.0,6.75,Cash,10,1
49997,2024-02-27 10:15:00,2024-02-27 10:45:00,1251.0,11.24,29.75,0.00,0.0,0.0,29.75,Prcard,10,1
49998,2024-02-27 10:15:00,2024-02-27 10:15:00,172.0,1.54,6.75,0.00,0.0,0.0,6.75,Cash,10,1


# Encode categorical variable 'Payment Type'

In [36]:
# Replace the 'Payment Type' labels with integer codes. The fitted encoder is kept
# in payment_encoder so the codes can be mapped back to the original labels.
# NOTE(review): the integer codes are later fed to a linear model as a numeric
# feature, which treats them as ordered — consider whether that is intended.
payment_encoder = LabelEncoder().fit(trip_df['Payment Type'])
trip_df['Payment Type'] = payment_encoder.transform(trip_df['Payment Type'])
trip_df

Unnamed: 0,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Fare,Tips,Tolls,Extras,Trip Total,Payment Type,Hour,Day
0,2024-03-01 00:00:00,2024-03-01 00:00:00,15.0,0.09,3.25,0.00,0.0,35.0,38.75,1,0,4
1,2024-03-01 00:00:00,2024-03-01 00:15:00,900.0,3.00,12.00,0.00,0.0,0.0,12.00,0,0,4
2,2024-03-01 00:00:00,2024-03-01 00:00:00,711.0,5.84,16.75,4.45,0.0,5.0,26.70,1,0,4
3,2024-03-01 00:00:00,2024-03-01 00:30:00,1770.0,13.36,34.75,7.85,0.0,4.0,47.10,1,0,4
4,2024-03-01 00:00:00,2024-03-01 00:15:00,849.0,6.13,18.51,4.31,0.0,0.0,22.82,3,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2024-02-27 10:15:00,2024-02-27 11:00:00,2580.0,0.00,44.50,0.00,0.0,5.0,49.50,0,10,1
49996,2024-02-27 10:15:00,2024-02-27 10:15:00,360.0,1.30,6.75,0.00,0.0,0.0,6.75,0,10,1
49997,2024-02-27 10:15:00,2024-02-27 10:45:00,1251.0,11.24,29.75,0.00,0.0,0.0,29.75,5,10,1
49998,2024-02-27 10:15:00,2024-02-27 10:15:00,172.0,1.54,6.75,0.00,0.0,0.0,6.75,0,10,1


# Define features and target

In [38]:
# Predictor columns used to model the fare; 'Fare' itself is the regression target.
feature_columns = [
    'Trip Seconds',
    'Trip Miles',
    'Tolls',
    'Extras',
    'Payment Type',
    'Hour',
    'Day',
]
X_trip = trip_df[feature_columns]
y_trip = trip_df['Fare']
y_trip

0         3.25
1        12.00
2        16.75
3        34.75
4        18.51
         ...  
49995    44.50
49996     6.75
49997    29.75
49998     6.75
49999    14.00
Name: Fare, Length: 50000, dtype: float64

# Train-test split 

In [39]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
trip_split = train_test_split(
    X_trip,
    y_trip,
    test_size=0.2,
    random_state=42,
)
X_train_trip, X_test_trip, y_train_trip, y_test_trip = trip_split

# Train linear regression model

In [40]:
# Fit an ordinary least squares model of fare on the training features.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train_trip, y=y_train_trip)

# Predictions

In [41]:
# Predict fares for the held-out test rows and display the resulting array.
y_pred_trip = lin_reg.predict(X=X_test_trip)
y_pred_trip

array([ 8.94605276, 21.94365292, 14.8205309 , ...,  8.34360793,
        9.89858173, 10.94321381])

# Model evaluation

In [42]:
# Score the test-set predictions.
n_test_rows = len(y_test_trip)
n_features = X_trip.shape[1]

mae = mean_absolute_error(y_test_trip, y_pred_trip)
mse = mean_squared_error(y_test_trip, y_pred_trip)
r2 = r2_score(y_test_trip, y_pred_trip)
# Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1), with n test rows and p features.
adj_r2 = 1 - (1 - r2) * (n_test_rows - 1) / (n_test_rows - n_features - 1)

print("Linear Regression Metrics:")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")
print(f"Adjusted R²: {adj_r2}")

Linear Regression Metrics:
MAE: 4.150866756993996
MSE: 113.21011449892086
R²: 0.6334403725012976
Adjusted R²: 0.6331835753243069
