In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Resolve the dataset location relative to the notebook's working directory.
current_path = Path.cwd()
print(f"Current path: {current_path}")
# Join the components with "/" instead of embedding a backslash in the string:
# "\y" is an invalid escape sequence (newer Python versions warn about it) and
# a literal backslash is not a path separator on POSIX systems.
data_path = current_path / "Data" / "yellow_tripdata_2023-01.parquet"

Current path: d:\mLOPS\2024-MLOPS-ZOOMCAMP\HW1


In [3]:
# Read the parquet file at data_path into a DataFrame and report how many
# columns it has.
df_jan = pd.read_parquet(data_path)
print("The total columns in January datasets :", df_jan.shape[1])


The total columns in January datasets : 19


In [4]:
# Trip duration as a time difference: drop-off timestamp minus pick-up timestamp.
# Note: this adds a new column to df_jan in place.
df_jan["duration_datetime"] = df_jan["tpep_dropoff_datetime"] - df_jan["tpep_pickup_datetime"]

In [5]:
# Express the duration in minutes (total seconds / 60); stored as a second
# derived column on df_jan.
df_jan['trip_duration_minutes'] = df_jan['duration_datetime'].dt.total_seconds() / 60

In [6]:
# Standard deviation of the trip duration over all loaded rows, i.e. computed
# before any outlier filtering is applied.
trip_duration_std = df_jan['trip_duration_minutes'].std()
print(f"Standard Deviation of Trip Durations (in minutes): {trip_duration_std:.2f}")

Standard Deviation of Trip Durations (in minutes): 42.59


In [7]:
# Keep only the trips that lasted between 1 and 60 minutes (both bounds inclusive).
filtered_df = df_jan.loc[
    (df_jan["trip_duration_minutes"] >= 1)
    & (df_jan["trip_duration_minutes"] <= 60)
]

In [8]:
# Share of the original rows that survive the duration filter.
fraction_left = filtered_df.shape[0] / df_jan.shape[0]

# Report the share as a percentage
print(f"Fraction of records left after dropping outliers: {fraction_left:.2%}")

Fraction of records left after dropping outliers: 98.12%


In [9]:
# Inspect the available columns, including the two derived duration columns.
filtered_df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee',
       'duration_datetime', 'trip_duration_minutes'],
      dtype='object')

In [10]:
# Pick-up and drop-off location IDs, cast to strings before they are turned
# into per-row dicts for the vectorizer.
features = filtered_df[['PULocationID', 'DOLocationID']].astype('str')

In [11]:
# One dict per trip, keyed by column name ('PULocationID', 'DOLocationID').
data_dicts = features.to_dict(orient='records')

# Fit a DictVectorizer on the training dicts and encode them in one step.
# The fitted vectorizer is kept in a module-level name because the validation
# feature extraction further down reuses it via .transform().
dict_vectorizer = DictVectorizer()
feature_matrix=dict_vectorizer.fit_transform(data_dicts)

# Display the feature matrix
# NOTE(review): printing the matrix lists individual (row, column) entries for
# millions of rows; feature_matrix.shape would be a more compact summary.
print(feature_matrix)

  (0, 43)	1.0
  (0, 325)	1.0
  (1, 148)	1.0
  (1, 456)	1.0
  (2, 149)	1.0
  (2, 461)	1.0
  (3, 227)	1.0
  (3, 299)	1.0
  (4, 237)	1.0
  (4, 266)	1.0
  (5, 38)	1.0
  (5, 325)	1.0
  (6, 45)	1.0
  (6, 409)	1.0
  (7, 108)	1.0
  (7, 304)	1.0
  (8, 147)	1.0
  (8, 328)	1.0
  (9, 6)	1.0
  (9, 303)	1.0
  (10, 225)	1.0
  (10, 404)	1.0
  (11, 178)	1.0
  (11, 494)	1.0
  (12, 45)	1.0
  :	:
  (3009160, 328)	1.0
  (3009161, 155)	1.0
  (3009161, 494)	1.0
  (3009162, 203)	1.0
  (3009162, 306)	1.0
  (3009163, 50)	1.0
  (3009163, 325)	1.0
  (3009164, 242)	1.0
  (3009164, 401)	1.0
  (3009165, 54)	1.0
  (3009165, 326)	1.0
  (3009166, 64)	1.0
  (3009166, 482)	1.0
  (3009167, 46)	1.0
  (3009167, 401)	1.0
  (3009168, 203)	1.0
  (3009168, 266)	1.0
  (3009169, 233)	1.0
  (3009169, 271)	1.0
  (3009170, 150)	1.0
  (3009170, 273)	1.0
  (3009171, 237)	1.0
  (3009171, 400)	1.0
  (3009172, 45)	1.0
  (3009172, 435)	1.0


In [12]:
# Number of columns the vectorizer produced for the two location-ID features.
print("The dimensionality of this matrix (number of columns) : ",feature_matrix.shape[1])

The dimensionality of this matrix (number of columns) :  515


In [13]:
# Regression target: trip duration in minutes for the filtered trips, taken
# as a plain array aligned row-for-row with feature_matrix.
target = filtered_df['trip_duration_minutes'].values

In [14]:
# Peek at the target values.
target

array([ 8.43333333,  6.31666667, 12.75      , ..., 24.51666667,
       13.        , 14.4       ])

In [15]:
# Fit a linear regression on the encoded training features (fit() returns the
# estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(feature_matrix, target)

# In-sample predictions, used below to measure the training error.
predictions = model.predict(feature_matrix)

In [16]:
# Peek at the predictions made on the training data.
predictions

array([11.5271505 , 10.89768796, 11.32570713, ..., 11.73770536,
       12.70523327, 11.54227397])

In [17]:


# Calculate the mean squared error (MSE) on the training data; the RMSE is
# derived from it in the following cell.
mse = mean_squared_error(target, predictions)

# Display the MSE
print(f"Mean Squared Error (MSE) on the training data: {mse:.2f}")

Mean Squared Error (MSE) on the training data: 58.51


In [18]:
# RMSE is the square root of the MSE, which puts the error back in the
# target's unit (minutes).
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) on the training data: {rmse:.2f}")

Root Mean Squared Error (RMSE) on the training data: 7.65


## Validation set 

In [19]:
def data_preprocessing(df, min_minutes=1, max_minutes=60):
    """Compute trip durations and keep only trips within a duration range.

    The input frame is left untouched; the derived columns are added to the
    returned (filtered) copy only.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records with ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` datetime columns.
    min_minutes, max_minutes : float, optional
        Inclusive bounds on the trip duration in minutes. Trips outside
        ``[min_minutes, max_minutes]`` are dropped. Defaults: 1 and 60.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose duration is within the bounds, with two
        extra columns appended: ``duration_datetime`` (drop-off minus
        pick-up) and ``trip_duration_minutes`` (the same value in minutes).

    Raises
    ------
    KeyError
        If either timestamp column is missing from ``df``.
    """
    duration = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    minutes = duration.dt.total_seconds() / 60

    # Rows with a missing timestamp yield NaN minutes and compare False on
    # both sides, so they are dropped together with the outliers.
    keep = (minutes >= min_minutes) & (minutes <= max_minutes)

    # assign() works on a copy, so the caller's frame is never mutated and the
    # result can be modified later without chained-assignment warnings.
    filtered_df = df[keep].assign(
        duration_datetime=duration[keep],
        trip_duration_minutes=minutes[keep],
    )
    return filtered_df

In [20]:
def features_extraction(df, vectorizer=None):
    """Encode the location-ID features of ``df`` and extract the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed trips containing ``PULocationID``, ``DOLocationID`` and
        ``trip_duration_minutes`` columns.
    vectorizer : object with a ``transform(list_of_dicts)`` method, optional
        An already-fitted vectorizer. When omitted, the notebook-level
        ``dict_vectorizer`` is used, so existing calls keep working.

    Returns
    -------
    tuple
        ``(feature_matrix, target)`` where ``feature_matrix`` is whatever
        ``vectorizer.transform`` returns for the per-row dicts and ``target``
        is the array of ``trip_duration_minutes`` values.
    """
    if vectorizer is None:
        # Fall back to the vectorizer defined at notebook level; it must
        # already be fitted, because only transform() is called here.
        vectorizer = dict_vectorizer

    # Cast the IDs to strings before building the per-row dicts, mirroring how
    # the training features were prepared.
    features = df[['PULocationID', 'DOLocationID']].astype('str')
    data_dicts = features.to_dict(orient='records')
    feature_matrix = vectorizer.transform(data_dicts)
    target = df['trip_duration_minutes'].values
    return feature_matrix, target

    
    

In [21]:
# Load the February file as the validation set and run it through the same
# preprocessing and feature-extraction steps.
# The path is built from separate components: a backslash inside the string
# literal ("\y") is an invalid escape sequence and is not a path separator on
# POSIX systems.
data_path = current_path / "Data" / "yellow_tripdata_2023-02.parquet"
df_val = pd.read_parquet(data_path)
df_val_filtered = data_preprocessing(df_val)
X_test,y_test = features_extraction(df_val_filtered)


    

In [22]:
# Sanity check: the validation matrix should have the same number of columns
# as the training feature matrix, since both come from the same vectorizer.
X_test.shape

(2855951, 515)

In [23]:
# Score the previously fitted model on the validation features.
# Note: this overwrites the `predictions` and `rmse` values computed for the
# training data in earlier cells.
predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Root Mean Squared Error (RMSE) on the validation data: {rmse:.2f}")

Root Mean Squared Error (RMSE) on the validation data: 7.81
