### Reading the data for January 2023 and determining the number of columns

In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error 

In [2]:
# Load January dataset
df_jan = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
df_jan.head()

# Determining the number of columns
jan_columns = len(df_jan.columns)
print(f"Number of columns in January dataset: {jan_columns}")

Number of columns in January dataset: 19


### Computing the duration variable and its standard deviation

In [3]:
# Computing the duration variable - difference between pickup & dropoff instances
df_jan["tpep_pickup_datetime"] = pd.to_datetime(df_jan['tpep_pickup_datetime'])
df_jan["tpep_dropoff_datetime"] = pd.to_datetime(df_jan['tpep_dropoff_datetime'])

df_jan["duration_minutes"] = (df_jan["tpep_dropoff_datetime"] - df_jan["tpep_pickup_datetime"]).dt.total_seconds()/60

# Standard deviation of trip duration
standard_dev = df_jan["duration_minutes"].std()
print(f"The standard deviation of the trips duration in January: {standard_dev:.2f} minutes")

The standard deviation of the trips duration in January: 42.59 minutes


### Removing outliers and computing the fraction of records left

In [4]:
# Filtering outliers and keeping only records where the duration was between 1 and 60 minutes (inclusive)
df_jan_filter = df_jan[(df_jan["duration_minutes"] >= 1) & (df_jan["duration_minutes"] <= 60)]
df_jan_rem = len(df_jan_filter)/len(df_jan)
print(f"Fraction of records left after dropping the outliers: {df_jan_rem:.0%}")

Fraction of records left after dropping the outliers: 98%


### One-hot encoding pickup and dropoff location IDs and creating feature matrix

In [5]:
# Create a list of dictionaries
records = df_jan_filter[["PULocationID", "DOLocationID"]].to_dict(orient="records")

# Fit a dictionary vectorizer
vectorizer = DictVectorizer(sparse=False)
feature_matrix = vectorizer.fit_transform(records)

# Get the dimensionality (number of columns) of the feature matrix
num_features = feature_matrix.shape[1]
print(f"Dimensionality of this matrix (number of columns): {num_features} columns")

Dimensionality of this matrix (number of columns): 2 columns


### Training linear regression model and calculating RMSE on training data

In [6]:
# Training a Linear Regression Model
X_train = feature_matrix
y_train = df_jan_filter["duration_minutes"]

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Calculating RMSE on the training data
y_pred_train = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print(f"RMSE on training data: {rmse_train:.2f}")

RMSE on training data: 9.84


### Applying the model to the February 2023 dataset and calculating RMSE on validation

In [7]:
# Load February dataset
df_feb = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

df_feb.head()

# Convert pickup and dropoff datetime to datetime objects
df_feb["tpep_pickup_datetime"] = pd.to_datetime(df_jan['tpep_pickup_datetime'])
df_feb["tpep_dropoff_datetime"] = pd.to_datetime(df_jan['tpep_dropoff_datetime'])
df_feb["duration_minutes"] = (df_jan["tpep_dropoff_datetime"] - df_jan["tpep_pickup_datetime"]).dt.total_seconds()/60

# Filtering outliers in February dataset
df_feb_filter = df_feb[(df_jan["duration_minutes"] >= 1) & (df_jan["duration_minutes"] <= 60)]
df_feb_rem = len(df_feb_filter)/len(df_feb)

# One-hot encode and get feature matrix for February dataset

# Create a list of dictionaries
records = df_feb_filter[["PULocationID", "DOLocationID"]].to_dict(orient="records")

# Fit a dictionary vectorizer
vectorizer = DictVectorizer(sparse=False)
feature_matrix = vectorizer.fit_transform(records)

# Initialize the model
X_val = feature_matrix
y_val = df_feb_filter["duration_minutes"]

model = LinearRegression()
model.fit(X_val, y_val)

# Calculate RMSE on the training data
y_pred_train = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_pred_train, squared=False)
print(f"RMSE on validation data: {rmse_val:.2f}")

  df_feb_filter = df_feb[(df_jan["duration_minutes"] >= 1) & (df_jan["duration_minutes"] <= 60)]


RMSE on validation data: 9.93
