In [7]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
import numpy as np

# Step 1: Read the dataset from the provided URL
url_january = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
df_january = pd.read_parquet(url_january)
num_columns = df_january.shape[1]
print(f'Number of columns in the January dataset: {num_columns}')

# Step 2: Compute the duration in minutes
df_january['tpep_pickup_datetime'] = pd.to_datetime(df_january['tpep_pickup_datetime'])
df_january['tpep_dropoff_datetime'] = pd.to_datetime(df_january['tpep_dropoff_datetime'])
df_january['duration'] = (df_january['tpep_dropoff_datetime'] - df_january['tpep_pickup_datetime']).dt.total_seconds() / 60

# Step 3: Drop outliers (duration between 1 and 60 minutes)
filtered_df = df_january[(df_january['duration'] >= 1) & (df_january['duration'] <= 60)]

# Fraction of records remaining after dropping outliers
fraction_remaining = len(filtered_df) / len(df_january)
print(f'Fraction of records remaining: {fraction_remaining:.2%}')

# Step 4: One-hot encoding for PULocationID and DOLocationID
filtered_df['PULocationID'] = filtered_df['PULocationID'].astype(str)
filtered_df['DOLocationID'] = filtered_df['DOLocationID'].astype(str)
dicts = filtered_df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# Fit a dictionary vectorizer with sparse output
dv = DictVectorizer(sparse=True)
X = dv.fit_transform(dicts)

# Print the dimensionality of the feature matrix
dimensionality = X.shape[1]
print(f'Dimensionality of the feature matrix: {dimensionality}')

# Step 5: Train a linear regression model using a sparse matrix
y_train = filtered_df['duration'].values
model = LinearRegression()
model.fit(X, y_train)

# Step 6: Calculate the RMSE on the training data
y_train_pred = model.predict(X)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f'RMSE on training data: {rmse_train:.2f}')

# Step 7: Read the February dataset and apply the same transformations
url_february = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
df_february = pd.read_parquet(url_february)
df_february['tpep_pickup_datetime'] = pd.to_datetime(df_february['tpep_pickup_datetime'])
df_february['tpep_dropoff_datetime'] = pd.to_datetime(df_february['tpep_dropoff_datetime'])
df_february['duration'] = (df_february['tpep_dropoff_datetime'] - df_february['tpep_pickup_datetime']).dt.total_seconds() / 60
filtered_df_feb = df_february[(df_february['duration'] >= 1) & (df_february['duration'] <= 60)]
filtered_df_feb['PULocationID'] = filtered_df_feb['PULocationID'].astype(str)
filtered_df_feb['DOLocationID'] = filtered_df_feb['DOLocationID'].astype(str)
dicts_feb = filtered_df_feb[['PULocationID', 'DOLocationID']].to_dict(orient='records')
X_val = dv.transform(dicts_feb)
y_val = filtered_df_feb['duration'].values

# Step 8: Predict on the validation data and calculate the RMSE
y_val_pred = model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f'RMSE on validation data: {rmse_val:.2f}')


Number of columns in the January dataset: 19
Fraction of records remaining: 98.12%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['PULocationID'] = filtered_df['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['DOLocationID'] = filtered_df['DOLocationID'].astype(str)


Dimensionality of the feature matrix: 515
RMSE on training data: 7.65


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_feb['PULocationID'] = filtered_df_feb['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_feb['DOLocationID'] = filtered_df_feb['DOLocationID'].astype(str)


RMSE on validation data: 7.81
