In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the two monthly trip files (2023-01 and 2023-02, per the file names)
# into separate frames using the pyarrow parquet engine.
df_1 = pd.read_parquet(
    path='data/yellow_tripdata_2023-01.parquet',
    engine='pyarrow',
)
df_2 = pd.read_parquet(
    path='data/yellow_tripdata_2023-02.parquet',
    engine='pyarrow',
)

In [3]:
# Question 1
# Column count of the first (2023-01) frame.
n_columns = len(df_1.columns)
print(f"Number of columns: {n_columns}")

Number of columns: 19


In [4]:
# Add a trip_duration column (dropoff minus pickup, expressed in minutes) to
# both frames; Questions 2 and 3 are answered from it.
for trips in (df_1, df_2):
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['trip_duration'] = elapsed.dt.total_seconds() / 60

In [5]:
# Question 2
# Spread of the unfiltered durations (minutes) in the first frame.
january_duration_std = df_1['trip_duration'].std()
print(f"Standard deviation of the trips duration in January: {january_duration_std:.2f}")

Standard deviation of the trips duration in January: 42.59


In [6]:
# Question 3
# Boolean mask of df_1 trips lasting between 1 and 60 minutes (both bounds inclusive).
jan_in_range = (df_1['trip_duration'] >= 1) & (df_1['trip_duration'] <= 60)

# Convert to percent *before* rounding. Rounding first and multiplying after,
# as in round(x, 2) * 100, can surface binary floating-point noise
# (e.g. 0.57 * 100 evaluates to 56.99999999999999).
res = round(100 * int(jan_in_range.sum()) / len(df_1), 0)
print(f"Fraction of trips that are between 1 and 60 minutes: {res} %")

# Replace out-of-range durations with NaN in both frames. Series.where keeps the
# full index and states the intent directly, instead of assigning a filtered
# copy back to the column and relying on index alignment to produce the NaNs.
df_1['trip_duration'] = df_1['trip_duration'].where(jan_in_range)

feb_in_range = (df_2['trip_duration'] >= 1) & (df_2['trip_duration'] <= 60)
df_2['trip_duration'] = df_2['trip_duration'].where(feb_in_range)

Fraction of trips that are between 1 and 60 minutes: 98.0 %


In [7]:
# set up for question 4/5/6
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Encoder that turns per-row dicts into a sparse feature matrix, and the
# regression model trained on that matrix in a later cell.
vectorizer = DictVectorizer(sparse=True)
model = LinearRegression()

FEATURE_COLUMNS = ["PULocationID", "DOLocationID"]
TARGET_COLUMN = "trip_duration"


def _prepare_frame(trips):
    """Build the modelling frame for one month of trips.

    Selects the two location-ID columns and the target, drops every row with a
    missing value (this removes the durations blanked to NaN earlier), and
    casts the location IDs to strings so DictVectorizer one-hot encodes them
    rather than treating them as numeric features.

    The result is a new DataFrame: nothing is modified in place, which avoids
    calling dropna(inplace=True) on a slice of the source frame and avoids
    writing strings into a numeric column through .loc.

    Parameters
    ----------
    trips : pandas.DataFrame
        Frame containing PULocationID, DOLocationID and trip_duration.

    Returns
    -------
    pandas.DataFrame
        Copy restricted to the three columns, without missing rows, with
        string-typed location IDs.
    """
    return (
        trips[FEATURE_COLUMNS + [TARGET_COLUMN]]
        .dropna()
        .astype({column: str for column in FEATURE_COLUMNS})
    )


# df_1 is used for fitting, df_2 for evaluation.
df_train = _prepare_frame(df_1)
df_test = _prepare_frame(df_2)

x_train, y_train = df_train.drop(columns=TARGET_COLUMN), df_train[TARGET_COLUMN]
x_test, y_test = df_test.drop(columns=TARGET_COLUMN), df_test[TARGET_COLUMN]

# The feature vocabulary is learned from the training records only ...
train_dict = x_train.to_dict(orient='records')
x_train_encoded = vectorizer.fit_transform(train_dict)

# ... and re-used, without refitting, to encode the test records.
test_dict = x_test.to_dict(orient='records')
x_test_encoded = vectorizer.transform(test_dict)

In [8]:
# Question 4
# Width of the encoded training matrix, i.e. the number of features the
# vectorizer produced.
n_features = x_train_encoded.shape[1]
print(f"Number of features: {n_features}")

Number of features: 515


In [9]:
# Fit the regression on the encoded training features against trip_duration.
model.fit(X=x_train_encoded, y=y_train)

In [10]:
# Question 5 and 6
def rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Predict on both encoded matrices with the already-fitted model.
y_pred_train, y_pred_test = (
    model.predict(features) for features in (x_train_encoded, x_test_encoded)
)

# Error on the data the model was fitted on.
train_rmse = rmse(y_train, y_pred_train)
print(f"Train RMSE: {train_rmse:.2f}")

# Error on the frame derived from df_2, which the model did not see.
test_rmse = rmse(y_test, y_pred_test)
print(f"Test RMSE: {test_rmse:.2f}")

Train RMSE: 7.65
Test RMSE: 7.81
