In [9]:
import os
import urllib.request
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [11]:
# Local file name of the trip-data parquet; the download URL below is built
# from it so the name is written in exactly one place.
filename = "yellow_tripdata_2023-01.parquet"
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/" + filename

In [13]:
# Download the parquet file only when it is not already on disk, so the
# notebook survives "Restart Kernel -> Run All" on a fresh checkout while
# repeated runs reuse the local copy instead of hitting the network again.
if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

# Read through `filename` rather than a repeated literal so the path has a
# single source of truth.
df = pd.read_parquet(filename)

print(f"Taxi dataset : {len(df.columns)} columns")

Taxi dataset : 19 columns


In [17]:
# Trip duration in minutes: drop-off minus pick-up gives a timedelta column,
# which the vectorized `.dt.total_seconds()` accessor converts in one pass
# (a per-row Python `apply` does the same work far more slowly).
trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration'] = trip_time.dt.total_seconds() / 60

# Row count before any filtering; the outlier-removal cell compares against it.
len1 = len(df)
print(f"Duration : {df.duration.std():.2f} minutes")

Duration : 42.59 minutes


In [19]:
# Keep only trips lasting between 1 and 60 minutes; `between` includes both
# endpoints, matching `>= 1` and `<= 60`.
df = df.loc[df['duration'].between(1, 60)]
len2 = len(df)

# Report how much data the filter removed relative to the pre-filter count.
print(f"Removed {len1 - len2} rows with duration outside [1, 60]")
print(f"Dropped {1 - len2 / len1:.2%} of rows. {len2/len1:.2%} left")

Removed 57593 rows with duration outside [1, 60]
Dropped 1.88% of rows. 98.12% left


In [21]:
# Positional 80/20 split: the first 80% of rows become the training set and
# the remainder the validation set. Copies are taken so later column
# assignments do not write into slices of `df`.
split_at = int(len(df) * 0.8)
df_train = df.iloc[:split_at].copy()
df_val = df.iloc[split_at:].copy()

In [23]:
# Feature columns, grouped by how they are treated downstream.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Cast the categorical columns to strings in both splits, applying the
# identical conversion to train and validation.
for frame in (df_train, df_val):
    frame[categorical] = frame[categorical].astype(str)

In [25]:
# One dict per row, holding only the model's feature columns.
feature_columns = categorical + numerical
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the learned
# feature mapping for validation so both matrices share the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

print("Feature matrix :", X_train.shape)

Feature matrix : (2407338, 514)


In [27]:
# Regression targets: trip duration for each split, as plain arrays.
y_train = df_train['duration'].to_numpy()
y_val = df_val['duration'].to_numpy()

In [29]:
# Ordinary least-squares model with default settings, fitted on the
# training features and durations.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [31]:
# Predict on both splits: `y_pred` holds the training-set predictions and
# `y_pred2` the validation-set predictions.
y_pred, y_pred2 = map(lr.predict, (X_train, X_val))

In [33]:
# RMSE is taken as the square root of the plain MSE. The `squared=False`
# shortcut of `mean_squared_error` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so passing it raises a TypeError on current releases;
# this form runs on every version.
train_error = mean_squared_error(y_train, y_pred) ** 0.5
val_error = mean_squared_error(y_val, y_pred2) ** 0.5

print(f"Train RMSE: {train_error:.2f} minutes")
print(f"Validation RMSE: {val_error:.2f} minutes")

Train RMSE: 7.59 minutes
Validation RMSE: 8.55 minutes


