In [387]:
!python -V

Python 3.12.6


In [388]:
import pandas as pd

In [389]:
import pickle

In [390]:
import seaborn as sns
import matplotlib.pyplot as plt

In [391]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import mean_squared_error

In [392]:
# Downloaded the data from this URL: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

df = pd.read_parquet('./data/green_tripdata_2021-01.parquet')

df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

df = df[(df.duration >= 1) & (df.duration <= 60)]

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

df[categorical] = df[categorical].astype(str)

In [393]:
train_dicts = df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred)

96.8019815011275

In [394]:
# sns.displot(y_pred, label='prediction')
# sns.displot(y_train, label='actual')

# plt.legend()

In [395]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    
    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']
    
    df[categorical] = df[categorical].astype(str)

    return df

In [396]:
df_train = read_dataframe('./data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/green_tripdata_2021-02.parquet')

In [397]:
len(df_train), len(df_val)

(73908, 61921)

In [398]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']

In [399]:
df_val

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,PU_DO
0,2,2021-02-01 00:34:03,2021-02-01 00:51:58,N,1.0,130,205,5.0,3.66,14.00,...,10.00,0.0,,0.3,25.30,1.0,1.0,0.00,17.916667,43_151
1,2,2021-02-01 00:04:00,2021-02-01 00:10:30,N,1.0,152,244,1.0,1.10,6.50,...,0.00,0.0,,0.3,7.80,2.0,1.0,0.00,6.500000,166_239
2,2,2021-02-01 00:18:51,2021-02-01 00:34:06,N,1.0,152,48,1.0,4.93,16.50,...,0.00,0.0,,0.3,20.55,2.0,1.0,2.75,15.250000,41_42
3,2,2021-02-01 00:53:27,2021-02-01 01:11:41,N,1.0,152,241,1.0,6.70,21.00,...,0.00,0.0,,0.3,22.30,2.0,1.0,0.00,18.233333,168_75
4,2,2021-02-01 00:57:46,2021-02-01 01:06:44,N,1.0,75,42,1.0,1.89,8.50,...,2.45,0.0,,0.3,12.25,1.0,1.0,0.00,8.966667,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64567,2,2021-02-28 22:19:00,2021-02-28 22:29:00,,,129,7,,2.63,10.04,...,0.00,0.0,,0.3,10.34,,,,10.000000,74_50
64568,2,2021-02-28 23:18:00,2021-02-28 23:27:00,,,116,166,,1.87,8.33,...,1.89,0.0,,0.3,10.52,,,,9.000000,142_212
64569,2,2021-02-28 23:44:00,2021-02-28 23:58:00,,,74,151,,2.40,12.61,...,0.00,0.0,,0.3,12.91,,,,14.000000,95_196
64570,2,2021-02-28 23:07:00,2021-02-28 23:14:00,,,42,42,,1.11,11.95,...,0.00,0.0,,0.3,15.00,,,,7.000000,80_166


In [400]:
categorical = ['PU_DO'] # 'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [401]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [402]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred)

234.5044945995461

In [403]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [404]:
lr = Lasso(alpha=0.1)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred)

149.019967843855

In [405]:
with open('models/lasso_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [406]:
lr = Ridge(alpha=0.1)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred)

236.85666683313565

In [407]:
with open('models/ridge_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)