In [1]:
import math
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# Folder holding the FHV trip parquet files, relative to this notebook.
# File names are appended with plain string concatenation, so the trailing
# slash must stay.
DATA_DIR = "../data/"

In [3]:
%%time
# Training set: FHV trips for January 2021.
df = pd.read_parquet(DATA_DIR + 'fhv_tripdata_2021-01.parquet')
print(f'Q1. Shape: {df.shape}')

# Trip duration in minutes. The vectorised .dt accessor replaces a per-row
# Python lambda over every trip.
df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60
print(f'Q2. Average duration: {df.duration.mean():.2f}')

# NOTE(review): replace() only rewrites the literal string 'nan'; real NaN
# values are untouched (the missing share printed below is measured after
# these calls). Missing IDs therefore reach the model as the string 'nan'
# produced by astype(str) further down. Presumably fillna was intended, but
# changing the label here must be mirrored in the validation cell, otherwise
# the fitted DictVectorizer will not recognise it - left unchanged on purpose.
df['PUlocationID'] = df['PUlocationID'].replace({'nan': '-1'})
df['DOlocationID'] = df['DOlocationID'].replace({'nan': '-1'})
print(f"Q3. Missing values: {df['PUlocationID'].isna().sum() / df.shape[0] * 100:.2f}%")

# Keep trips lasting between 1 and 60 minutes (inclusive). The explicit
# copy() makes df an independent frame, so the column assignment below does
# not write into a slice of the unfiltered data (SettingWithCopyWarning).
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

categorical = ['PUlocationID', 'DOlocationID']
numerical = []

# Location IDs are labels, not quantities: cast to str so that they are
# treated as categories when vectorised.
df[categorical] = df[categorical].astype(str)

Q1. Shape: (1154112, 7)
Q2. Average duration: 19.17
Q3. Missing values: 83.03%
CPU times: user 6.45 s, sys: 179 ms, total: 6.63 s
Wall time: 6.51 s


In [4]:
%%time
# One {column: value} record per trip, the input format DictVectorizer expects.
train_dicts = df[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training records; the same fitted instance has to
# be reused (transform only) for any other dataset.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
print(f'Q4. Features: {X_train.shape}')

target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root explicitly works on every version.
print(f"Q5. RMSE: {math.sqrt(mean_squared_error(y_train, y_pred))}")

Q4. Features: (1109826, 525)
Q5. RMSE: 10.528519427219633
CPU times: user 11.8 s, sys: 4.38 s, total: 16.1 s
Wall time: 6.44 s


In [5]:
%%time
# Validation set: FHV trips for February 2021, prepared with the same steps
# as the training data so both are encoded identically.
df_val = pd.read_parquet(DATA_DIR + 'fhv_tripdata_2021-02.parquet')

# Trip duration in minutes, computed with the vectorised .dt accessor instead
# of a per-row Python lambda.
df_val['duration'] = (df_val['dropOff_datetime'] - df_val['pickup_datetime']).dt.total_seconds() / 60

# NOTE(review): replace() only rewrites the literal string 'nan'. If these
# columns hold real NaN values (presumably floats - confirm), this is a no-op
# and missing IDs become the string 'nan' through astype(str) below. Kept
# unchanged so the labels match those the DictVectorizer was fitted on.
df_val['PUlocationID'] = df_val['PUlocationID'].replace({'nan': '-1'})
df_val['DOlocationID'] = df_val['DOlocationID'].replace({'nan': '-1'})

# Keep trips lasting between 1 and 60 minutes (inclusive). copy() detaches
# the result from the unfiltered frame so the assignment below does not
# write into a slice (SettingWithCopyWarning).
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()
df_val[categorical] = df_val[categorical].astype(str)

CPU times: user 5.87 s, sys: 143 ms, total: 6.02 s
Wall time: 5.84 s


In [6]:
# Despite the name, these are the validation records (one dict per trip).
train_dicts_val = df_val[categorical + numerical].to_dict(orient='records')
# transform only: reuse the vectorizer fitted on the training data so the
# feature columns line up with what the model was trained on.
X_val = dv.transform(train_dicts_val)

y_val = df_val[target].values

y_val_pred = lr.predict(X_val)

# RMSE computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root explicitly works on every version.
print(f"Q6. RMSE: {math.sqrt(mean_squared_error(y_val, y_val_pred))}")

Q6. RMSE: 11.01428685575068
