In [55]:
!python -V

Python 3.9.7


In [56]:
import numpy as np
import pandas as pd

In [57]:
import pickle

In [58]:
import seaborn as sns
import matplotlib.pyplot as plt

In [59]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [60]:
# Load the January 2021 FHV trip records and remember the raw row count so the
# number of rows removed by later filtering can be reported.
df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')
number_of_rides = df.shape[0]
print(number_of_rides)

1154112


In [61]:
# Trip duration in minutes: drop-off minus pickup gives a Timedelta per row,
# which is then converted to seconds and divided by 60.
trip_length = df['dropOff_datetime'] - df['pickup_datetime']
df['duration'] = trip_length.apply(lambda delta: delta.total_seconds() / 60)

In [62]:
# Average trip duration (minutes) before any outlier filtering
df['duration'].mean()

19.1672240937939

In [63]:
# Summary statistics of the unfiltered durations (minutes)
df['duration'].describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

In [64]:
# Keep only rides lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() detaches the result from the unfiltered frame, so the column
# assignments made further down write to an independent DataFrame instead of
# a slice (which can raise SettingWithCopyWarning).
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

# Report how many rides the duration filter removed.
number_of_rides_wo_outliers = len(df)
rides_dropped = number_of_rides - number_of_rides_wo_outliers
print(rides_dropped)

44286


In [66]:
# Columns used as categorical model features
categorical = ['PUlocationID', 'DOlocationID']

# Fraction of rows with a missing value in each of those columns
df[categorical].isna().sum().div(len(df))

PUlocationID    0.835273
DOlocationID    0.133270
dtype: float64

In [68]:
# Replace missing location IDs with -1, then store the IDs as strings
location_ids = df[categorical].fillna(-1)
df[categorical] = location_ids.astype(str)

In [70]:
# Target vector: trip duration in minutes
target = 'duration'
y_train = df[target].to_numpy()

# Feature matrix: each row becomes a {column: value} dict that the
# DictVectorizer fits on and transforms into a sparse matrix.
train_dicts = df[categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [71]:
# Display the vectorized training matrix (shape, dtype, stored elements)
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [72]:
# Fit an ordinary least-squares model on the vectorized location IDs
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE on the training data. The square root is taken explicitly instead of
# passing `squared=False`, a keyword that newer scikit-learn releases
# deprecate and then remove.
np.sqrt(mean_squared_error(y_train, y_pred))

10.52851910721103

In [73]:
def read_dataframe(filename):
    """Load a file of FHV trip records and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file containing the columns
        ``pickup_datetime``, ``dropOff_datetime``, ``PUlocationID`` and
        ``DOlocationID``.

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration lies between 1 and 60 minutes
        (inclusive), with an added ``duration`` column in minutes and the two
        location-ID columns converted to strings (missing IDs are filled
        with -1 before the conversion).

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # Parse the same two timestamp columns that the duration computation
        # below reads, so the CSV and parquet branches agree on the schema.
        df['dropOff_datetime'] = pd.to_datetime(df['dropOff_datetime'])
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail with a clear message instead of hitting an unbound `df` below.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected a .csv or .parquet file"
        )

    # Trip duration in minutes
    df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)

    # Keep rides between 1 and 60 minutes; .copy() so the column assignment
    # below writes to an independent frame rather than a slice of `df`.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].fillna(-1).astype(str)

    return df

In [74]:
# January 2021 is the training month, February 2021 the validation month
train_path = './data/fhv_tripdata_2021-01.parquet'
val_path = './data/fhv_tripdata_2021-02.parquet'
df_train, df_val = read_dataframe(train_path), read_dataframe(val_path)

In [75]:
# Row counts of the prepared training and validation frames
df_train.shape[0], df_val.shape[0]

(1109826, 990113)

In [79]:
# Turn the categorical columns of both frames into lists of row dicts
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted
# vectorizer to transform the validation rows.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)

# RMSE on the training data. The square root is taken explicitly instead of
# passing `squared=False`, a keyword that newer scikit-learn releases
# deprecate and then remove.
np.sqrt(mean_squared_error(y_train, y_pred_train))

10.52851910721103

In [82]:
# RMSE on the validation month. The square root is taken explicitly instead
# of passing `squared=False`, a keyword that newer scikit-learn releases
# deprecate and then remove.
y_pred_val = lr.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

# Candidate answers to compare the RMSE against; converted to an array so the
# element-wise difference does not depend on `rmse` being a NumPy scalar.
answers = [7.85, 12.85, 17.85, 22.85]
print(rmse)
print(rmse - np.array(answers))

11.014283190951092
[  3.16428319  -1.83571681  -6.83571681 -11.83571681]
