In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def read_data(path: str) -> pd.DataFrame:
    """Load a taxi-trip parquet file and prepare it for modelling.

    Steps performed:
      1. Read the parquet file at ``path`` and print its shape.
      2. Add a ``duration`` column: drop-off time minus pick-up time, in minutes.
      3. Display summary statistics of ``duration`` and print the percentage of
         rows whose duration lies in the inclusive range [1, 60] minutes.
      4. Keep only those rows and cast the location-ID columns to ``str``.

    Parameters
    ----------
    path : str
        Location handed straight to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        The filtered frame, with the extra ``duration`` column and with
        ``PULocationID`` / ``DOLocationID`` converted to strings.

    Notes
    -----
    ``display`` is not imported here; it is the IPython built-in, so this
    function is meant to be called from a notebook session.
    """
    df = pd.read_parquet(path)
    print(f'Dataframe shape {df.shape}')

    # target dimension: trip duration in minutes.
    # The vectorised `.dt` accessor replaces a per-row Python lambda, which is
    # needlessly slow on multi-million-row frames and yields the same values.
    trip_time = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60
    display(df["duration"].describe())

    # outliers removal: build the mask once, report the share of rows kept,
    # then filter with it.
    in_range = df["duration"].between(1, 60)
    print(in_range.sum()/df.shape[0]*100)
    # Explicit copy so the column assignment below writes to an independent
    # frame instead of a slice of the original (avoids SettingWithCopyWarning).
    df = df[in_range].copy()

    # features: location IDs are treated as categories, hence the str cast
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# Two consecutive monthly trip files: 2023-01 is loaded into df1, 2023-02 into df2.
jan_url, feb_url = (
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
)
df1 = read_data(jan_url)
df2 = read_data(feb_url)

Dataframe shape (3066766, 19)


count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

98.1220282212598
Dataframe shape (2913955, 19)


count    2.913955e+06
mean     1.601591e+01
std      4.284210e+01
min     -4.361667e+01
25%      7.250000e+00
50%      1.180000e+01
75%      1.876667e+01
max      7.053617e+03
Name: duration, dtype: float64

98.00944077722545


In [4]:
# The pick-up and drop-off location IDs are the only model inputs.
categorical = ['PULocationID', 'DOLocationID']
dv = DictVectorizer()

# One {column: value} dict per trip; fit the vectorizer on them and encode in one step.
train_dicts = df1.loc[:, categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Number of feature columns the vectorizer learned from df1.
len(dv.feature_names_)

515

In [5]:
target = 'duration'
y_train = df1[target].to_numpy()

# fit() returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(X_train, y_train)

# RMSE of the model on the same data it was fitted on.
y_pred = lr.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(train_mse)

np.float64(7.6492624397080675)

In [6]:
# Encode df2 with the vectorizer fitted earlier: transform only, no refit.
vald_dicts = df2.loc[:, categorical].to_dict(orient='records')
X_vald = dv.transform(vald_dicts)

# Feature count of the fitted vectorizer.
len(dv.feature_names_)

515

In [7]:
# RMSE of the fitted model on df2, which was not used for fitting.
y_vald = df2[target].to_numpy()
y_pred = lr.predict(X_vald)
vald_mse = mean_squared_error(y_vald, y_pred)
np.sqrt(vald_mse)

np.float64(7.81181211389241)