In [1]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import root_mean_squared_error

import pickle

In [2]:
# Turn off pandas' chained-assignment check for the whole notebook
# (pandas ships with this option set to 'warn').
pd.set_option("mode.chained_assignment", None)

In [3]:
def import_data(url):
  """Read a parquet dataset into a DataFrame.

  Args:
    url: Location of the parquet file; passed unchanged to ``pd.read_parquet``.

  Returns:
    The DataFrame produced by ``pd.read_parquet``.
  """
  frame = pd.read_parquet(url)
  return frame

In [4]:
def process_data(df):
  """Attach a trip ``duration`` column (in minutes) and keep trips of 1-60 minutes.

  The frame passed in is not modified; a new frame is returned.

  Args:
    df: DataFrame with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
      columns whose difference is a timedelta.

  Returns:
    A new DataFrame containing only the rows whose duration lies in
    [1, 60] minutes (both ends inclusive), with ``duration`` set on it.
    Rows with a missing timestamp produce a NaN duration and are dropped
    by the range filter.
  """
  elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
  # Vectorised accessor instead of calling a Python lambda once per row.
  minutes = elapsed.dt.total_seconds() / 60
  keep = (minutes >= 1) & (minutes <= 60)

  # Filter first, then attach the column to the filtered frame: the caller's
  # frame never gains a `duration` column and the result owns its own data.
  return df.loc[keep, :].assign(duration=minutes[keep])

In [5]:
def prepare_features(df, dv=None):
  """Build the model inputs and the target from a processed trips frame.

  Each row becomes a dict with two entries: ``PU_DO`` (the pickup and
  drop-off location IDs cast to strings and joined with ``_``) and
  ``trip_distance``. The dicts are then passed through a vectorizer.

  The frame passed in is not modified.

  Args:
    df: DataFrame with ``PULocationID``, ``DOLocationID``, ``trip_distance``
      and ``duration`` columns.
    dv: An already-fitted vectorizer exposing ``transform``. When ``None``,
      a new ``DictVectorizer`` is created and fitted on ``df``.

  Returns:
    A tuple ``(X, y, dv)``: the vectorizer output, the ``duration`` values,
    and the vectorizer that was used (the one passed in or the new one).
  """
  categorical_variables = ['PULocationID', 'DOLocationID']

  # Derive the string IDs as standalone Series rather than writing them back
  # into `df`, so the caller's columns keep their original dtype.
  pickup, dropoff = (df[column].astype(str) for column in categorical_variables)

  features = df[['trip_distance']].assign(PU_DO=pickup + '_' + dropoff)
  df_dicts = features[['PU_DO', 'trip_distance']].to_dict(orient="records")

  # Explicit None check: the decision must not depend on the truthiness of
  # whatever vectorizer object the caller passes in.
  if dv is None:
    dv = DictVectorizer()
    X = dv.fit_transform(df_dicts)
  else:
    X = dv.transform(df_dicts)

  y = df.duration.values

  return X, y, dv

In [6]:
def compute_error(X, y, model):
  """Return the root mean squared error of ``model`` on ``(X, y)``.

  Args:
    X: Feature matrix passed to ``model.predict``.
    y: Target values the predictions are compared against.
    model: A fitted estimator exposing ``predict``.

  Returns:
    The value returned by ``root_mean_squared_error`` for ``y`` versus the
    model's predictions on ``X``.
  """
  preds = model.predict(X)
  # scikit-learn documents the order as (y_true, y_pred). RMSE is symmetric,
  # so the value is the same either way; this keeps the call aligned with
  # the documented signature.
  return root_mean_squared_error(y, preds)

In [7]:
# Training file is the 2023-01 yellow trip data, validation file is 2023-02.
url_train = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
url_val = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"

df_train = import_data(url_train)
df_val = import_data(url_val)
print(df_train.shape)

df_train = process_data(df_train)
df_val = process_data(df_val)

# The vectorizer is fitted on the training split only and then reused to
# transform the validation split.
X_train, y_train, dv = prepare_features(df_train)
X_val, y_val, _ = prepare_features(df_val, dv)
print(len(dv.feature_names_))

# Each estimator keeps its own name (`lr` is pickled in a later cell); the
# shared fit / evaluate / report steps run once per model, in this order.
lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()

for model in (lr, lasso, ridge):
  model.fit(X_train, y_train)
  rmse_train = compute_error(X_train, y_train, model)
  rmse_val = compute_error(X_val, y_val, model)
  print(f"Training error: {rmse_train} | Validation error: {rmse_val}")

(3066766, 19)
21802
Training error: 5.144453437201816 | Validation error: 5.256177130486534
Training error: 9.938398578736242 | Validation error: 10.066734566474242
Training error: 5.094827048630349 | Validation error: 5.2207368150952504


In [8]:
# Persist the fitted vectorizer and the linear regression together as a single
# (dv, lr) tuple, so one pickle.load call returns both.
with open("web-service/lin_reg.bin", mode="wb") as f_out:
  pickle.dump((dv, lr), file=f_out)