In [63]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import make_pipeline

import pickle

In [64]:
# Disable pandas' chained-assignment check for the whole session
# (the option's default value is 'warn').
pd.set_option("mode.chained_assignment", None)

In [65]:
def import_data(url):
  """Read a parquet dataset into a DataFrame.

  Args:
    url: Location of the parquet file; passed unchanged to ``pd.read_parquet``.

  Returns:
    Whatever ``pd.read_parquet`` yields for ``url``.
  """
  frame = pd.read_parquet(url)
  return frame

In [66]:
def process_data(df):
  """Attach a trip ``duration`` column (in minutes) and keep 1-60 minute trips.

  The input frame is left untouched: the duration is added to a new frame, and
  the filtered result is returned as an independent copy so that callers can
  assign further columns to it without pandas copy/view warnings.

  Args:
    df: DataFrame with datetime columns ``lpep_pickup_datetime`` and
      ``lpep_dropoff_datetime``.

  Returns:
    A new DataFrame with an added float ``duration`` column, restricted to rows
    where ``1 <= duration <= 60``. Rows whose duration is missing are dropped.
  """
  elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime

  # Vectorised conversion; avoids a Python-level lambda call per row and also
  # works on an empty frame.
  with_duration = df.assign(duration=elapsed.dt.total_seconds() / 60)

  # ``between`` is inclusive on both ends, i.e. duration >= 1 and duration <= 60.
  keep = with_duration.duration.between(1, 60)

  return with_duration.loc[keep, :].copy()

In [67]:
def prepare_features(df, dv=None):
  """Build the per-trip feature dictionaries fed to the vectorizer.

  Each record holds ``PU_DO`` (the pickup and dropoff location IDs, cast to
  strings and joined with ``_``) and ``trip_distance``. The features are built
  on a separate frame, so ``df`` itself is not modified.

  Args:
    df: DataFrame with ``PULocationID``, ``DOLocationID`` and ``trip_distance``
      columns.
    dv: Unused; accepted only so existing calls that pass it keep working.

  Returns:
    A list with one ``{"PU_DO": str, "trip_distance": value}`` dict per row,
    in row order.
  """
  input_variables = ['PU_DO', 'trip_distance']

  pickup = df['PULocationID'].astype(str)
  dropoff = df['DOLocationID'].astype(str)

  # ``assign`` returns a new frame, so the caller's DataFrame keeps its
  # original columns and dtypes.
  features = df[['trip_distance']].assign(PU_DO=pickup + '_' + dropoff)

  return features[input_variables].to_dict(orient="records")

In [68]:
def transform_features(data_dicts, dv=None):
  """Vectorize feature dictionaries, fitting a new vectorizer only if none is given.

  Args:
    data_dicts: Feature records to vectorize.
    dv: An already-fitted vectorizer exposing ``transform``. When ``None``, a
      fresh ``DictVectorizer`` is created and fitted on ``data_dicts``.

  Returns:
    A ``(X, dv)`` tuple: the transformed features and the vectorizer that
    produced them (the one passed in, or the newly fitted one).
  """
  # Compare against None explicitly: a supplied vectorizer must be reused even
  # if the object happens to evaluate as falsy (e.g. it defines __len__).
  if dv is None:
    dv = DictVectorizer()
    return dv.fit_transform(data_dicts), dv

  return dv.transform(data_dicts), dv

In [69]:
def compute_error(X, y, model):
  """Return the root mean squared error of ``model`` on ``(X, y)``.

  Args:
    X: Inputs accepted by ``model.predict``.
    y: Ground-truth targets corresponding to ``X``.
    model: Fitted estimator exposing ``predict``.

  Returns:
    The value of ``root_mean_squared_error`` between ``y`` and the predictions.
  """
  predictions = model.predict(X)
  # scikit-learn metrics are declared as (y_true, y_pred); pass them in that
  # order so the call matches the documented contract.
  return root_mean_squared_error(y, predictions)

In [70]:
# Green trip data: the 2023-01 file is used for training, 2023-02 for validation.
url_train = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet"
url_val = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet"

# Download both months first, then clean them.
df_train, df_val = import_data(url_train), import_data(url_val)
df_train, df_val = process_data(df_train), process_data(df_val)

# Training split: feature dicts and target.
dicts_train = prepare_features(df_train)
y_train = df_train.duration

# Validation split: feature dicts and target.
dicts_val = prepare_features(df_val)
y_val = df_val.duration

In [71]:
# Chain the dict vectorizer and the linear model into a single estimator, so
# the raw feature dicts can be passed directly to fit/predict.
pipeline = make_pipeline(DictVectorizer(), LinearRegression())
pipeline.fit(dicts_train, y_train)

# Report RMSE on the validation split.
rmse_val = compute_error(dicts_val, y_val, pipeline)
print(f"Validation error: {rmse_val}")


Validation error: 6.03727552054262


In [72]:
# Serialize the fitted pipeline (vectorizer + model) to disk with pickle.
with open("web-service/lin_reg.bin", mode="wb") as f_out:
  pickle.dump(pipeline, file=f_out)