In [50]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import root_mean_squared_error

import pickle

In [51]:
def import_data(url):
  return pd.read_parquet(url)

In [52]:
def process_data(df):
  print(df.shape)
  nrows = df.shape[0]
  df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).apply(lambda x: x.total_seconds() / 60)
  df = df.loc[(df.duration >= 1) & (df.duration <= 60), :]

  categorical = ['PULocationID', 'DOLocationID']
  df[categorical] = df[categorical].astype(str)
  df['PU_DO'] = df.PULocationID + '_' + df.DOLocationID
  
  numerical = ['trip_distance', 'duration']

  return df[categorical + numerical]

In [53]:
def transform_data(df, dv=None):
  predictors = ['PULocationID', 'DOLocationID']
  target = 'duration'

  df_dicts = df[predictors].to_dict(orient='records')

  if dv:
    X = dv.transform(df_dicts)
  else:
    dv = DictVectorizer()
    X = dv.fit_transform(df_dicts)
    print(len(dv.feature_names_))
  
  y = df[target].values

  return X, y, dv

In [54]:
def compute_error(X, y, model):
  preds = model.predict(X)
  error = root_mean_squared_error(preds, y)

  return error

In [58]:
train_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
val_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"

train_df = import_data(train_url)
train_df = process_data(train_df)
X_train, y_train, dv = transform_data(train_df)

val_df = import_data(val_url)
val_df = process_data(val_df)
X_val, y_val, _ = transform_data(val_df, dv)

lr = LinearRegression().fit(X_train, y_train)
train_error = compute_error(X_train, y_train, lr)
val_error = compute_error(X_val, y_val, lr)
print(f'Training Error: {train_error} | Validation Error: {val_error}')

lasso = Lasso().fit(X_train, y_train)
train_error = compute_error(X_train, y_train, lasso)
val_error = compute_error(X_val, y_val, lasso)
print(f'Training Error: {train_error} | Validation Error: {val_error}')

ridge = Ridge().fit(X_train, y_train)
train_error = compute_error(X_train, y_train, ridge)
val_error = compute_error(X_val, y_val, ridge)
print(f'Training Error: {train_error} | Validation Error: {val_error}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PU_DO'] = df.PULocationID + '_' + df.DOLocationID


518


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PU_DO'] = df.PULocationID + '_' + df.DOLocationID


Training Error: 6.845620148915177 | Validation Error: 7.423729162888127
Training Error: 8.865266000371095 | Validation Error: 9.295267574235702
Training Error: 6.845688763979081 | Validation Error: 7.423313051810117


In [61]:
with open('models/lin_reg.bin', 'wb') as f_out:
  pickle.dump((dv, lr), f_out)