In [4]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import root_mean_squared_error

import mlflow

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [5]:
# Turn off pandas' chained-assignment warning for the whole notebook (pandas' default is 'warn').
pd.set_option("mode.chained_assignment", None)

In [6]:
# Tracking data goes to the SQLite database file `mlflow.db`; to browse it run:
#   mlflow ui --backend-store-uri sqlite:///mlflow.db
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

## Data Processing

In [7]:
def import_data(url):
  """Read the Parquet file at ``url`` with ``pd.read_parquet`` and return the result."""
  frame = pd.read_parquet(url)
  return frame

In [8]:
def process_data(df):
  """Derive trip duration, drop out-of-range trips and build the model columns.

  The input frame is left untouched: the duration is computed as a standalone
  Series and all column assignments happen on an explicit copy of the
  filtered rows.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``
    (datetime-like), ``PULocationID``, ``DOLocationID`` and ``trip_distance``.

  Returns
  -------
  pandas.DataFrame
    Rows whose duration is between 1 and 60 minutes (both inclusive), with
    columns ``PULocationID``, ``DOLocationID``, ``PU_DO``, ``trip_distance``
    and ``duration`` (minutes), in that order. The two location columns are
    converted to strings and ``PU_DO`` is ``"<PULocationID>_<DOLocationID>"``.
  """
  categorical = ['PULocationID', 'DOLocationID']

  # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
  duration = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
  keep = duration.between(1, 60)  # inclusive on both ends, NaN is excluded

  # Explicit copy so the assignments below never write through to the caller's frame.
  out = df.loc[keep, categorical + ['trip_distance']].copy()
  out[categorical] = out[categorical].astype(str)
  out['PU_DO'] = out.PULocationID + '_' + out.DOLocationID
  # Positional assignment: `out` rows are in the same order as `duration[keep]`.
  out['duration'] = duration[keep].to_numpy()

  return out[categorical + ['PU_DO', 'trip_distance', 'duration']]

In [9]:
def transform_data(df, dv=None):
  """Turn a processed trip frame into a feature matrix and a target array.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain the predictor columns ``PU_DO`` and ``trip_distance`` and
    the target column ``duration``.
  dv : optional
    An already-fitted vectorizer exposing ``transform``. When ``None`` a new
    ``DictVectorizer`` is created and fitted on ``df``.

  Returns
  -------
  tuple
    ``(X, y, dv)``: the vectorized predictors, the ``duration`` values, and
    the vectorizer that produced ``X`` (the one passed in, or the new one).
  """
  predictors = ['PU_DO', 'trip_distance']
  target = 'duration'

  records = df[predictors].to_dict(orient='records')

  # Compare against None explicitly: a supplied vectorizer must be reused even
  # if its type happens to evaluate as falsy (e.g. defines __len__/__bool__).
  if dv is not None:
    X = dv.transform(records)
  else:
    dv = DictVectorizer()
    X = dv.fit_transform(records)

  y = df[target].values

  return X, y, dv

In [10]:
def compute_error(X, y, model):
  """Return the root-mean-squared error of ``model`` on ``(X, y)``.

  Parameters
  ----------
  X : feature matrix accepted by ``model.predict``.
  y : ground-truth target values matching the rows of ``X``.
  model : fitted estimator exposing ``predict``.

  Returns
  -------
  The value returned by ``root_mean_squared_error(y, model.predict(X))``.
  """
  preds = model.predict(X)
  # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
  return root_mean_squared_error(y, preds)

In [11]:
# Source files: the 2023-01 file is used for fitting, the 2023-02 file for validation.
train_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
val_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"

# Training split: read, clean, then fit a new vectorizer on it.
train_df = process_data(import_data(train_url))
X_train, y_train, dv = transform_data(train_df)

# Validation split: read, clean, then encode with the vectorizer fitted on the training split.
val_df = process_data(import_data(val_url))
X_val, y_val, _ = transform_data(val_df, dv)

## Manual Logging

In [None]:
# Regularisation strength passed to Lasso and recorded with the run.
alpha = 0.01

# Fit on the training matrix, then score on the validation matrix.
model = Lasso(alpha=alpha)
model.fit(X_train, y_train)
rmse = compute_error(X_val, y_val, model)

# Record the run by hand: who ran it, which data and alpha were used, and the resulting RMSE.
with mlflow.start_run():
  mlflow.set_tag("developer", "Bastien Winant")
  mlflow.log_params(dict(train_data=train_url, val_data=val_url, alpha=alpha))
  mlflow.log_metric("rmse", rmse)