# Experiment Tracking
## Preprocessing

In [65]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error
import pickle

import mlflow

In [66]:
pd.options.mode.copy_on_write = True

In [67]:
def preprocessing(df):
    """Clean raw green-taxi trip records and derive modelling features.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the datetime columns ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime`` and the columns ``PULocationID`` and
        ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration lies between 1 and 60 minutes
        (both bounds included), with these columns appended:
        ``duration`` (minutes, float), ``trip_month``, ``trip_dom``,
        ``trip_hour`` (categoricals taken from the pickup timestamp) and
        ``PU_DO`` (``"<pickup id>_<dropoff id>"``).
    """
    # compute target variable: trip duration in minutes (vectorised)
    duration = (
        df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    ).dt.total_seconds() / 60

    # keep only trips lasting between one minute and one hour (inclusive)
    keep = duration.between(1, 60)

    # copy the selection so that nothing below writes into the caller's frame
    df = df.loc[keep, :].copy()
    # positional assignment: both sides were selected with the same mask
    df['duration'] = duration[keep].to_numpy()

    # extract trip date components
    pickup = df.lpep_pickup_datetime.dt
    df['trip_month'] = pd.Categorical(pickup.month)
    df['trip_dom'] = pd.Categorical(pickup.day)
    df['trip_hour'] = pd.Categorical(pickup.hour)

    # concatenate pickup and dropoff locations
    df['PU_DO'] = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)

    return df

In [68]:
def one_hot_encoding(df, numerical, categorical, dv=None):
    """Turn the selected columns into a feature matrix via a dict vectorizer.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``numerical`` and
        ``categorical``.
    numerical : list of str
        Columns passed through with their values unchanged.
    categorical : list of str
        Columns cast to ``str`` before vectorizing.
    dv : object, optional
        An already fitted vectorizer exposing ``transform``. When ``None``
        (the default) a new ``DictVectorizer`` is created and fitted on the
        records of ``df``.

    Returns
    -------
    tuple
        ``(X, dv)``: the output of ``dv.transform`` for the records of ``df``
        and the vectorizer that produced it.
    """
    # astype with a mapping returns a new frame, so the caller's categorical
    # columns keep their original dtype
    features = df[categorical + numerical].astype({col: str for col in categorical})
    records = features.to_dict(orient='records')

    # only fit when no vectorizer was supplied, so validation data is encoded
    # with the vectorizer fitted on the training data
    if dv is None:
        dv = DictVectorizer()
        dv.fit(records)

    X = dv.transform(records)

    return X, dv

In [69]:
def train_model(model, X_train, y_train):
    """Fit ``model`` in place on the training data and return it.

    Parameters
    ----------
    model : object
        Any estimator exposing ``fit(X, y)``.
    X_train : array-like or sparse matrix
        Training features, forwarded to ``model.fit`` unchanged.
    y_train : array-like
        Training targets, forwarded to ``model.fit`` unchanged.

    Returns
    -------
    object
        The same ``model`` instance, so the call can be used in an
        assignment; callers that ignore the return value are unaffected.
    """
    model.fit(X_train, y_train)
    return model

In [70]:
# read in the raw data: 2021-01 is used for training, 2021-02 for validation
df_train, df_val = (
    pd.read_parquet(url)
    for url in (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet',
    )
)

In [71]:
# data cleaning: run both splits through the same preprocessing routine
df_train, df_val = map(preprocessing, (df_train, df_val))

In [72]:
# variable selection: columns to stringify vs. columns passed through as-is
categorical, numerical = ['PU_DO'], ['trip_distance']

In [73]:
# categorical feature encoding
X_train, dv = one_hot_encoding(df_train, numerical, categorical)
X_val, _ = one_hot_encoding(df_val, numerical, categorical, dv=dv)

In [74]:
# regression target: the duration column, taken from each split
target = 'duration'
y_train, y_val = df_train[target], df_val[target]

## Manual Logging

In [None]:
# point MLflow at a SQLite tracking store (the URI names the file mlflow.db)
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# select the experiment used by the runs started below
mlflow.set_experiment("nyc-taxi-experiment")

### Lasso

In [None]:
# regularisation strength handed to the Lasso estimator
alpha = 0.01

# fit on the training split
lr = Lasso(alpha=alpha)
train_model(model=lr, X_train=X_train, y_train=y_train)

# score on the validation split
y_preds = lr.predict(X_val)
rmse = root_mean_squared_error(y_true=y_val, y_pred=y_preds)

In [None]:
with mlflow.start_run():
    # who produced this run
    mlflow.set_tag("developer", "Armand Winant")

    # provenance of the two data splits
    mlflow.log_param("training-data", "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet")
    mlflow.log_param("validation-data", "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet")

    # hyper-parameter and validation score of the Lasso model
    mlflow.log_param("alpha", alpha)
    mlflow.log_metric("rmse", rmse)

    # store the fitted estimator with the run; log_model requires the model
    # object, so it must be passed explicitly
    mlflow.sklearn.log_model(lr, artifact_path="model")