#### NYC Taxi Duration Prediction with MLflow

##### Packages

In [None]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

##### MLflow Setup

In [None]:
import os

import mlflow

# Credentials and endpoint of the remote tracking server; both placeholders
# must be filled in before this cell is run.
os.environ["AWS_PROFILE"] = ""  # fill in with your AWS profile
TRACKING_SERVER_HOST = ""  # fill in with the public DNS of the EC2 instance

# Send every run of this notebook to the remote server, grouped under a
# single experiment.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:5000"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("xgboost-nyctaxi-experiment")

In [None]:
# Sanity check: show the URI MLflow will actually log to.
active_tracking_uri = mlflow.get_tracking_uri()
print(f"tracking URI: '{active_tracking_uri}'")

##### Pre-processing

In [None]:
def read_dataframe(filename):
    """Load a green-taxi trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path or URL of a parquet file containing the columns
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (inclusive), with an
        added ``duration`` column expressed in minutes and the two
        location-ID columns converted to strings.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame rather than to a slice of the
    # original (which raises pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are categories, not quantities: store them as strings.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

##### Load Train and Validation Data

In [None]:
# Green-taxi trip records: one month to fit on, the following month to validate on.
TRAIN_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet'
VAL_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet'

df_train, df_val = (read_dataframe(url) for url in (TRAIN_DATA_URL, VAL_DATA_URL))

In [None]:
# Combine pickup and drop-off location IDs into a single categorical
# feature, applied identically to both splits.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [None]:
# Columns fed to the model and the column it has to predict.
categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'
feature_columns = categorical + numerical

# One dict per trip, the input format DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training split only and reuse that fitted
# mapping for the validation split.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train = df_train[target].values
y_val = df_val[target].values

##### XGBoost Model Training

Try out an XGBoost model and tune its hyperparameters with the hyperopt package.

In [None]:
import xgboost as xgb

# fmin: tries to minimise the objective function
# tpe: algorithm that controls the flow
# hp: library containing methods to define the search space
# STATUS_OK: signal that the optimization was successful at the end of each run
# Trials: will keep track of information from each run
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# scope: define range of integer type
from hyperopt.pyll import scope

In [None]:
# Wrap each split's feature matrix and target in an XGBoost DMatrix.
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

We create an objective function that trains the XGBoost model with a set of hyperparameters (proposed by hyperopt) and then validates it against our validation data.

The body of the function is wrapped in an MLflow run, so each set of hyperparameters and the model's corresponding performance score are recorded in MLflow.

In [None]:
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE to hyperopt.

    The whole evaluation runs inside its own MLflow run, which is tagged
    ``model=xgboost`` and records both the hyperparameters and the
    resulting ``rmse`` metric.

    Parameters
    ----------
    params : dict
        XGBoost training parameters, passed unchanged to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # Take the square root explicitly: the `squared=False` argument was
        # deprecated in scikit-learn 1.4 and removed in 1.6, while this form
        # works on every version.
        rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

Search ranges that hyperopt should explore for the XGBoost hyperparameters.
We use `hp` to define a statistical distribution for each parameter.
http://hyperopt.github.io/hyperopt/getting-started/search_spaces/

In [None]:
# Search space handed to hyperopt: one sampling distribution per tunable
# XGBoost parameter, plus the fixed settings shared by every trial.
search_space = {
    # Integer tree depth between 4 and 100: quniform samples in steps of 1
    # and scope.int casts the sampled float to an int.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # hp.loguniform(label, low, high) returns exp(uniform(low, high)), so the
    # bounds are natural-log exponents: learning_rate lies in
    # [exp(-3), exp(0)] ~= [0.05, 1], and the values are uniform on a log
    # scale, i.e. sampled more densely near the lower bound.
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # L1 regularisation in [exp(-5), exp(-1)] ~= [0.0067, 0.37].
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # L2 regularisation in [exp(-6), exp(-1)] ~= [0.0025, 0.37].
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # min_child_weight in [exp(-1), exp(3)] ~= [0.37, 20].
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # Loss function to minimise: squared error. 'reg:squarederror' is the
    # current name of the deprecated 'reg:linear' alias.
    'objective': 'reg:squarederror',
    # Fixed XGBoost seed so every trial is trained with the same randomness.
    'seed': 42
}

In [None]:
# Let the TPE algorithm propose 20 hyperparameter sets, scoring each one
# with `objective`; the Trials object collects the outcome of every run.
trials = Trials()
best_result = fmin(
    objective,
    search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
)

##### Choose best parameters to train model

In [None]:
# Retrain with the chosen hyperparameters inside a dedicated MLflow run and
# store the resulting booster as a model artifact.
with mlflow.start_run():

    # Rebuilt here so this cell only depends on the vectorised features.
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.10139338184768387,
        'max_depth': 9,
        'min_child_weight': 2.1862253417827513,
        # 'reg:squarederror' is the current name of the deprecated
        # 'reg:linear' alias; the loss being minimised is unchanged.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.09153522324337644,
        'reg_lambda': 0.024435485947183297,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # Take the square root explicitly: the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)
    mlflow.log_metric("rmse", rmse)

    mlflow.xgboost.log_model(booster, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")
