# Duration Prediction Notebook - Training and Inference

This notebook re-runs the training code to log a new model to MLflow using a Scikit-Learn Pipeline. Pipelines let you combine preprocessing steps and model fitting into a single object, which results in cleaner and more readable code. \
Two new functions, `read_dataframe` and `prepare_dictionaries`, are added.

##### Packages

In [None]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

##### MLflow Setup

In [None]:
import mlflow
import os

# Placeholders: both values must be filled in before this cell is run.
TRACKING_SERVER_HOST = ""  # fill in with the public DNS of the EC2 instance
os.environ["AWS_PROFILE"] = ""  # fill in with your AWS profile

# The tracking server is addressed over plain HTTP on port 5000.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:5000"
mlflow.set_tracking_uri(tracking_uri)

# Runs started after this call are recorded under this experiment.
mlflow.set_experiment("experiment-name")  # add name for experiment

##### Pre-processing functions

In [None]:
def read_dataframe(filename) -> pd.DataFrame:
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips lasting between 1 and 60 minutes inclusive, and casts the
    pick-up / drop-off location IDs to strings so they can be used as
    categorical features.

    Args:
        filename: Path or URL passed straight to ``pd.read_parquet``.

    Returns:
        The filtered DataFrame with the extra ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning columns on a
    # slice of the original frame triggers SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

def prepare_dictionaries(df: pd.DataFrame):
    """Turn a trips DataFrame into a list of per-row feature dictionaries.

    A combined pick-up/drop-off key ``PU_DO`` is written onto ``df`` itself
    (the caller's frame gains that column), and each returned dictionary
    holds that key together with ``trip_distance``.
    """
    pickup, dropoff = df['PULocationID'], df['DOLocationID']
    df['PU_DO'] = pickup + '_' + dropoff

    # Categorical feature first, then the numerical one.
    feature_columns = ['PU_DO', 'trip_distance']
    return df[feature_columns].to_dict(orient='records')

##### Load Train and Test Data

In [None]:
# January 2023 trips are used for training, February 2023 for validation.
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet'
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet'
df_train, df_val = read_dataframe(train_url), read_dataframe(val_url)

# Per-row feature dictionaries, later fed to the DictVectorizer pipeline step.
dict_train, dict_val = prepare_dictionaries(df_train), prepare_dictionaries(df_val)

# Regression target: the trip duration column computed by read_dataframe.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

##### XGBoost Model Training

In [None]:
import xgboost as xgb

In [None]:
# Train the DictVectorizer + XGBoost pipeline inside a single MLflow run,
# logging the hyperparameters, the validation RMSE and the fitted pipeline.
with mlflow.start_run():

    # Fixed hyperparameters (presumably the best values from an earlier
    # tuning run; confirm before changing them).
    best_params = dict(
        learning_rate=0.10139338184768387,
        max_depth=9,
        min_child_weight=2.1862253417827513,
        # 'reg:linear' is the deprecated alias of squared-error regression;
        # 'reg:squarederror' is the current name for the same objective.
        objective='reg:squarederror',
        reg_alpha=0.09153522324337644,
        reg_lambda=0.024435485947183297,
        seed=42,
    )

    mlflow.log_params(best_params)

    # Vectorisation and the regressor live in one object, so the logged
    # model accepts the raw feature dictionaries directly.
    pipeline = make_pipeline(
        DictVectorizer(),
        xgb.XGBRegressor(**best_params),
    )

    pipeline.fit(dict_train, y_train)
    y_pred = pipeline.predict(dict_val)

    # mean_squared_error(..., squared=False) was deprecated in scikit-learn
    # 1.4 and removed in 1.6; taking the root explicitly works on every
    # version.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    print(best_params, rmse)
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(pipeline, artifact_path="model")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")
