# Model Training

In [None]:
# Declare the notebook's text widgets together with their default values.
# Later cells read these back with dbutils.widgets.get.
widget_defaults = {
    "environment": "dev",
    "experiment_name": "/Users/brian.law@databricks.com/[dev brian_law] dev-taxi-prediction-model",
}
for widget_name, widget_default in widget_defaults.items():
    dbutils.widgets.text(widget_name, widget_default)

In [None]:
import mlflow
from databricks import feature_store

In [None]:
# Use "databricks-uc" as the MLflow model registry URI for everything that follows.
mlflow.set_registry_uri("databricks-uc")

# The catalog name is derived from the value of the "environment" widget.
curr_env = dbutils.widgets.get("environment")
curr_catalog = "brian_ml_{}".format(curr_env)

# Feature Store client shared by the dataset-building and model-logging cells.
fs = feature_store.FeatureStoreClient()

## Build Training Dataset

In [None]:
# Base table that the feature lookups are joined onto.
raw_data = spark.table(f"{curr_catalog}.warehouse.raw_data")

# Fully qualified names of the two time-series feature tables.
pickup_features_table = f"{curr_catalog}.warehouse.trip_pickup_time_series_features"
dropoff_features_table = f"{curr_catalog}.warehouse.trip_dropoff_time_series_features"

# Pickup-side features: looked up by pickup_zip, with tpep_pickup_datetime as
# the timestamp lookup key.
pickup_lookup = feature_store.FeatureLookup(
    table_name=pickup_features_table,
    feature_names=[
        "mean_fare_window_1h_pickup_zip",
        "count_trips_window_1h_pickup_zip",
    ],
    lookup_key=["pickup_zip"],
    timestamp_lookup_key="tpep_pickup_datetime",
)

# Dropoff-side features: looked up by dropoff_zip, with tpep_dropoff_datetime as
# the timestamp lookup key.
dropoff_lookup = feature_store.FeatureLookup(
    table_name=dropoff_features_table,
    feature_names=[
        "count_trips_window_30m_dropoff_zip",
        "dropoff_is_weekend",
    ],
    lookup_key=["dropoff_zip"],
    timestamp_lookup_key="tpep_dropoff_datetime",
)

# Kept as one-element lists so they can be concatenated when the training set is created.
pickup_feature_lookups = [pickup_lookup]
dropoff_feature_lookups = [dropoff_lookup]



In [None]:
# Pickup and dropoff lookups are applied together in a single training set.
all_feature_lookups = [*pickup_feature_lookups, *dropoff_feature_lookups]

# The two timestamp columns are passed as exclude_columns when building the training set.
exclude_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]

# Build the training set from the raw data plus the looked-up features,
# with fare_amount as the label.
training_set = fs.create_training_set(
    raw_data,
    feature_lookups=all_feature_lookups,
    label="fare_amount",
    exclude_columns=exclude_columns,
)

# Materialise the training set as a DataFrame and show it in the notebook.
training_df = training_set.load_df()
display(training_df)

## Train Model

In [None]:
from sklearn.model_selection import train_test_split
from mlflow.tracking import MlflowClient
import lightgbm as lgb
import mlflow.lightgbm
from mlflow.models.signature import infer_signature

# Select the MLflow experiment named by the "experiment_name" widget.
experiment_name = dbutils.widgets.get("experiment_name")
mlflow.set_experiment(experiment_name)

# Every column of the training DataFrame; the training cell uses this list to
# select columns after converting to pandas.
features_and_label = training_df.columns



In [None]:
# Train a LightGBM regressor on the assembled training set inside a single
# MLflow run, then log and register it through the Feature Store client so the
# model is packaged with its feature lookup information.
label_col = "fare_amount"

with mlflow.start_run():
    data = training_df.toPandas()[features_and_label]

    # Fixed random_state keeps the train/test split the same between runs.
    train, test = train_test_split(data, random_state=123)
    X_train = train.drop([label_col], axis=1)
    X_test = test.drop([label_col], axis=1)
    y_train = train[label_col]
    y_test = test[label_col]

    mlflow.lightgbm.autolog()
    train_lgb_dataset = lgb.Dataset(X_train, label=y_train.values)
    # Build the hold-out dataset against the training dataset (reference=) so
    # LightGBM can use it as validation data for this training set.
    test_lgb_dataset = lgb.Dataset(
        X_test, label=y_test.values, reference=train_lgb_dataset
    )

    param = {"num_leaves": 32, "objective": "regression", "metric": "rmse"}
    num_rounds = 100

    # Train a lightGBM model. The hold-out split is passed as a validation set
    # so the configured "rmse" metric is actually evaluated; previously the
    # test dataset was built but never used. No early stopping is configured,
    # so the validation data is only used for evaluation.
    model = lgb.train(
        param,
        train_lgb_dataset,
        num_rounds,
        valid_sets=[test_lgb_dataset],
        valid_names=["test"],
    )

    # Log the trained model with MLflow and package it with feature lookup information.
    fs.log_model(
        model,
        artifact_path="model_packaged",
        flavor=mlflow.lightgbm,
        training_set=training_set,
        registered_model_name=f"{curr_catalog}.warehouse.taxi_example_fare_time_series_packaged",
    )
