# Homework solution - 02 experiment tracking

This notebook contains the answers to the homework for module 02 of the course (Experiment Tracking).

In [8]:
# Built-in imports
import os

# External imports
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

# Own imports
from scripts import get_path_dir as gpd

# Define the path to the data directory.
# The value comes from the project helper `get_desired_folder_path` and is later
# interpolated into the preprocessing shell command as `--raw_data_path`.
# NOTE(review): presumably the helper returns a filesystem path to a folder named "data" — confirm.
DATA_DIR = gpd.get_desired_folder_path("data")

## Q1. Install MLflow

In [2]:
# Ask the MLflow command-line tool for its version and print it.
# The pipe returned by os.popen is used as a context manager so that it is
# closed (and the child process reaped) as soon as its output has been read.
with os.popen('mlflow --version') as mlflow_version:
    print(f"The version of MLflow that i installed is the following: {mlflow_version.read()}")

The version of MLflow that i installed is the following: mlflow, version 1.26.1



## Q2. Download and preprocess the data

In [3]:
# Run the <preprocess_pipeline.py> file in order to preprocess the datasets.
# The raw data is read from DATA_DIR and the results are written to ../artifacts.
preprocess_status = os.system(
    f'python ../scripts/preprocess_pipeline.py --raw_data_path "{DATA_DIR}" --dest_path ../artifacts'
)

# os.system only returns the exit status; fail loudly instead of letting the
# following cells work on missing or stale artifacts.
if preprocess_status != 0:
    raise RuntimeError(f"preprocess_pipeline.py failed with exit status {preprocess_status}")

In [7]:
# Count the entries listed in ../artifacts by piping `ls` into `wc -l`.
# The pipe is opened in a `with` block so it is closed once it has been read.
# NOTE(review): one entry is subtracted from the count — presumably the folder
# holds one entry that is not an output of the preprocessing step; confirm.
with os.popen('ls ../artifacts/ | wc -l') as artifacts_dir_ls:
    print(f"The number of resulted files is: {int(artifacts_dir_ls.read()) - 1}")

The number of resulted files is: 4


## Q3. Train a model with autolog

In [11]:
# Run the <train.py> file in order to train a simple random forest model to predict the time duration of a taxi ride
train_status = os.system('python ../scripts/train.py --data_path ../artifacts')

# A non-zero exit status means the script did not finish successfully; stop here
# so the cells below do not inspect runs that were never logged.
if train_status != 0:
    raise RuntimeError(f"train.py failed with exit status {train_status}")

In [26]:
# Define the MLflow client API (no tracking URI is passed here)
client = MlflowClient()

# Get the run_id of the first run: active runs of experiment '0' are sorted by
# start time in ascending order, so the earliest one comes first.
runs = client.search_runs(
    experiment_ids=['0'],  # passed as a list, matching the documented signature
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=10,
    order_by=["attribute.start_time ASC"]
)

# Give a clear error instead of an IndexError when nothing has been logged yet.
if not runs:
    raise RuntimeError("No active runs were found in experiment '0'; run the training cell first.")

id_first_run = runs[0].info.run_id

In [27]:
# Count the entries in the params folder of the first run by piping `ls` into `wc -l`.
# The path is relative to the current working directory and hard-codes experiment 0.
# The pipe is opened in a `with` block so it is closed once it has been read.
with os.popen(f'ls mlruns/0/{id_first_run}/params/ | wc -l') as params_dir_ls:
    print(f"The number of logged params is: {int(params_dir_ls.read())}")

The number of logged params is: 17


## Q4. Launch the tracking server locally

In addition to `backend-store-uri`, the `default-artifact-root` parameter must also be passed when launching the tracking server.

## Q5. Tune the hyperparameters of the model

In [28]:
# Run the <hpo.py> file in order to make a hyperparameter optimization of the Random Forest model
hpo_status = os.system('python ../scripts/hpo.py --data_path ../artifacts')

# A non-zero exit status means the optimization did not complete; stop here so
# the next cells do not query an experiment that was never populated.
if hpo_status != 0:
    raise RuntimeError(f"hpo.py failed with exit status {hpo_status}")

2022/05/30 18:57:37 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.


100%|██████████| 50/50 [06:23<00:00,  7.67s/trial, best loss: 6.6284257482044735]


In [38]:
# Define important constant variables
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"

# Define the MLflow client API, pointing it at the tracking server
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Look up the hyperparameter-optimization experiment by name.
# get_experiment_by_name returns None when no such experiment exists, so check
# explicitly rather than failing later with an AttributeError.
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
if experiment is None:
    raise RuntimeError(
        f"Experiment '{HPO_EXPERIMENT_NAME}' was not found at {MLFLOW_TRACKING_URI}."
    )

# Get the run with the lowest rmse: sort ascending and keep only the first result
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],  # list, as in the documented signature
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse ASC"]
)
if not runs:
    raise RuntimeError(f"Experiment '{HPO_EXPERIMENT_NAME}' has no active runs.")

lowest_rmse = runs[0].data.metrics['rmse']

In [37]:
# Report the lowest rmse found among the hyperparameter-optimization runs,
# formatted with four decimal places.
rmse_message = f"The lowest value of rmse got is: {lowest_rmse:.4f}"
print(rmse_message)

The lowest value of rmse got is: 6.6284


## Q6. Promote the best model to the model registry

In [40]:
# Run the <register_model.py> file in order to register the best model in the model registry
register_status = os.system('python ../scripts/register_model.py --data_path ../artifacts')

# A non-zero exit status means the script did not finish successfully; stop here
# so the next cell does not report metrics from an incomplete registration step.
if register_status != 0:
    raise RuntimeError(f"register_model.py failed with exit status {register_status}")

Registered model 'nyc-taxi-random-forest' already exists. Creating a new version of this model...
2022/05/30 19:41:20 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: nyc-taxi-random-forest, version 2
Created version '2' of model 'nyc-taxi-random-forest'.


In [41]:
# Define important constant variables.
# A dedicated name is used so the hyperopt experiment constant is not silently
# rebound to a different experiment.
BEST_MODELS_EXPERIMENT_NAME = "random-forest-best-models"

# Look up the experiment; get_experiment_by_name returns None when it does not exist.
experiment = client.get_experiment_by_name(BEST_MODELS_EXPERIMENT_NAME)
if experiment is None:
    raise RuntimeError(f"Experiment '{BEST_MODELS_EXPERIMENT_NAME}' was not found.")

# Get the run with the lowest test_rmse.
# The sort key must be the same metric that is read below; ordering by a
# different metric would make the single returned run an arbitrary one.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],  # list, as in the documented signature
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.test_rmse ASC"]
)
if not runs:
    raise RuntimeError(f"Experiment '{BEST_MODELS_EXPERIMENT_NAME}' has no active runs.")

lowest_test_rmse = runs[0].data.metrics['test_rmse']

In [42]:
# Report the test rmse of the selected best-model run, formatted with four
# decimal places.
test_rmse_message = f"The lowest value of test rmse got is: {lowest_test_rmse:.4f}"
print(test_rmse_message)

The lowest value of test rmse got is: 6.5489
