In [1]:
import os
import mlflow

from mlflow.tracking import MlflowClient
from pprint import pprint

#### Question 1: What's the (MLflow) version that you have?

In [2]:
mlflow.__version__

'2.13.0'

In [3]:
!mkdir -p data/raw/

# List of URLs to download
urls = [
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet"
]

# Download each file to the specified directory if it doesn't already exist
for url in urls:
    filename = os.path.join("data/raw/", os.path.basename(url))
    if not os.path.exists(filename):
        !wget -P data/raw/ {url}
    else:
        print(f"{filename} already exists.")

data/raw/green_tripdata_2023-01.parquet already exists.
data/raw/green_tripdata_2023-02.parquet already exists.
data/raw/green_tripdata_2023-03.parquet already exists.


In [4]:
!python homework_scripts/original/preprocess_data.py --raw_data_path ./data/raw --dest_path ./data/preprocessed

#### Question 2: How many files were saved to OUTPUT_FOLDER?

In [5]:
files_and_dirs = os.listdir('./data/preprocessed')
    
files = [f for f in files_and_dirs if os.path.isfile(os.path.join('./data/preprocessed', f))]
len(files)

4

In [6]:
!python homework_scripts/edited/train.py --data_path ./data/preprocessed

2024/05/23 23:06:54 INFO mlflow.tracking.fluent: Experiment with name 'Homework 2' does not exist. Creating a new experiment.


In [7]:
client = MlflowClient()

In [8]:
experiment = client.get_experiment_by_name(name="Homework 2")

run = client.search_runs(experiment_ids=[experiment.experiment_id])[0]
#pprint(run.to_dictionary(), indent=2)

#### Question 3: What is the value of the min_samples_split parameter?

In [9]:
run.data.params['min_samples_split']

'2'

#### Question 4: In addition to backend-store-uri, what else do you need to pass to properly configure the server?

1. default-artifact-root
1. serve-artifacts
1. artifacts-only
1. artifacts-destination

**Answer**: to properly configure and launch the MLflow tracking server with a SQLite backend and a specified folder for the artifact store, you need to pass the following parameters in addition to `backend-store-uri`:

    mlflow server \
        --backend-store-uri sqlite:///mlflow.db \
        --default-artifact-root ./artifacts \
        --host 0.0.0.0

- **`--backend-store-uri`** sqlite:///mlflow.db specifies the SQLite database for the backend store.
- **`--default-artifact-root`** ./artifacts specifies the folder called artifacts for the artifact store.
- **`--host 0.0.0.0`** allows access from any IP address.

If you do not set these up explicitly, MLflow uses default behaviors. You can inspect the default URIs using the following methods:

- Run `mlflow.get_artifact_uri()` to get the default artifact URI.

In [10]:
mlflow.get_tracking_uri()

'file:///home/ubuntu/datatalks-mlops-zoomcamp/02-exp-tracking/mlruns'

- Run `mlflow.get_tracking_uri()` to get the default tracking URI:

In [11]:
mlflow.get_artifact_uri()

'file:///home/ubuntu/datatalks-mlops-zoomcamp/02-exp-tracking/mlruns/0/1e1d3a35043440e3b9fe78d2687c4562/artifacts'

- Additionally, you can inspect the artifact URI from a `mlflow.entities.run.Run` object with `run.info.artifact_uri`:

In [12]:
run.info.artifact_uri

'file:///home/ubuntu/datatalks-mlops-zoomcamp/02-exp-tracking/mlruns/393138072735723419/22026f469fa84e87b2dd8472159441a4/artifacts'

As you can see, if you don't set a `backend-store-uri`, MLflow uses the `./mlruns` directory by default.

In [13]:
!python homework_scripts/edited/hpo.py --data_path ./data/preprocessed

2024/05/23 23:07:18 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.
100%|██████████| 15/15 [02:03<00:00,  8.22s/trial, best loss: 5.335419588556921]


The hyperopt script is already telling us the answer (5.335). However, let's find it out using mlflow.

In [14]:
experiment = mlflow.get_experiment_by_name('random-forest-hyperopt')
metric_name = 'rmse'

runs = mlflow.search_runs(
    experiment_ids=experiment.experiment_id,
    order_by=[f"metrics.{metric_name} ASC"],
    max_results=15
)

best_run = runs.iloc[0]
print(f"{best_run['experiment_id']} - {best_run['run_id']} - {best_run['tags.mlflow.runName']}")

886471705322384834 - 20c1be0d37584bb3a58ae54a02ca2ceb - persistent-whale-928


#### Question 5: what's the best validation RMSE that you got?

In [15]:
best_run['metrics.rmse']

5.335419588556921

In [16]:
!python homework_scripts/edited/register_model.py --data_path ./data/preprocessed

2024/05/23 23:09:23 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.


In [17]:
for mv in client.search_model_versions("name='random-forest-reg-model'"):
    pprint(dict(mv), indent=2)

{ 'aliases': [],
  'creation_timestamp': 1716505820381,
  'current_stage': 'None',
  'description': None,
  'last_updated_timestamp': 1716505820381,
  'name': 'random-forest-reg-model',
  'run_id': '05f10ac6db674545b7082d5508e8942c',
  'run_link': None,
  'source': 'mlruns/680284922997686276/05f10ac6db674545b7082d5508e8942c/artifacts/model',
  'status': 'READY',
  'status_message': None,
  'tags': {},
  'user_id': None,
  'version': 1}


#### Question 6: what is the test RMSE of the best model?

In [18]:
run = client.get_run('4e89e7da25c4497ea0f236945832b37c')
run.data.metrics

{'test_rmse': 5.5941605655803635,
 'val_rmse': 5.3633599989832135,
 'training_mean_absolute_error': 3.323916924052877,
 'training_root_mean_squared_error': 5.107146456952711,
 'training_score': 0.6796805248104354,
 'training_mean_squared_error': 26.08294493276463,
 'training_r2_score': 0.6796805248104354}