In [10]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

In [11]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

client.search_experiments()

[<Experiment: artifact_location='file:///d:/KMUTT/3rd year/MLops_Class/cpe393-mlflow/mlruns/1', creation_time=1742481363260, experiment_id='1', last_update_time=1742481363260, lifecycle_stage='active', name='my-new-experiment', tags={}>,
 <Experiment: artifact_location='file:///d:/KMUTT/3rd year/MLops_Class/cpe393-mlflow/mlruns/0', creation_time=1742481317599, experiment_id='0', last_update_time=1742481317599, lifecycle_stage='active', name='Default', tags={}>]

In [12]:
client.create_experiment(name="my-new-experiment")

MlflowException: Experiment(name=my-new-experiment) already exists. Error: (raised as a result of Query-invoked autoflush; consider using a session.no_autoflush block if this flush is occurring prematurely)
(sqlite3.IntegrityError) UNIQUE constraint failed: experiments.name
[SQL: INSERT INTO experiments (name, artifact_location, lifecycle_stage, creation_time, last_update_time) VALUES (?, ?, ?, ?, ?)]
[parameters: ('my-new-experiment', None, 'active', 1742481533675, 1742481533675)]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

mlflow.set_experiment("my-new-experiment")

with mlflow.start_run():

    X, y = load_iris(return_X_y=True)

    params = {"C": 0.1, "random_state": 42}
    mlflow.log_params(params)

    lr = LogisticRegression(**params).fit(X, y)
    y_pred = lr.predict(X)
    mlflow.log_metric("accuracy", accuracy_score(y, y_pred))

    mlflow.sklearn.log_model(lr, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

In [13]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 100",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [14]:
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

In [15]:
import mlflow

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [17]:
run_id = "1"  #insert your run id
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...


MlflowException: Run with id=1 not found

In [None]:
#check model version and stages
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

In [None]:
model_version = 4
new_stage = "Staging"
# Use client.transition_model_version_stage() function to model to staging.

Comparing versions and selecting the new "Production" model
In the last section, we will retrieve models registered in the model registry and compare their performance on an unseen test set. The idea is to simulate the scenario in which a deployment engineer has to interact with the model registry to decide whether to update the model version that is in production or not.

These are the steps:

Load the test dataset, which corresponds to the NYC Green Taxi data from the month of March 2021.
Download the DictVectorizer that was fitted using the training data and saved to MLflow as an artifact, and load it with pickle.
Preprocess the test set using the DictVectorizer so we can properly feed the regressors.
Make predictions on the test set using the model versions that are currently in the "Staging" and "Production" stages, and compare their performance.
Based on the results, update the "Production" model version accordingly.


# test model

In [19]:
import pandas as pd

In [20]:
test_data_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2021-03.csv.gz"
df_test = pd.read_csv(test_data_url)

  df_test = pd.read_csv(test_data_url)


In [21]:
df_test

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2.0,2021-03-01 00:05:42,2021-03-01 00:14:03,N,1.0,83,129,1.0,1.56,7.50,0.50,0.5,0.00,0.00,,0.3,8.80,1.0,1.0,0.0
1,2.0,2021-03-01 00:21:03,2021-03-01 00:26:17,N,1.0,243,235,1.0,0.96,6.00,0.50,0.5,0.00,0.00,,0.3,7.30,2.0,1.0,0.0
2,2.0,2021-03-01 00:02:06,2021-03-01 00:22:26,N,1.0,75,242,1.0,9.93,28.00,0.50,0.5,2.00,0.00,,0.3,31.30,1.0,1.0,0.0
3,2.0,2021-03-01 00:24:03,2021-03-01 00:31:43,N,1.0,242,208,1.0,2.57,9.50,0.50,0.5,0.00,0.00,,0.3,10.80,2.0,1.0,0.0
4,1.0,2021-03-01 00:11:10,2021-03-01 00:14:46,N,1.0,41,151,1.0,0.80,5.00,0.50,0.5,1.85,0.00,,0.3,8.15,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83822,,2021-03-18 13:00:00,2021-03-18 13:41:00,,,247,77,,18.32,38.20,5.50,0.0,0.00,6.12,,0.3,50.12,,,
83823,,2021-03-18 13:02:00,2021-03-18 13:33:00,,,85,37,,4.23,25.95,2.75,0.0,0.00,0.00,,0.3,29.00,,,
83824,,2021-03-18 13:07:00,2021-03-18 13:34:00,,,209,42,,10.03,41.11,2.75,0.0,0.00,0.00,,0.3,44.16,,,
83825,,2021-03-18 13:59:00,2021-03-18 14:06:00,,,42,74,,0.84,11.95,2.75,0.0,0.00,0.00,,0.3,15.00,,,
