In [1]:
import os
import json
import polars as pl
from PIL import Image
from copy import deepcopy
from plotly.subplots import make_subplots

import torch
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader
from torchvision.io import decode_image
from torchvision.transforms import v2

import torchmetrics as tm
from torchinfo import summary
from lightning import Fabric

import mlflow
import tempfile
from datetime import datetime

In [2]:
# Common config/params for easy access and edit
# Optimizer, loss function, etc. will be added later
# If we declare here, the autocomplete will not work
cfg = {
    'csv_path': 'data/raw/data.csv',
    'img_dir': 'data/raw/images',
    'sample_size': 2000,
    'img_size': (128, 128),
    'seed': 1337,
    'lr': 0.001,
    'batch_size': 64,
    'epochs': 1,
    'patience': 5
}

# Tags attached to every MLFlow run started from this notebook
tags = {
    'author': 'Andhika',
    'framework': 'PyTorch',
    'model': 'Simple CNN',
    'extension': 'ipynb'
}

# MLFlow credentials: values already present in the environment win, so real
# secrets never have to be written into the notebook. The literals below are
# only fallbacks for the local development server.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'mlflow')
os.environ.setdefault('MLFLOW_TRACKING_PASSWORD', 'mlflow123456')

experiment_name = 'pawpaw-experiment'
registered_model_name = 'dev.pawpaw-model'

# Set MLFlow to track current experiment
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment(experiment_name)
mlflow.enable_system_metrics_logging()

# Set seed for reproducible experiment
Fabric.seed_everything(cfg['seed'])

2025/07/18 18:13:52 INFO mlflow.tracking.fluent: Experiment with name 'pawpaw-experiment' does not exist. Creating a new experiment.
Seed set to 1337


1337

In [3]:
# Keep every split in one dict keyed by split name ('train', 'val', 'test')
# so that later cells can loop over the keys and look each frame up
df = {}

# Draw a reproducible, shuffled sample from the raw CSV
sampled = pl.read_csv(cfg['csv_path']).sample(
    n = cfg['sample_size'],
    shuffle = True,
    seed = cfg['seed']
)

# The first half of the sample is held out as test data
n_test = int(0.5 * cfg['sample_size'])
df['test'] = sampled.head(n_test)

# The remaining rows are divided 80/20 into train and val
remainder = sampled.tail(len(sampled) - len(df['test']))
n_val = int(0.2 * len(remainder))
df['val'] = remainder.tail(n_val)
df['train'] = remainder.head(len(remainder) - len(df['val']))

# Drop the intermediates, only the dict of splits is needed from here on
del sampled, remainder
df['train']

Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""5a90e9cea19c754ed549f0361a788d…",0,1,1,1,0,0,0,0,0,0,0,0,22
"""fb7893daf0330ec1cebd08623b5b52…",0,1,1,1,0,0,0,0,0,0,0,0,33
"""7c2f4251a8e1b1e626f6fed0eee85f…",0,1,1,1,0,0,0,0,0,0,0,0,71
"""75d1bacdfdc827ff34dcfb809ad366…",0,1,1,1,0,0,0,0,1,0,0,0,40
"""d050e78384bd8b20e7291b3efedf6a…",0,1,1,1,0,0,0,0,0,0,0,0,57
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""f5e9a7b89f5b8e6c3619cebd0734fc…",0,0,1,1,0,0,0,0,0,0,0,0,2
"""19190da7b6c377ba0e766c021ed74c…",0,1,1,1,0,1,0,0,0,0,0,0,31
"""81b0dabf309c7152b20f24191f6b9b…",0,0,0,1,0,0,0,0,0,0,0,0,63
"""956229e9897eb031d2a91d33cb24ee…",0,1,1,1,0,0,1,0,0,0,0,0,28


In [4]:
# Preview the first ten training rows as a 2 x 5 image grid,
# each subplot titled with the row's Pawpularity score
n_rows, n_cols = 2, 5

fig = make_subplots(
    rows = n_rows,
    cols = n_cols,
    subplot_titles = df['train']['Pawpularity'][0:10]
)

for idx in range(n_rows * n_cols):
    # Map the flat index to its zero-based grid position
    grid_row, grid_col = divmod(idx, n_cols)

    img_id = df['train']['Id'][idx]
    img_path = cfg['img_dir'] + f'/{img_id}.jpg'

    with Image.open(img_path) as img_file:
        # Resize while the file is still open, then hand the copy to plotly
        fig.add_image(
            z = img_file.resize(cfg['img_size']),
            row = grid_row + 1,
            col = grid_col + 1
        )

fig.update_layout(title = 'Pawpularity Sample')
fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)

fig.show()

In [5]:
# One Pawpularity histogram per split, stacked vertically for comparison
split_names = ['train', 'val', 'test']
fig = make_subplots(rows = len(split_names), cols = 1)

for row, key in enumerate(split_names, start = 1):
    fig.add_histogram(
        x = df[key]['Pawpularity'],
        name = key,
        row = row,
        col = 1
    )

# Last expression of the cell, so the figure is displayed
fig.update_layout(title = 'Pawpularity Histogram')

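The test split is never used for training or validation. It is only used at the end of this notebook to compare registered model versions, and it will be used the same way in the real pipeline later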

In [6]:
class PawDataset(Dataset):
    """Dataset yielding one image plus its tabular metadata per DataFrame row.

    Each item is a dict with:
      - 'image': the decoded `<img_dir>/<Id>.jpg` file
      - 'features': every column except 'Id' and 'Pawpularity'
      - 'target': 'Pawpularity' divided by 100, or None when the frame has
        no 'Pawpularity' column

    Args:
        df: Frame with an 'Id' column and the tabular feature columns.
        img_dir: Directory that contains the '<Id>.jpg' files.
        img_transform: Optional callable applied to the image only.
        transform: Optional callable applied to the image, the features and
            the target (when there is one), after `img_transform`.
    """

    def __init__(self, df: pl.DataFrame, img_dir: str, img_transform = None, transform = None):
        self.df = df
        self.img_dir = img_dir
        self.img_transform = img_transform
        self.transform = transform

        # Convert the tabular columns to tensors once, instead of converting
        # the whole frame again for every single item that is requested
        # Target (must be 2D even if only 1 column)
        self._targets = None
        if 'Pawpularity' in df.columns:
            self._targets = df.select(pl.col('Pawpularity') / 100).to_torch()

        # Tabular data (the rest of the columns)
        self._features = df.select(pl.exclude('Id', 'Pawpularity')).to_torch()

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        # Image data
        img_col = self.df['Id'][index]
        img_path = os.path.join(self.img_dir, img_col)
        img_path = os.path.abspath(img_path + '.jpg')
        image = decode_image(img_path)

        target = None if self._targets is None else self._targets[index]
        features = self._features[index]

        if self.img_transform is not None:
            image = self.img_transform(image)

        if self.transform is not None:
            image = self.transform(image)
            features = self.transform(features)
            # Compare against None explicitly: the truth value of a tensor is
            # its content, so a target of 0 would otherwise be dropped
            if target is not None:
                target = self.transform(target)

        # Return dict instead of tuple for clarity
        return {
            'image': image,
            'features': features,
            'target': target
        }

In [7]:
loader = {}

# Augmentation for the training split only: every image receives exactly
# one of the two random transformations below
train_augment = v2.RandomChoice([
    v2.RandomAffine(
        # Rotation
        degrees = [-180, 180],
        # Shear
        shear = [-25, 25]
    ),
    v2.ColorJitter(
        contrast = [0.9, 1.1],
        saturation = [0.9, 1.1],
        hue = [-0.1, 0.1]
    )
])

for key in ['train', 'val', 'test']:
    is_train = key == 'train'

    # Every split resizes its images to the common size,
    # val and test images are otherwise left untouched
    img_steps = [ v2.Resize(cfg['img_size']) ]
    if is_train:
        img_steps.append(train_augment)

    # Image, features and target are all cast to the same dtype
    split_dataset = PawDataset(
        df[key],
        img_dir = cfg['img_dir'],
        img_transform = v2.Compose(img_steps),
        transform = v2.Compose([ v2.ToDtype(torch.float32) ])
    )

    # Only the training data is reshuffled
    loader[key] = DataLoader(
        split_dataset,
        batch_size = cfg['batch_size'],
        shuffle = is_train
    )

# Test the shape of the first batch
# `ds` stays bound to that batch and is reused by the following cells
for ds in loader['train']:
    for label, field in [('Image', 'image'), ('Features', 'features'), ('Target', 'target')]:
        print(f'{label} shape:', ds[field].shape)
    break

Image shape: torch.Size([64, 3, 128, 128])
Features shape: torch.Size([64, 12])
Target shape: torch.Size([64, 1])


In [8]:
# Preview the first images of the batch held in `ds` as a 2 x 5 grid
n_rows, n_cols = 2, 5
n_show = min(n_rows * n_cols, len(ds['image']))

# The titles must come from the batch itself: the train loader shuffles, so
# the first rows of df['train'] are not the images stored in `ds`.
# Targets were divided by 100 in the dataset, scale them back for display
titles = [
    str(round(score.item() * 100))
    for score in ds['target'][:n_show].flatten()
]

fig = make_subplots(
    rows = n_rows,
    cols = n_cols,
    subplot_titles = titles
)

for idx in range(n_show):
    grid_row, grid_col = divmod(idx, n_cols)

    # Float type will mess up the image color
    # So we convert it back for this part only
    image = v2.functional.to_pil_image(
        v2.functional.to_dtype(
            ds['image'][idx],
            torch.uint8
        )
    )

    fig.add_image(
        z = image,
        row = grid_row + 1,
        col = grid_col + 1
    )

fig.update_layout(title = 'Pawpularity Sample')
fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)

fig.show()

In [9]:
class PawModel(nn.Module):
    """Two-branch network producing one value in (0, 1) per sample.

    The image branch is a batch norm followed by four 3x3 'same' convolutions
    (16, 32, 64 and 128 channels), each followed by a 2x2 max pool, then a
    flatten and a 128-unit linear layer. The tabular branch is a single
    128-unit linear layer. Both outputs are concatenated and passed through
    batch norm, a 1-unit linear layer and a sigmoid.
    """

    def __init__(self):
        super().__init__()

        # Image branch, built in the order the layers are applied
        img_layers = [ nn.LazyBatchNorm2d() ]
        for out_channels in (16, 32, 64, 128):
            img_layers.append(nn.LazyConv2d(out_channels, 3, padding = 'same'))
            img_layers.append(nn.MaxPool2d(2))
        img_layers.append(nn.Flatten())
        img_layers.append(nn.LazyLinear(128))

        self.img_input = nn.Sequential(*img_layers)

        # Tabular branch
        self.feat_input = nn.LazyLinear(128)

        # Head applied to the concatenated branch outputs
        self.comb_input = nn.Sequential(
            nn.LazyBatchNorm1d(),
            nn.LazyLinear(1),
            nn.Sigmoid()
        )

    def forward(self, image: Tensor, features: Tensor) -> Tensor:
        """Run both branches and return the head output for the joined result."""
        img_out = self.img_input(image)
        feat_out = self.feat_input(features)

        # Join the two branches along the feature dimension
        joined = torch.cat([img_out, feat_out], dim = 1)
        return self.comb_input(joined)

# Instantiate the network and show a per-layer summary for the batch in `ds`
# NOTE(review): presumably this summary pass is also what gives the Lazy*
# layers their concrete shapes before the optimizer is created - confirm
model = PawModel()
summary(model, input_data = (ds['image'], ds['features']))

Layer (type:depth-idx)                   Output Shape              Param #
PawModel                                 [64, 1]                   --
├─Sequential: 1-1                        [64, 128]                 --
│    └─BatchNorm2d: 2-1                  [64, 3, 128, 128]         6
│    └─Conv2d: 2-2                       [64, 16, 128, 128]        448
│    └─MaxPool2d: 2-3                    [64, 16, 64, 64]          --
│    └─Conv2d: 2-4                       [64, 32, 64, 64]          4,640
│    └─MaxPool2d: 2-5                    [64, 32, 32, 32]          --
│    └─Conv2d: 2-6                       [64, 64, 32, 32]          18,496
│    └─MaxPool2d: 2-7                    [64, 64, 16, 16]          --
│    └─Conv2d: 2-8                       [64, 128, 16, 16]         73,856
│    └─MaxPool2d: 2-9                    [64, 128, 8, 8]           --
│    └─Flatten: 2-10                     [64, 8192]                --
│    └─Linear: 2-11                      [64, 128]                 1,048,7

Declare a few custom helper classes to reduce the amount of code in the training loop

In [10]:
# Taken and modified from Keras (keras.callbacks.EarlyStopping)
# The Lightning version is too complicated and incompatible with Fabric
class EarlyStopping:
    """Track a monitored metric and flag when it has stopped improving.

    Args:
        monitor: Key to read from the logs dict at the end of each epoch.
        patience: Number of non-improving epochs after which
            `stop_training` is set.
        mode: 'min' when lower values are better, anything else is treated
            as higher-is-better.
    """

    def __init__(self, monitor = 'val_loss', patience = 0, mode = 'min'):
        super().__init__()
        self.monitor, self.patience, self.mode = monitor, patience, mode
        self.on_train_begin()

    def on_train_begin(self):
        """Forget everything recorded so far."""
        self.wait = 0
        self.best = None
        self.best_epoch = 0
        self.best_logs = None
        self.stop_training = False

    def on_epoch_end(self, epoch, logs: dict):
        """Record this epoch's result, epochs without the metric are ignored."""
        current = logs.get(self.monitor)
        if current is None:
            return

        if not self._is_better(current, self.best):
            # No improvement: count the epoch and stop once patience runs out
            self.wait += 1
            if self.wait >= self.patience:
                self.stop_training = True
            return

        # New best: remember it and restart the patience counter
        self.wait = 0
        self.best = current
        self.best_epoch = epoch
        self.best_logs = deepcopy(logs)

    def _is_better(self, current, best):
        """Return True when `current` strictly improves on `best`."""
        if best is None:
            return True

        return current < best if self.mode == 'min' else current > best


# Wrap PyTorch loss function as Lightning metric
# To automatically get average loss at the end of epoch
class LossMetric(tm.Metric):
    """Accumulate a summed loss and a sample count, report their ratio.

    Args:
        loss_cls: Loss class, instantiated here with `reduction = 'sum'`.
        **kwargs: Passed through to the parent metric constructor.
    """

    def __init__(self, loss_cls: nn.Module, **kwargs):
        super().__init__(**kwargs)
        self.loss_fn = loss_cls(reduction = 'sum')

        # Both states are declared with a 'sum' reduction
        initial_states = (
            ('sum_loss', torch.tensor(0.0)),
            ('total', torch.tensor(0))
        )
        for state_name, initial in initial_states:
            self.add_state(state_name, default = initial, dist_reduce_fx = 'sum')

    # TODO: Should this be synced with Fabric precision setting?
    def update(self, preds: Tensor, target: Tensor) -> None:
        """Add one batch to the running totals, shapes must match exactly."""
        if preds.shape != target.shape:
            raise ValueError('Predictions and target must have the same shape')

        with torch.no_grad():
            batch_loss = self.loss_fn(preds, target)
            batch_size = target.shape[0]

            self.sum_loss += batch_loss
            self.total += batch_size

    def compute(self) -> Tensor:
        """Return the accumulated loss divided by the accumulated count."""
        return self.sum_loss / self.total


# Quickly save/load text file
class QSave:
    @staticmethod
    def save(obj: dict | str, path: str) -> None:
        with open(path, 'w') as f:
            if type(obj) is dict:
                json.dump(obj, f, indent = 2)
            else:
                f.write(obj)

    @staticmethod
    def load(path: str) -> dict | str:
        with open(path, 'w') as f:
            if path.rsplit('.')[-1] == 'json':
                return json.load(f)
            else:
                return f.read()

In [11]:
# Loss that is backpropagated, and the optimizer that applies the updates
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = cfg['lr'])

# Metrics reported at the end of every epoch, for training and validation
metrics = {
    'bce': LossMetric(torch.nn.BCELoss),
    'rmse': tm.MeanSquaredError(squared = False)
}

# Callbacks used by the training loop
cb = {
    'early_stop': EarlyStopping(
        monitor = 'val_bce',
        patience = cfg['patience'],
        mode = 'min'
    )
}

# ----------

# Save the optimizer and criterion name to log to MLFlow later
# Fabric may change the class name later, so we save them early
cfg.update({
    'optimizer': type(optimizer).__name__,
    'criterion': type(criterion).__name__,
    'monitor': 'val_bce',
    'monitor_min': True
})

print('MLFlow parameters:', cfg)

# ----------

# Initiate Fabric with the GPU accelerator
# Without this, we have to call "to_device" everywhere
fabric = Fabric(accelerator = 'gpu')

# Set all tensors on these objects to use GPU by wrapping them as Fabric classes
# Once wrapped, the class name and some of its properties will change too
# You can access the original class by adding ".module" or ".optimizer"
model, optimizer = fabric.setup(model, optimizer)

for key in ('train', 'val', 'test'):
    loader[key] = fabric.setup_dataloaders(loader[key])

for key, raw_metric in list(metrics.items()):
    metrics[key] = fabric.setup_module(raw_metric)

MLFlow parameters: {'csv_path': 'data/raw/data.csv', 'img_dir': 'data/raw/images', 'sample_size': 2000, 'img_size': (128, 128), 'seed': 1337, 'lr': 0.001, 'batch_size': 64, 'epochs': 1, 'patience': 5, 'optimizer': 'Adam', 'criterion': 'BCELoss', 'monitor': 'val_bce', 'monitor_min': True}


For examples of how to use Fabric in a distributed setup, refer to [this link](https://lightning.ai/docs/fabric/2.5.1/examples/)

See also Fabric [methods](https://lightning.ai/docs/fabric/2.5.1/api/fabric_methods.html), [model hook](https://lightning.ai/docs/fabric/2.5.1/guide/lightning_module.html), [callbacks](https://lightning.ai/docs/fabric/2.5.1/guide/callbacks.html), [launch function](https://lightning.ai/docs/fabric/2.5.1/fundamentals/notebooks.html), and [model saving](https://lightning.ai/docs/fabric/2.5.1/guide/checkpoint/checkpoint.html). Those may be different from my current implementation

In [12]:
# Train for at most cfg['epochs'] epochs inside one MLFlow run.
# Every epoch: train, validate, log the metrics, and whenever the monitored
# metric improves, log the model together with its optimizer state, its
# text representation and the metric history so far
with mlflow.start_run() as run:
    # Logs for current and all epochs
    logs = {}
    history = {}

    # Log things that won't change
    mlflow.set_tags(tags)
    mlflow.log_params(cfg)

    # Reset early stop state
    cb['early_stop'].on_train_begin()

    # ----------

    for epoch in range(1, cfg['epochs'] + 1):
        # ----------
        # Training epoch start

        model.train()
        logs['epoch'] = epoch

        for ds in loader['train']:
            preds = model(ds['image'], ds['features'])
            loss = criterion(preds, ds['target'])

            # Backward pass
            optimizer.zero_grad()
            fabric.backward(loss)
            # Update parameters (weights)
            optimizer.step()

            for name in metrics:
                metrics[name](preds, ds['target'])

        # ----------
        # Training epoch end

        for name in metrics:
            logs[name] = metrics[name].compute().item()
            metrics[name].reset()

        # ----------
        # Validation epoch start

        model.eval()

        with torch.no_grad():
            for ds in loader['val']:
                preds = model(ds['image'], ds['features'])

                for name in metrics:
                    metrics[name](preds, ds['target'])

        # ----------
        # Validation epoch end

        for name in metrics:
            logs['val_' + name] = metrics[name].compute().item()
            metrics[name].reset()

        # ----------
        # Misc at the end of each epoch

        cb['early_stop'].on_epoch_end(epoch, logs)
        print(f'End of epoch {epoch}: {logs}')

        # Append current epoch logs to history
        for name, value in logs.items():
            history.setdefault(name, []).append(value)

        # Export current model and history
        if cb['early_stop'].best_epoch == epoch:
            print('Saving best model so far...')

            # Unwrap Fabric class as normal PyTorch class
            _model = model.module
            _optimizer = optimizer.optimizer

            # The signature is inferred from the last validation batch
            model_info = mlflow.pytorch.log_model(
                _model,
                name = _model.__class__.__name__,
                step = epoch,
                conda_env = 'conda.yaml',
                signature = mlflow.models.infer_signature(
                    model_input = {
                        'image': ds['image'].numpy(force = True),
                        'features': ds['features'].numpy(force = True)
                    },
                    model_output = preds.numpy(force = True)
                )
            )

            # Local folder for temporarily storing other file artifacts
            with tempfile.TemporaryDirectory() as tmp_dir:
                torch.save(_optimizer.state_dict(), tmp_dir + '/optimizer.pth')
                QSave.save(str(_model), tmp_dir + '/model.txt')
                QSave.save(history, tmp_dir + '/history.json')

                # Copy file artifacts to MLFlow remote artifact folder
                mlflow.log_artifacts(tmp_dir)

            # If this is the best epoch, attach metrics with the model
            # This is so we can filter and compare with other models later
            mlflow.log_metrics(logs, epoch, model_id = model_info.model_id)
        else:
            # If not the best epoch, don't attach metrics to a model
            mlflow.log_metrics(logs, epoch)

        fabric.barrier()

        # Stop training if signaled by early stop
        if cb['early_stop'].stop_training:
            print('Early stopping...')
            break

    # ----------

    # Append the best metrics at the end of training
    # MLFlow only shows the latest epoch metrics by default, so this has to
    # happen whenever the last epoch that ran was not the best one, no matter
    # whether training was stopped early or simply reached cfg['epochs'].
    # `best_logs` is None when no epoch ran, so `epoch` is only read when set
    best_logs = cb['early_stop'].best_logs
    if best_logs is not None and cb['early_stop'].best_epoch != epoch:
        mlflow.log_metrics(best_logs, epoch + 1)

2025/07/18 18:13:58 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


End of epoch 1: {'epoch': 1, 'bce': 0.8488614559173584, 'rmse': 0.31418377161026, 'val_bce': 0.7829670906066895, 'val_rmse': 0.29449889063835144}
Saving best model so far...
🏃 View run thoughtful-gull-800 at: http://localhost:5000/#/experiments/1/runs/1a80b9c1ab534d468d85cc36c8a2250a
🧪 View experiment at: http://localhost:5000/#/experiments/1


2025/07/18 18:14:35 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/07/18 18:14:36 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


The Pawpularity prediction misses by roughly 30 points on average (validation RMSE of about 0.29 on the 0-1 scale), but that is fine: I'm going to practice more on the MLOps part rather than the ML itself, so we can stop here

We can now proceed to delete other models and register only the best model from the last run

In [13]:
# Client for the tracking server, plus the id of the experiment in use
client = mlflow.MlflowClient()
experiment = client.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id

In [14]:
# Sort models from the last run by the metric score
run_filter = f"source_run_id = '{run.info.run_id}'"
metric_order = {
    'field_name': 'metrics.' + cfg['monitor'],
    'ascending': bool(cfg['monitor_min'])
}

logged_models = client.search_logged_models(
    experiment_ids = [ experiment_id ],
    filter_string = run_filter,
    order_by = [ metric_order ]
)

# Delete every models except the best model
# The first entry is the best one, everything after it is removed
for stale_model in logged_models[1:]:
    client.delete_logged_model(stale_model.model_id)
    print('Deleted model id', stale_model.model_id)

logged_models

[LoggedModel(artifact_location='mlflow-artifacts:/1/models/m-abee0a05537e48c984c97884ea2e7e09/artifacts', creation_timestamp=1752837258770, experiment_id='1', last_updated_timestamp=1752837271984, model_id='m-abee0a05537e48c984c97884ea2e7e09', model_type='', model_uri='models:/m-abee0a05537e48c984c97884ea2e7e09', name='PawModel', source_run_id='1a80b9c1ab534d468d85cc36c8a2250a', status=<LoggedModelStatus.READY: 'READY'>, status_message='')]

In [15]:
# Register the best model from the current run
# You can also assign model tags here if needed
best_logged_model = logged_models[0]
mlflow.register_model(best_logged_model.model_uri, registered_model_name)

# The metrics from the best model
cb['early_stop'].best_logs

Successfully registered model 'dev.pawpaw-model'.
2025/07/18 18:14:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dev.pawpaw-model, version 1
Created version '1' of model 'dev.pawpaw-model'.


{'epoch': 1,
 'bce': 0.8488614559173584,
 'rmse': 0.31418377161026,
 'val_bce': 0.7829670906066895,
 'val_rmse': 0.29449889063835144}

Now rank the best model across runs, to be tested with test data later

In [16]:
# TODO: Compare a few models only instead of all models
# For example, from the current best to the latest model
# This may need to rely on model tags and aliases
model_versions = client.search_model_versions(
    filter_string = f"name = '{registered_model_name}'",
    max_results = 50
)


def _describe_version(version) -> dict:
    """Collect the values used to rank one registered model version."""
    # TODO: Check behavior on nested runs
    source_run = client.get_run(version.run_id)

    # Before MLFlow v3, model is just a special kind of file artifact
    # The artifact path depends on where the user want to save it
    # Since I can't read their minds, let's just get all artifacts
    artifact_size = sum(
        artifact.file_size or 0
        for artifact in client.list_artifacts(version.run_id)
    )

    return {
        'run_id': version.run_id,
        # Values taken from the run where the model was trained
        'duration': source_run.info.end_time - source_run.info.start_time,
        'last_metric': source_run.data.metrics.get(cfg['monitor'], None),
        'artifact_size': artifact_size,
        'model_name': version.name,
        'model_creation_time': version.creation_timestamp,
        'model_uri': version.source,
        'model_version': version.version,
        'model_tags': version.tags,
        'model_aliases': version.aliases
    }


# Best metric first, ties broken by smaller artifacts and then by older models
reg_models = pl.DataFrame(
    [ _describe_version(version) for version in model_versions ]
).sort(
    ['last_metric', 'artifact_size', 'model_creation_time'],
    descending = [not cfg['monitor_min'], False, False],
    nulls_last = True
)

reg_models

run_id,duration,last_metric,artifact_size,model_name,model_creation_time,model_uri,model_version,model_tags,model_aliases
str,i64,f64,i64,str,i64,str,str,struct[0],list[null]
"""1a80b9c1ab534d468d85cc36c8a225…",37698,0.782967,9205833,"""dev.pawpaw-model""",1752837277992,"""models:/m-abee0a05537e48c984c9…","""1""",{},[]


If we already have the `best` model version registered previously, we don't need to test models older than that

This will reduce the number of models to test, but may not be the right thing to do in some cases

In [17]:
# Look up the version that currently holds the `best` alias, if there is one
# Start from "no best version yet" so that `current_best` is always defined
# and never keeps a value left over from an earlier execution of this cell
current_best = None

try:
    current_best = client.get_model_version_by_alias(
        registered_model_name,
        alias = 'best'
    )
except mlflow.exceptions.RestException as e:
    # INVALID_PARAMETER_VALUE is treated as "alias not assigned yet"
    # Any other error is a real failure and must not be hidden
    if e.error_code != 'INVALID_PARAMETER_VALUE':
        raise

if current_best is not None:
    # Don't use models older than the current best model
    reg_models = reg_models.filter(
        pl.col('model_creation_time') >= current_best.creation_timestamp
    )

reg_models

run_id,duration,last_metric,artifact_size,model_name,model_creation_time,model_uri,model_version,model_tags,model_aliases
str,i64,f64,i64,str,i64,str,str,struct[0],list[null]
"""1a80b9c1ab534d468d85cc36c8a225…",37698,0.782967,9205833,"""dev.pawpaw-model""",1752837277992,"""models:/m-abee0a05537e48c984c9…","""1""",{},[]


Test the models one by one, and mark the best model version with the `best` alias

In [18]:
# Score every candidate model version on the test split
test_results = {
    'version': [],
    'metric': []
}

# TODO: Parallelization for faster model testing
# This may backfire if we don't have enough resources
for i in reg_models.iter_rows(named = True):
    print(f"Testing {i['model_uri']} (v{i['model_version']})...")

    try:
        # TODO: Use pyfunc to support all MLFlow model flavours
        # The model signature needs to be changed to Pandas/Numpy
        # Kept under its own name so the trained `model` is not overwritten
        test_model = mlflow.pytorch.load_model(i['model_uri'])
    except ModuleNotFoundError as e:
        print(f'Skipping test, missing dependency: {e.msg}')
        continue

    # Synchronize device to use (e.g. GPU)
    # A fresh metric per model, so nothing carries over between versions
    test_model = fabric.setup_module(test_model)
    metric = fabric.setup_module(tm.MeanSquaredError(squared = False))

    # Evaluation only: put the layers in inference mode and skip gradients
    test_model.eval()

    with torch.no_grad():
        for ds in loader['test']:
            preds = test_model(ds['image'], ds['features'])
            metric(preds, ds['target'])

    avg_loss = metric.compute().item()
    print(f'Got average metric score of {avg_loss}')

    test_results['version'].append(i['model_version'])
    test_results['metric'].append(avg_loss)

if not test_results['metric']:
    raise ValueError('No registered model version could be tested')

# The test metric is always RMSE, so the lowest score is the best one
# regardless of the direction of the metric monitored during training
best_index = test_results['metric'].index(min(test_results['metric']))

best_version = test_results['version'][best_index]
best_version

Testing models:/m-abee0a05537e48c984c97884ea2e7e09 (v1)...



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00,  8.09it/s] 


Got average metric score of 0.3163958191871643


'1'

In [23]:
# Nothing to do when the winning version already carries the alias
already_best = bool(current_best) and current_best.version == best_version

if not already_best:
    # Give the "best" alias to a specific model version
    client.set_registered_model_alias(
        registered_model_name,
        alias = 'best',
        version = best_version
    )

    # Set tag when the model was marked as the best
    marked_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    client.set_model_version_tag(
        registered_model_name,
        version = best_version,
        key = 'best',
        value = marked_at
    )

# Check if the new alias is registered properly
client.get_model_version_by_alias(registered_model_name, 'best')

<ModelVersion: aliases=['best'], creation_timestamp=1752837277992, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1752837277992, metrics=None, model_id=None, name='dev.pawpaw-model', params=None, run_id='1a80b9c1ab534d468d85cc36c8a2250a', run_link='', source='models:/m-abee0a05537e48c984c97884ea2e7e09', status='READY', status_message=None, tags={'best': '2025-07-18 19:06:09'}, user_id='', version='1'>