In [1]:
import os
import json
import polars as pl
from PIL import Image
from copy import deepcopy
from plotly.subplots import make_subplots

import torch
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader
from torchvision.io import decode_image
from torchvision.transforms import v2

import torchmetrics as tm
from torchinfo import summary
from lightning.fabric import Fabric

import mlflow

In [2]:
# Common config/params for easy access and edit
# Optimizer, loss function, etc. will be added later
# If we declare here, the autocomplete will not work
cfg = {
    'data_dir': '../data',
    'model_dir': 'model',
    'img_size': (128, 128),
    'seed': 1337,
    'lr': 0.001,
    'batch_size': 64,
    'epochs': 20,
    'monitor': 'val_bce',
    'patience': 5
}

tags = {
    'developer': 'Andhika',
    'model': 'PyTorch',
    'format': 'ipynb',
    'type': 'CNN'
}

# Set MLFlow to track current experiment
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('pawpaw-experiment')
mlflow.enable_system_metrics_logging()

2025/06/07 21:08:12 INFO mlflow.tracking.fluent: Experiment with name 'pawpaw-experiment' does not exist. Creating a new experiment.


In [3]:
# Use dict so we can iterate the keys if needed
# Looping variable directly will create an object copy
# By using dict keys we will get a reference instead of copy
df = {}

# Take only 1000 sample images
df['train'] = pl.read_csv(f'{cfg['data_dir']}/train.csv')
df['train'] = df['train'].sample(n = 1000, shuffle = True, seed = cfg['seed'])

# Train-validation split (800/200)
df['val'] = df['train'].tail(200)
df['train'] = df['train'].head(800)

df['train']

Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""f35cd61df35c10d103128cedd62939…",0,1,1,1,0,0,0,0,0,0,0,0,25
"""ba0fcc8a37286942e2c8b371242191…",0,1,1,1,0,0,0,0,1,1,0,0,40
"""d1c4f27d9f606497f90d0770fcb006…",0,1,1,1,0,0,0,0,0,0,0,0,38
"""6d66abfff024243564764a27cb8258…",0,1,1,1,0,1,0,0,0,0,0,0,49
"""76b49ec9e93679193ead2bc697978a…",0,1,1,1,0,0,0,0,0,0,0,0,40
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ba72b0442c45364dfa3ce88c4ee8cf…",0,0,1,0,0,0,1,1,0,1,1,0,60
"""07ed3551e1cdc7ccf14990420e1120…",0,1,1,1,0,0,0,0,0,0,0,0,21
"""93668760e9a4c9438be39119dd9525…",0,1,1,1,0,0,0,0,0,0,0,0,33
"""e8e745df7d70fac506a0eb1e25cac0…",0,0,1,1,0,0,0,0,0,0,0,0,30


In [4]:
fig = make_subplots(
    rows = 2,
    cols = 5,
    subplot_titles = df['train']['Pawpularity'][0:10]
)

for i in range(2):
    for j in range(5):
        image = df['train']['Id'][i * 5 + j]
        image = f'{cfg['data_dir']}/train/' + image + '.jpg'

        with Image.open(image) as image:
            image = image.resize(cfg['img_size'])

            fig.add_image(
                z = image,
                row = i + 1,
                col = j + 1
            )

fig.update_layout(title = 'Pawpularity Sample')
fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)

fig.show()

In [5]:
fig = make_subplots(rows = 2, cols = 1)

for i, key in enumerate(['train', 'val']):
    fig.add_histogram(
        x = df[key]['Pawpularity'],
        name = key,
        row = i + 1,
        col = 1
    )

fig.update_layout(title = 'Pawpularity Histogram')

In [6]:
class PawDataset(Dataset):
    def __init__(self, df: pl.DataFrame, img_dir: str, img_transform = None, transform = None):
        self.df = df
        self.img_dir = img_dir
        self.img_transform = img_transform
        self.transform = transform

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        # Image data
        img_col = self.df['Id'][index]
        img_path = os.path.join(self.img_dir, img_col)
        img_path = os.path.abspath(img_path + '.jpg')
        image = decode_image(img_path)

        # Target (must be 2D even if there's only 1 column)
        target = self.df.select(pl.col('Pawpularity') / 100)
        target = target.to_torch()[index]

        # Tabular data (the rest of the columns)
        features = self.df.select(pl.exclude('Id', 'Pawpularity'))
        features = features.to_torch()[index]

        if self.img_transform:
            image = self.img_transform(image)

        if self.transform:
            image = self.transform(image)
            features = self.transform(features)
            target = self.transform(target)

        # Return dict instead of tuple for clarity
        return {
            'image': image,
            'features': features,
            'target': target
        }

In [7]:
loader = {}

for key in ['train', 'val']:
    # Resize all images to have the same size
    img_transform = [ v2.Resize(cfg['img_size']) ]
    # Convert all data types to have the same type
    transform = [ v2.ToDtype(torch.float32) ]

    # When training, apply random transformations
    # Otherwise, leave the image untouched
    if key == 'train':
        img_transform += [
            v2.RandomChoice([
                v2.RandomAffine(
                    # 2D Rotation
                    degrees = [-180, 180],
                    # 3D rotation
                    shear = [-25, 25]
                ),
                v2.ColorJitter(
                    contrast = [0.9, 1.1],
                    saturation = [0.9, 1.1],
                    hue = [-0.1, 0.1]
                )
            ])
        ]

    # Pass the dataset to the dataloader
    loader[key] = DataLoader(
        PawDataset(
            df[key],
            img_dir = f'{cfg['data_dir']}/train',
            img_transform = v2.Compose(img_transform),
            transform = v2.Compose(transform)
        ),
        batch_size = cfg['batch_size'],
        shuffle = True
    )

# Test the shape of the first batch
for ds in loader['train']:
    print('Image shape:', ds['image'].shape)
    print('Features shape:', ds['features'].shape)
    print('Target shape:', ds['target'].shape)
    break

Image shape: torch.Size([64, 3, 128, 128])
Features shape: torch.Size([64, 12])
Target shape: torch.Size([64, 1])


In [8]:
fig = make_subplots(
    rows = 2,
    cols = 5,
    subplot_titles = df['train']['Pawpularity'][0:10]
)

for i in range(2):
    for j in range(5):
        # Float type will mess up the image color
        # So we convert it back for this part only
        image = v2.functional.to_pil_image(
            v2.functional.to_dtype(
                ds['image'][i * 5 + j],
                torch.uint8
            )
        )

        fig.add_image(
            z = image,
            row = i + 1,
            col = j + 1
        )

fig.update_layout(title = 'Pawpularity Sample')
fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)

fig.show()

In [9]:
class PawModel(nn.Module):
    def __init__(self):
        super().__init__()

        self.img_input = nn.Sequential(
            nn.LazyBatchNorm2d(),

            nn.LazyConv2d(16, 3, padding = 'same'),
            nn.MaxPool2d(2),

            nn.LazyConv2d(32, 3, padding = 'same'),
            nn.MaxPool2d(2),

            nn.LazyConv2d(64, 3, padding = 'same'),
            nn.MaxPool2d(2),

            nn.LazyConv2d(128, 3, padding = 'same'),
            nn.MaxPool2d(2),

            nn.Flatten(),
            nn.LazyLinear(128)
        )

        self.feat_input = nn.LazyLinear(128)

        self.comb_input = nn.Sequential(
            nn.LazyBatchNorm1d(),
            nn.LazyLinear(1),
            nn.Sigmoid()
        )

    def forward(self, img_inputs: Tensor, feat_inputs: Tensor) -> Tensor:
        out1 = self.img_input(img_inputs)
        out2 = self.feat_input(feat_inputs)

        # Combine the previous layer output
        out3 = torch.cat([out1, out2], dim = 1)
        out3 = self.comb_input(out3)

        return out3

model = PawModel()
summary(model, input_data = (ds['image'], ds['features']))

Layer (type:depth-idx)                   Output Shape              Param #
PawModel                                 [64, 1]                   --
├─Sequential: 1-1                        [64, 128]                 --
│    └─BatchNorm2d: 2-1                  [64, 3, 128, 128]         6
│    └─Conv2d: 2-2                       [64, 16, 128, 128]        448
│    └─MaxPool2d: 2-3                    [64, 16, 64, 64]          --
│    └─Conv2d: 2-4                       [64, 32, 64, 64]          4,640
│    └─MaxPool2d: 2-5                    [64, 32, 32, 32]          --
│    └─Conv2d: 2-6                       [64, 64, 32, 32]          18,496
│    └─MaxPool2d: 2-7                    [64, 64, 16, 16]          --
│    └─Conv2d: 2-8                       [64, 128, 16, 16]         73,856
│    └─MaxPool2d: 2-9                    [64, 128, 8, 8]           --
│    └─Flatten: 2-10                     [64, 8192]                --
│    └─Linear: 2-11                      [64, 128]                 1,048,7

Also save information below for easy model overview later

In [10]:
# Model input and output signature
signature = mlflow.models.infer_signature(
    {
        'img_inputs': ds['image'].detach().numpy(),
        'feat_inputs': ds['features'].detach().numpy()
    },
    model(ds['image'], ds['features']).detach().numpy(),
)

# Model layer information
model_str = str(model)
print(model_str)

PawModel(
  (img_input): Sequential(
    (0): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Flatten(start_dim=1, end_dim=-1)
    (10): Linear(in_features=8192, out_features=128, bias=True)
  )
  (feat_input): Linear(in_features=12, out_features=128, bias=True)
  (comb_input): Sequential(
    (0): BatchNorm1d(256, eps=1e-05, mome

Declare custom class to help reduce code in the training loop

In [11]:
# Taken and modified from Keras (keras.callbacks.EarlyStopping)
# The Lightning version is too complicated and incompatible with Fabric
class EarlyStopping:
    def __init__(self, monitor = 'val_loss', patience = 0, mode = 'min'):
        super().__init__()
        self.monitor = monitor
        self.patience = patience
        self.mode = mode
        self.on_train_begin()

    def on_train_begin(self):
        self.wait = 0
        self.best = None
        self.best_epoch = 0
        self.best_logs = None
        self.stop_training = False

    def on_epoch_end(self, epoch, logs: dict):
        current = logs.get(self.monitor)
        if current is None: return

        if self._is_better(current, self.best):
            self.best = current
            self.best_epoch = epoch
            self.best_logs = deepcopy(logs)
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.stop_training = True

    def _is_better(self, current, best):
        if best is None: return True

        if self.mode == 'min':
            return current < best
        else:
            return current > best


# Wrap PyTorch loss function as Lightning metric
# To automatically get average loss at the end of epoch
class LossWrapper(tm.Metric):
    def __init__(self, loss_cls: nn.Module, **kwargs):
        super().__init__(**kwargs)
        self.loss_fn = loss_cls(reduction = 'sum')

        self.add_state('sum_loss', default = torch.tensor(0.0), dist_reduce_fx = 'sum')
        self.add_state('total', default = torch.tensor(0), dist_reduce_fx = 'sum')

    # TODO: Sync with Fabric precision setting?
    def update(self, preds: Tensor, target: Tensor) -> None:
        if preds.shape != target.shape:
            raise ValueError("Predictions and target must have the same shape")

        with torch.no_grad():
            self.sum_loss += self.loss_fn(preds, target)
            self.total += target.shape[0]

    def compute(self) -> Tensor:
        return self.sum_loss / self.total


# Quickly save/load text file
class QSave:
    @staticmethod
    def save(obj: dict | str, path: str) -> None:
        with open(path, 'w') as f:
            if type(obj) == dict:
                json.dump(obj, f, indent = 2)
            else:
                f.write(obj)

    @staticmethod
    def load(path: str) -> dict | str:
        with open(path, 'w') as f:
            if path.rsplit('.')[-1] == 'json':
                return json.load(f)
            else:
                return f.read()

In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr = cfg['lr'])
criterion = torch.nn.BCELoss()

metrics = {
    'bce': LossWrapper(torch.nn.BCELoss),
    'rmse': tm.MeanSquaredError(squared = False)
}

cb = {
    'early_stop': EarlyStopping(
        monitor = cfg['monitor'],
        patience = cfg['patience'],
        mode = 'min'
    )
}

# ----------

# Save optimizer and criterion name to config
# Should be done before passing it to Fabric
cfg['optimizer'] = optimizer.__class__.__name__
cfg['criterion'] = criterion.__class__.__name__
print(cfg)

# ----------

# Initiate Fabric to move all tensors to GPU
# Without having to call "to_device" everywhere
fabric = Fabric(accelerator = 'gpu')
fabric.seed_everything(cfg['seed'])

model, optimizer = fabric.setup(model, optimizer)
for key in ['train', 'val']:
    loader[key] = fabric.setup_dataloaders(loader[key])
for key in metrics.keys():
    metrics[key] = fabric.setup_module(metrics[key])

{'data_dir': '../data', 'model_dir': 'model', 'img_size': (128, 128), 'seed': 1337, 'lr': 0.001, 'batch_size': 64, 'epochs': 20, 'monitor': 'val_bce', 'patience': 5, 'optimizer': 'Adam', 'criterion': 'BCELoss'}


Seed set to 1337


For examples on how to use Fabric on distributed setup, refer to [this link](https://lightning.ai/docs/fabric/2.5.1/examples/)

See also Fabric [methods](https://lightning.ai/docs/fabric/2.5.1/api/fabric_methods.html), [model hook](https://lightning.ai/docs/fabric/2.5.1/guide/lightning_module.html), [callbacks](https://lightning.ai/docs/fabric/2.5.1/guide/callbacks.html), [launch function](https://lightning.ai/docs/fabric/2.5.1/fundamentals/notebooks.html), and [model saving](https://lightning.ai/docs/fabric/2.5.1/guide/checkpoint/checkpoint.html). Those may be different from my current implementation

In [13]:
with mlflow.start_run():
    # Logs for current and all epochs
    logs = {}
    history = {}

    # Log things that won't change
    mlflow.set_tags(tags)
    mlflow.log_params(cfg)

    # Reset early stop state
    cb['early_stop'].on_train_begin()

    # ----------

    for epoch in range(1, cfg['epochs'] + 1):
        # ----------
        # Training epoch start

        model.train()

        for step, ds in enumerate(loader['train']):
            preds = model(ds['image'], ds['features'])
            loss = criterion(preds, ds['target'])

            # Backward pass
            optimizer.zero_grad()
            fabric.backward(loss)
            # Update parameters (weights)
            optimizer.step()

            for name in metrics:
                metrics[name](preds, ds['target'])

        # ----------
        # Training epoch end

        for name in metrics:
            logs[name] = metrics[name].compute().item()
            metrics[name].reset()

        # ----------
        # Validation epoch start

        model.eval()

        with torch.no_grad():
            for step, ds in enumerate(loader['val']):
                preds = model(ds['image'], ds['features'])

                for name in metrics:
                    metrics[name](preds, ds['target'])

        # ----------
        # Validation epoch end

        for name in metrics:
            logs['val_' + name] = metrics[name].compute().item()
            metrics[name].reset()

        cb['early_stop'].on_epoch_end(epoch, logs)

        # ----------
        # Misc at the end of each epoch

        logs['epoch'] = epoch
        print(f'End of epoch {epoch}: {logs}')

        # Append current epoch logs to history
        for name in logs.keys():
            result = history.get(name, [])
            history[name] = result + [ logs[name] ]

        # Export best model and history
        if cb['early_stop'].best_epoch == epoch:
            print('Saving best model so far...')

            mlflow.pytorch.log_model(
                model,
                artifact_path = cfg['model_dir'],
                conda_env = 'conda.yaml',
                signature = signature
            )

            # MLFlow artifacts are stored on a different base directory
            # So the paths below are not the same dir as the artifact path above
            torch.save(optimizer.state_dict(), cfg['model_dir'] + '/optimizer.pth')
            QSave.save(model_str, cfg['model_dir'] + '/model.txt')
            QSave.save(history, cfg['model_dir'] + '/history.json')
            # This will copy above files to artifacts folder
            mlflow.log_artifacts(cfg['model_dir'])

            fabric.barrier()

        # Log things that may change on each epoch
        mlflow.log_metrics(logs, epoch)

        # Stop training if signaled by early stop
        if cb['early_stop'].stop_training:
            print(f'Early stopping...')
            # Append best metrics at the end of log
            mlflow.log_metrics(
                cb['early_stop'].best_logs,
                epoch + 1
            )
            break

2025/06/07 21:08:18 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


End of epoch 1: {'bce': 0.8206877708435059, 'rmse': 0.298488050699234, 'val_bce': 1.1306273937225342, 'val_rmse': 0.4544859826564789, 'epoch': 1}
Saving best model so far...
End of epoch 2: {'bce': 0.6872490048408508, 'rmse': 0.23157314956188202, 'val_bce': 0.6766864061355591, 'val_rmse': 0.2227381020784378, 'epoch': 2}
Saving best model so far...
End of epoch 3: {'bce': 0.6732938289642334, 'rmse': 0.2162049561738968, 'val_bce': 0.670172929763794, 'val_rmse': 0.21593600511550903, 'epoch': 3}
Saving best model so far...
End of epoch 4: {'bce': 0.668694257736206, 'rmse': 0.2108852118253708, 'val_bce': 0.6652567982673645, 'val_rmse': 0.21022747457027435, 'epoch': 4}
Saving best model so far...
End of epoch 5: {'bce': 0.6682420372962952, 'rmse': 0.2103419005870819, 'val_bce': 0.6621376276016235, 'val_rmse': 0.20659972727298737, 'epoch': 5}
Saving best model so far...
End of epoch 6: {'bce': 0.6646311283111572, 'rmse': 0.20608779788017273, 'val_bce': 0.6626778244972229, 'val_rmse': 0.207224

2025/06/07 21:12:22 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/06/07 21:12:22 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


End of epoch 14: {'bce': 0.6647847890853882, 'rmse': 0.20626235008239746, 'val_bce': 0.6628236174583435, 'val_rmse': 0.2073042392730713, 'epoch': 14}
Early stopping...
🏃 View run gregarious-frog-659 at: http://localhost:5000/#/experiments/602131988262011820/runs/c9b5c79adc8045e3901fd9211a8f4804
🧪 View experiment at: http://localhost:5000/#/experiments/602131988262011820


The pawpularity prediction is on average missed by 20 points, but whatever, I'm going to practice more on the MLOps part rather than the ML itself so we can stop here

In [14]:
cb['early_stop'].best_logs

{'bce': 0.6648518443107605,
 'rmse': 0.20626994967460632,
 'val_bce': 0.6601429581642151,
 'val_rmse': 0.20429538190364838,
 'epoch': 8}