# Set up MLflow tracking

In [1]:
import os

import mlflow

# The tracking server can be overridden through the MLFLOW_TRACKING_URI
# environment variable so the notebook is usable outside the original network;
# the LAN address is kept as the fallback to preserve the previous default.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI") or "http://192.168.100.37:5000"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment('fashion-mnist')          # set the experiment
mlflow.pytorch.autolog()                        # turn on MLflow's automatic logging for PyTorch training

# Echo the client version and the URI actually in effect for the record.
print(mlflow.__version__)
print(mlflow.get_tracking_uri())



2.4.1
http://192.168.100.37:5000


# Set up PyTorch

Libraries and DataLoaders

In [2]:
import os
import torch
import pytorch_lightning as pl
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch import nn
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torchvision import transforms
from torchmetrics import Accuracy

In [3]:
# Choose the compute backend in order of preference:
# CUDA GPU first, then the macOS GPU backend (MPS), otherwise plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

# torchvision transform that converts image data to a PIL image.
# NOTE(review): not used in the cells visible here — presumably for displaying samples later; confirm.
transform = transforms.ToPILImage()

Using cuda device


## Download Datasets

In [4]:
# Fetch both FashionMNIST splits from the open datasets into ./data.
# The two calls differ only in the `train` flag: True yields the training
# split, False the test split. Each split gets its own ToTensor() transform.
training_data, test_data = (
    datasets.FashionMNIST(
        root="data",
        train=is_train_split,
        download=True,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)
)

In [5]:
batch_size = 64
n_epochs = 15

# Create data loaders.
# The training loader reshuffles every epoch so SGD does not see the samples
# in the same fixed order each time; the evaluation loader keeps a stable order.
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=0)
test_dataloader = DataLoader(test_data, batch_size=batch_size, num_workers=0)

print(f"Training dataset contains: {len(training_data)} sample images")
print(f"Test dataset contains: {len(test_data)} sample images")

# Peek at a single batch to report tensor shapes and to record the image
# size (HEIGHT, WIDTH) that the model definition reads later.
X, y = next(iter(test_dataloader))
print(f"Shape of X [N, C, H, W]: {X.shape}")
print(f"Shape of y: {y.shape} {y.dtype}")
# Slice instead of unpacking into `_`, which would shadow IPython's last-output variable.
HEIGHT, WIDTH = X.shape[-2:]

Training dataset contains: 60000 sample images
Test dataset contains: 10000 sample images


Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


In [6]:
# Number of target classes, taken as the length of the dataset's `classes` attribute.
num_classes = len(training_data.classes)
print(f"There are {num_classes} classes in training data")

There are 10 classes in training data


# Model Definition and Training

In [7]:
class NeuralNetwork(pl.LightningModule):
    """Linear classifier: the input is flattened and fed to one linear layer.

    Args:
        num_outputs: Number of target classes. Sets the output width of the
            linear layer and the class count of the accuracy metric.
        in_features: Number of values per flattened input sample. When left
            as ``None`` it falls back to the notebook-level ``HEIGHT * WIDTH``
            (the original behaviour), so existing calls keep working.
    """

    def __init__(self, num_outputs, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: image size recorded from a sample batch.
            in_features = HEIGHT * WIDTH
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=in_features, out_features=num_outputs)
        )
        self.loss_fn = nn.CrossEntropyLoss()
        self.acc_fn = Accuracy(task="multiclass", num_classes=num_outputs)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for ``x``."""
        return F.log_softmax(self.net(x), dim=1)

    def _shared_step(self, batch, stage):
        """Compute loss and accuracy for one batch and log them as ``{stage}_loss`` / ``{stage}_acc``."""
        x, y = batch
        # CrossEntropyLoss applies log-softmax internally and expects raw
        # scores, so feed it the network output directly rather than the
        # already-normalised result of forward().
        logits = self.net(x)
        loss = self.loss_fn(logits, y)
        # argmax is unaffected by log-softmax, so predictions match forward().
        pred = logits.argmax(dim=1)
        acc = self.acc_fn(pred, y)

        self.log(f"{stage}_loss", loss, on_epoch=True)
        self.log(f"{stage}_acc", acc, on_epoch=True)
        return loss

    def training_step(self, batch, batch_nb):
        """Run one training batch; logs ``train_loss`` / ``train_acc`` and returns the loss."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_nb):
        """Run one validation batch; logs ``val_loss`` / ``val_acc`` and returns the loss."""
        return self._shared_step(batch, "val")

    def configure_optimizers(self):
        """Return a plain SGD optimizer over all parameters with lr=1e-3."""
        return torch.optim.SGD(self.parameters(), lr=1e-3)

In [8]:
# Build the classifier with one output per class, then move it to the selected device.
model = NeuralNetwork(num_classes)
model = model.to(device)
print(model)

NeuralNetwork(
  (net): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
  (acc_fn): MulticlassAccuracy()
)


In [9]:
# Sum the element counts of every parameter tensor in the model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of model parameters: {total_params}")

Total number of model parameters: 7850


In [10]:
# Initialize a trainer that runs for at most `n_epochs` epochs; no other Trainer options are set.
trainer = pl.Trainer(max_epochs=n_epochs)
# NOTE(review): explicit barrier on the trainer's strategy right after construction;
# presumably a no-op in a single-process run — confirm whether it is still needed.
trainer.strategy.barrier()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
# Fit the model inside an explicit MLflow run; `run` is the handle to that run.
# NOTE(review): the test split is passed as the validation data, so the val_*
# metrics are measured on the same data a later test pass would use.
with mlflow.start_run() as run:
    trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader)
    # trainer.test(dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type               | Params
-----------------------------------------------
0 | net     | Sequential         | 7.9 K 
1 | loss_fn | CrossEntropyLoss   | 0     
2 | acc_fn  | MulticlassAccuracy | 0     
-----------------------------------------------
7.9 K     Trainable params
0         Non-trainable params
7.9 K     Total params
0.031     Total estimated model params size (MB)


Epoch 4: 100%|█████████████████████████████████████████████████████████████████████████████████| 938/938 [00:17<00:00, 55.04it/s, v_num=4]
Validation DataLoader 0:  14%|██████████▌                                                                | 22/157 [00:00<00:02, 53.58it/s]