# In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import Subset
import numpy as np

# Preprocessing applied to every image: convert to a tensor, then normalize
# with a fixed mean / std (presumably the MNIST training-set statistics).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

train_dataset = datasets.MNIST(
    root="./data", train=True, download=True, transform=transform
)

targets = train_dataset.targets.numpy()

# Fraction of each digit's samples to keep; digits not listed keep all samples.
imbalance_ratio = {
    8: 0.1,   # 10% of samples
    9: 0.01,  # 1% of samples
}

# A dedicated, seeded generator makes the random subsample reproducible across
# runs and independent of whatever else touches the global NumPy RNG state.
SUBSAMPLE_SEED = 0
rng = np.random.default_rng(SUBSAMPLE_SEED)

indices = []
for digit in range(10):
    digit_indices = np.where(targets == digit)[0]
    if digit in imbalance_ratio:
        n_keep = int(len(digit_indices) * imbalance_ratio[digit])
        digit_indices = rng.choice(digit_indices, n_keep, replace=False)
    # Store plain Python ints rather than NumPy scalars.
    indices.extend(digit_indices.tolist())

imbalanced_train_dataset = Subset(train_dataset, indices)


import matplotlib.pyplot as plt

# Look up the label of every retained sample and tally samples per digit.
labels = train_dataset.targets[indices].numpy()
unique, counts = np.unique(labels, return_counts=True)

# One bar per digit value present in the subset.
plt.bar(x=unique, height=counts)
plt.title("Class distribution (Imbalanced MNIST)")
plt.ylabel("Number of samples")
plt.xlabel("Digit")
plt.show()

import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """Small CNN classifier: two 3x3 convolutions, one max-pool, two linear layers.

    The convolutions use stride 1 and no padding, so a 28x28 input shrinks to
    26x26 and then 24x24; the 2x2 max-pool halves that to 12x12. With 64
    output channels this gives the 64 * 12 * 12 = 9216 features that the first
    linear layer expects, so other spatial input sizes are not supported.

    Args:
        in_channels: Number of channels in the input images.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, in_channels: int = 1, num_classes: int = 10) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # 64 channels * 12 * 12 spatial positions after conv/conv/pool on 28x28.
        self.fc1 = nn.Linear(64 * 12 * 12, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return unnormalized class scores of shape (batch, num_classes)."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)


def per_class_accuracy(model, dataloader, device, num_classes=10):
    """Compute the classification accuracy separately for each class.

    Args:
        model: Module whose output has one score per class along dim 1.
        dataloader: Iterable yielding ``(inputs, labels)`` batches; labels
            must be a 1-D tensor of integer class indices.
        device: Device the inputs and labels are moved to before inference.
        num_classes: Number of classes to report (defaults to 10).

    Returns:
        NumPy array of length ``num_classes`` where entry ``i`` is the
        fraction of samples labelled ``i`` that were predicted as ``i``.
        Classes with no samples yield NaN (0 / 0). Labels outside
        ``[0, num_classes)`` are ignored.

    Note:
        The model is switched to eval mode and is left in that mode.
    """
    # Integer counters kept on `device` so no per-class host sync is needed.
    correct = torch.zeros(num_classes, dtype=torch.long, device=device)
    total = torch.zeros(num_classes, dtype=torch.long, device=device)

    model.eval()
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            preds = model(x).argmax(dim=1)

            # Drop labels that do not belong to a reported class; bincount
            # would otherwise grow past num_classes or reject negatives.
            in_range = (y >= 0) & (y < num_classes)
            y_valid = y[in_range]
            hits = y_valid[preds[in_range] == y_valid]

            total += torch.bincount(y_valid, minlength=num_classes)
            correct += torch.bincount(hits, minlength=num_classes)

    # Integer / integer is true division in torch, giving a float result.
    return (correct / total).cpu().numpy()

# NOTE(review): `model`, `test_loader` and `device` are not defined in the
# visible part of this file; they must exist before this point — confirm.
acc = per_class_accuracy(model, test_loader, device)

# One bar per digit; the y-axis is fixed to the full [0, 1] accuracy range.
plt.bar(x=range(10), height=acc)
plt.ylim(0, 1)
plt.title("Per-class accuracy on imbalanced MNIST")
plt.ylabel("Accuracy")
plt.xlabel("Digit")
plt.show()
