# Домашняя работа 2

**Дедлайн – 23:59 23.03.25**

Сначала повторим загрузку данных и вспомогательные фнукции из ДЗ1

In [10]:
import imagehash
import matplotlib.pyplot as plt
import numpy as np
import os
import torch
import torch.nn as nn
import torchvision
import tqdm
import json

from collections import defaultdict
from PIL import Image
from sklearn.metrics import f1_score, accuracy_score, precision_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision.transforms import v2 as vision_transforms_v2
from torch.utils.tensorboard import SummaryWriter

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda:0'
elif torch.backends.mps.is_available():
    device = 'mps'
print(f'{device = }')

device = 'mps'


In [2]:
class OxfordPetDataset(Dataset):
    '''
    https://www.kaggle.com/datasets/devdgohil/the-oxfordiiit-pet-dataset
    '''

    def __init__(self, images_root):
        self.image_paths = []
        self.labels = []
        self.class_to_label = {}
        self.label_to_class = []
        for filename in os.listdir(images_root):
            # .../<images_root>/Egyptian_Mau_167.jpg
            if not filename.endswith('.jpg'):
                continue
            self.image_paths.append(os.path.join(images_root, filename))
            class_name = filename[:filename.rfind('_')]
            if class_name not in self.class_to_label:
                self.label_to_class.append(class_name)
                self.class_to_label[class_name] = len(self.class_to_label)
            self.labels.append(self.class_to_label[class_name])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert('RGB')
        label = self.labels[idx]
        return {
            "image": image,
            "label": label,
        }


class CatBreedsDataset(Dataset):
    '''
    https://www.kaggle.com/datasets/ma7555/cat-breeds-dataset
    '''

    def __init__(self, images_root):
        self.image_paths = []
        self.labels = []
        self.class_to_label = {}
        self.label_to_class = []
        for class_folder in os.listdir(images_root):
            class_folder_path = os.path.join(images_root, class_folder)
            if not os.path.isdir(class_folder_path):
                continue
            for filename in os.listdir(class_folder_path):
                # .../<images_root>/Egyptian Mau/<...>.jpg
                if not filename.endswith('.jpg'):
                    continue
                self.image_paths.append(os.path.join(images_root, class_folder, filename))
                if class_folder not in self.class_to_label:
                    self.label_to_class.append(class_folder)
                    self.class_to_label[class_folder] = len(self.class_to_label)
                self.labels.append(self.class_to_label[class_folder])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert('RGB')
        label = self.labels[idx]
        return {
            "image": image,
            "label": label,
        }


class ImagehashDeduplicationDataset(Dataset):
    def __init__(self, dataset, hash_func=imagehash.average_hash):
        self.dataset = dataset
        self.hash_func = hash_func
        self.image_hash_to_idx = dict()
        self.idx_is_duplicate_of = dict()
        self.index_mapping = list()
        for i in range(len(dataset)):
            image_hash = hash_func(dataset[i]['image'])
            if image_hash in self.image_hash_to_idx:
                self.idx_is_duplicate_of[i] = self.image_hash_to_idx[image_hash]
            else:
                self.image_hash_to_idx[image_hash] = i
                self.index_mapping.append(i)
        self.labels = [self.dataset.labels[idx] for idx in self.index_mapping]

    def __len__(self):
        return len(self.index_mapping)

    def __getitem__(self, idx):
        return self.dataset[self.index_mapping[idx]]


class TransformedDataset(Dataset):
    def __init__(self, dataset, transform):
        self.dataset = dataset
        self.transform = transform
        if hasattr(self.dataset, 'labels'):
            self.labels = self.dataset.labels
        else:
            self.labels = [self.dataset[idx]['label'] for idx in range(len(self.dataset))]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        item['image'] = self.transform(item['image'])
        return item


def collate_fn(batch):
    images = torch.stack([item['image'] for item in batch])
    labels = torch.tensor([item['label'] for item in batch], dtype=torch.long)
    return {'images': images, 'labels': labels}

train_transforms = torchvision.transforms.Compose([
    vision_transforms_v2.ToImage(),

    vision_transforms_v2.ToDtype(torch.uint8, scale=True),
    vision_transforms_v2.Resize((256, 256), antialias=True),
    vision_transforms_v2.RandomResizedCrop(size=(224, 224), antialias=True),
    vision_transforms_v2.RandomHorizontalFlip(p=0.5),
    vision_transforms_v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),

    vision_transforms_v2.ToDtype(torch.float32, scale=True),
    vision_transforms_v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

test_transforms = torchvision.transforms.Compose([
    vision_transforms_v2.ToImage(),

    vision_transforms_v2.ToDtype(torch.uint8, scale=True),
    vision_transforms_v2.Resize((224, 224), antialias=True),

    vision_transforms_v2.ToDtype(torch.float32, scale=True),
    vision_transforms_v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

main_dataset = OxfordPetDataset('./oxford-pet-dataset/images/')
num_classes = len(set(main_dataset.labels))
print(f'Dataset size: {len(main_dataset)}')
print(f'Classes count: {num_classes}')

deduplicated_main_dataset = ImagehashDeduplicationDataset(main_dataset)
print(f'Duplicate count: {len(deduplicated_main_dataset.idx_is_duplicate_of)}')
print(f'Deduplicated dataset size: {len(deduplicated_main_dataset)}')

transformed_main_dataset = TransformedDataset(deduplicated_main_dataset, test_transforms)

weak_dataset = CatBreedsDataset('./cat-breeds-dataset/images/')
transformed_weak_dataset = TransformedDataset(weak_dataset, test_transforms)

Dataset size: 7390
Classes count: 37
Duplicate count: 74
Deduplicated dataset size: 7316


In [3]:
def calculate_embeddings(dataset):
    with torch.no_grad():
        embedder = torchvision.models.resnet18(pretrained=True)
        embedder.fc = nn.Identity()
        embedder.to(device)

        dl = DataLoader(
            dataset,
            batch_size=16,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_fn,
            pin_memory=True,
        )

        all_embeddings = []
        all_labels = []

        for batch in tqdm.tqdm(dl):
            embeds = embedder(batch['images'].to(device)).cpu().numpy()
            for label, embed in zip(batch['labels'], embeds):
                all_embeddings.append(embed)
                all_labels.append(label.item())

        return np.array(all_embeddings), np.array(all_labels)

embeddings_array, _ = calculate_embeddings(transformed_main_dataset)

100%|██████████| 458/458 [00:37<00:00, 12.23it/s]


In [4]:
def get_duplicate_candidates(embeddings_array, dist_threshold):
    scaler = StandardScaler()
    normalized_embeddings = scaler.fit_transform(embeddings_array)

    knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
    knn.fit(normalized_embeddings)

    distances, indices = knn.kneighbors(normalized_embeddings)
    duplicate_pairs = defaultdict(list)

    for idx, (dists, neighbors) in enumerate(zip(distances, indices)):
        for dist, neighbor_idx in zip(dists[1:], neighbors[1:]):
            if dist < dist_threshold and idx < int(neighbor_idx):
                duplicate_pairs[(idx, int(neighbor_idx))].append(float(dist))

    return sorted(duplicate_pairs.items(), key=lambda x: x[1][0])

In [5]:
class RemoveIndicesDataset(Dataset):
    def __init__(self, dataset, remove_indices):
        self.dataset = dataset
        self.remove_indices = remove_indices
        self.index_mapping = list(set(range(len(dataset))) - set(remove_indices))
        self.labels = [self.dataset.labels[idx] for idx in self.index_mapping]

    def __getitem__(self, i):
        return self.dataset[self.index_mapping[i]]

    def __len__(self):
        return len(self.index_mapping)


remove_indices = [p[1] for p, _ in get_duplicate_candidates(embeddings_array, 0.2)]

filtered_main_dataset = RemoveIndicesDataset(
    ImagehashDeduplicationDataset(
        OxfordPetDataset('./oxford-pet-dataset/images/')
    ),
    remove_indices
)

In [6]:
train_indices, test_indices = train_test_split(
    range(len(filtered_main_dataset)),
    test_size=0.25,
    stratify=filtered_main_dataset.labels,  # разбиваем классы равномерно
    random_state=42
)

train_dataset = TransformedDataset(
    Subset(filtered_main_dataset, train_indices),
    train_transforms
)
test_dataset = TransformedDataset(
    Subset(filtered_main_dataset, test_indices),
    test_transforms
)

print(len(train_dataset), len(test_dataset))

5470 1824


In [7]:
def get_all_metrics(true_labels, predicted_labels):
    classes = sorted(set(true_labels) | set(predicted_labels))

    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'micro_f1': f1_score(true_labels, predicted_labels, average='micro', labels=classes),
        'macro_f1': f1_score(true_labels, predicted_labels, average='macro', labels=classes),
        'class_wise_precision': dict(zip(
            map(int, classes),
            map(float, precision_score(true_labels, predicted_labels, average=None, labels=classes))
        )),
        'micro_precision': precision_score(true_labels, predicted_labels, average='micro', labels=classes)
    }


In [99]:
def train_epoch(model, criterion, optimizer, train_loader, n_iter, writer):
    model.train()

    running_loss = torch.tensor(0.0, device=device)
    running_count = torch.tensor(0)

    iterable = train_loader if writer is None else tqdm.tqdm(train_loader)

    for batch in iterable:
        inputs = batch['images'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.detach() * inputs.size(0)
        running_count += inputs.size(0)

        if writer is not None:
            writer.add_scalar('train/loss', loss.detach(), n_iter)
        n_iter += inputs.size(0)

    return running_loss / running_count


def validate(model, valid_loader, n_iter, writer):
    model.eval()

    labels = []
    predictions = []

    iterable = valid_loader if writer is None else tqdm.tqdm(valid_loader)

    with torch.no_grad():
        for batch in iterable:
            inputs = batch['images'].to(device)
            outputs = model(inputs)
            labels.append(batch['labels'].numpy())
            predictions.append(outputs.argmax(dim=1).cpu().numpy())

    labels = np.concatenate(labels)
    predictions = np.concatenate(predictions)

    micro_acc = accuracy_score(labels, predictions)
    micro_f1 = f1_score(labels, predictions, average='micro')
    macro_f1 = f1_score(labels, predictions, average='macro')
    micro_precision = precision_score(labels, predictions, average='micro')

    # print(f'Micro F1: {micro_f1:.4f}\tMacro F1: {macro_f1:.4f}')
    if writer is not None:
        writer.add_scalar('test/accuracy', micro_acc, n_iter)
        writer.add_scalar('test/macro_f1', macro_f1, n_iter)
        writer.add_scalar('test/micro_f1', micro_f1, n_iter)
        writer.add_scalar('test/micro_precision', micro_precision, n_iter)
        return {
            'micro_acc': micro_acc,
            'micro_f1': micro_f1,
            'macro_f1': macro_f1,
            'micro_precision': micro_precision,
        }
    else:
        return get_all_metrics(labels, predictions)


## 1. Эксперименты с моделями без тюнинга гиперпараметров  (3 балл)


Сначала попробуем обучить не голову resnet18, а полностью разморозить все веса

In [31]:
model = torchvision.models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

model.to(device)
criterion.to(device)

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn,
    pin_memory=True,
)

writer = SummaryWriter('./logs/resnet18-full_tune')



In [32]:
for epoch in range(10):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

100%|██████████| 86/86 [01:08<00:00,  1.26it/s]


Epoch 1 train_loss: 1.7037


100%|██████████| 29/29 [00:09<00:00,  3.08it/s]
100%|██████████| 86/86 [01:04<00:00,  1.33it/s]


Epoch 2 train_loss: 1.1967


100%|██████████| 29/29 [00:09<00:00,  2.96it/s]
100%|██████████| 86/86 [01:08<00:00,  1.26it/s]


Epoch 3 train_loss: 1.0865


100%|██████████| 29/29 [00:10<00:00,  2.73it/s]
100%|██████████| 86/86 [01:08<00:00,  1.25it/s]


Epoch 4 train_loss: 1.0024


100%|██████████| 29/29 [00:10<00:00,  2.79it/s]
100%|██████████| 86/86 [01:07<00:00,  1.27it/s]


Epoch 5 train_loss: 0.9494


100%|██████████| 29/29 [00:10<00:00,  2.75it/s]
100%|██████████| 86/86 [01:07<00:00,  1.28it/s]


Epoch 6 train_loss: 0.8175


100%|██████████| 29/29 [00:10<00:00,  2.87it/s]
100%|██████████| 86/86 [01:07<00:00,  1.27it/s]


Epoch 7 train_loss: 0.8313


100%|██████████| 29/29 [00:10<00:00,  2.68it/s]
100%|██████████| 86/86 [01:08<00:00,  1.26it/s]


Epoch 8 train_loss: 0.7594


100%|██████████| 29/29 [00:11<00:00,  2.52it/s]
100%|██████████| 86/86 [01:10<00:00,  1.23it/s]


Epoch 9 train_loss: 0.7263


100%|██████████| 29/29 [00:11<00:00,  2.51it/s]
100%|██████████| 86/86 [01:11<00:00,  1.20it/s]


Epoch 10 train_loss: 0.6823


100%|██████████| 29/29 [00:11<00:00,  2.47it/s]


За то же количество итераций получилось хуже качество, ещё и дольше училось.

Здесь на изображении оранжевое - бейзлайн, файнтюн одного слоя; красное - кандидат, файнтюн всей модели.
<img src="https://i.imgur.com/0YO6vws.png" style="width:90%;"/>

Теперь попробуем сначала претрейнить голову, затем немного дотюнить всю сетку с меньшим learning rate

In [45]:
writer = SummaryWriter('./logs/resnet18-fc_then_full')

model = torchvision.models.resnet18(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
model.fc = nn.Linear(model.fc.in_features, num_classes)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(5):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

for param in model.parameters():
    param.requires_grad = True
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(5, 10):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

100%|██████████| 86/86 [00:39<00:00,  2.20it/s]


Epoch 1 train_loss: 2.2062


100%|██████████| 29/29 [00:11<00:00,  2.51it/s]
100%|██████████| 86/86 [00:39<00:00,  2.16it/s]


Epoch 2 train_loss: 1.1007


100%|██████████| 29/29 [00:10<00:00,  2.66it/s]
100%|██████████| 86/86 [00:40<00:00,  2.14it/s]


Epoch 3 train_loss: 0.8551


100%|██████████| 29/29 [00:11<00:00,  2.56it/s]
100%|██████████| 86/86 [00:41<00:00,  2.08it/s]


Epoch 4 train_loss: 0.7584


100%|██████████| 29/29 [00:11<00:00,  2.54it/s]
100%|██████████| 86/86 [00:41<00:00,  2.08it/s]


Epoch 5 train_loss: 0.6875


100%|██████████| 29/29 [00:10<00:00,  2.71it/s]
100%|██████████| 86/86 [01:08<00:00,  1.25it/s]


Epoch 6 train_loss: 0.5952


100%|██████████| 29/29 [00:11<00:00,  2.45it/s]
100%|██████████| 86/86 [01:14<00:00,  1.16it/s]


Epoch 7 train_loss: 0.4830


100%|██████████| 29/29 [00:11<00:00,  2.42it/s]
100%|██████████| 86/86 [01:14<00:00,  1.16it/s]


Epoch 8 train_loss: 0.4066


100%|██████████| 29/29 [00:10<00:00,  2.68it/s]
100%|██████████| 86/86 [01:05<00:00,  1.32it/s]


Epoch 9 train_loss: 0.3849


100%|██████████| 29/29 [00:08<00:00,  3.50it/s]
100%|██████████| 86/86 [00:55<00:00,  1.55it/s]


Epoch 10 train_loss: 0.3555


100%|██████████| 29/29 [00:08<00:00,  3.33it/s]


Такой подход помог значимо улучшить качество.

Здесь на изображении оранжевое - бейзлайн, файнтюн одного слоя; голубое - кандидат, последовательно файнтюн сначала последнего слоя, затем всей модели.
<img src="https://i.imgur.com/YqBguK9.png" style="width:90%;"/>

Попробуем новую архитектуру - efficientnet_b3 (по размеру очень похожа на resnet18). Оставим те же гиперпараметры/сетап обучения.

In [52]:
writer = SummaryWriter('./logs/effnetb3-fc_then_full')

model = torchvision.models.efficientnet_b3(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(5):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

for param in model.parameters():
    param.requires_grad = True
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(5, 10):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

Downloading: "https://download.pytorch.org/models/efficientnet_b3_rwightman-b3899882.pth" to /Users/danwallgun/.cache/torch/hub/checkpoints/efficientnet_b3_rwightman-b3899882.pth
100%|██████████| 47.2M/47.2M [00:03<00:00, 13.4MB/s]
100%|██████████| 86/86 [00:56<00:00,  1.51it/s]


Epoch 1 train_loss: 2.3710


100%|██████████| 29/29 [00:13<00:00,  2.16it/s]
100%|██████████| 86/86 [00:48<00:00,  1.77it/s]


Epoch 2 train_loss: 1.2545


100%|██████████| 29/29 [00:12<00:00,  2.40it/s]
100%|██████████| 86/86 [00:48<00:00,  1.77it/s]


Epoch 3 train_loss: 0.9758


100%|██████████| 29/29 [00:13<00:00,  2.16it/s]
100%|██████████| 86/86 [00:50<00:00,  1.71it/s]


Epoch 4 train_loss: 0.8698


100%|██████████| 29/29 [00:13<00:00,  2.18it/s]
100%|██████████| 86/86 [00:52<00:00,  1.65it/s]


Epoch 5 train_loss: 0.7915


100%|██████████| 29/29 [00:12<00:00,  2.29it/s]
100%|██████████| 86/86 [02:19<00:00,  1.63s/it]


Epoch 6 train_loss: 0.5715


100%|██████████| 29/29 [00:12<00:00,  2.38it/s]
100%|██████████| 86/86 [02:09<00:00,  1.51s/it]


Epoch 7 train_loss: 0.3931


100%|██████████| 29/29 [00:12<00:00,  2.40it/s]
100%|██████████| 86/86 [02:09<00:00,  1.50s/it]


Epoch 8 train_loss: 0.3410


100%|██████████| 29/29 [00:12<00:00,  2.40it/s]
100%|██████████| 86/86 [02:10<00:00,  1.52s/it]


Epoch 9 train_loss: 0.3207


100%|██████████| 29/29 [00:12<00:00,  2.38it/s]
100%|██████████| 86/86 [02:09<00:00,  1.51s/it]


Epoch 10 train_loss: 0.2772


100%|██████████| 29/29 [00:12<00:00,  2.40it/s]


Efficientnet, вопреки ожиданиям, не выбил лучшего качества (на imagenet метрики pretrained модели лучше resnet-ных), при этом учился сильно медленнее.

Здесь на изображении голубое - бейзлайн, файнтюн resnet последовательно головы/всей сети; розовое - кандидат, efficientnet в том же сетапе.
<img src="https://i.imgur.com/lizIxnX.png" style="width:90%;"/>

Теперь попробуем mobilenet_v3_large, он сильно эффективнее с точки зрения размера/компьюта, при этом с лучшим качеством на ImageNet согласно https://pytorch.org/vision/stable/models.html

In [57]:
writer = SummaryWriter('./logs/mobnetv3-fc_then_full')

model = torchvision.models.mobilenet_v3_large(
    weights=torchvision.models.MobileNet_V3_Large_Weights.IMAGENET1K_V2
)
for param in model.parameters():
    param.requires_grad = False
model.classifier[3] = nn.Linear(model.classifier[3].in_features, num_classes)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(5):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

for param in model.parameters():
    param.requires_grad = True
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(5, 10):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

100%|██████████| 86/86 [00:39<00:00,  2.16it/s]


Epoch 1 train_loss: 2.2170


100%|██████████| 29/29 [00:08<00:00,  3.28it/s]
100%|██████████| 86/86 [00:35<00:00,  2.42it/s]


Epoch 2 train_loss: 1.0985


100%|██████████| 29/29 [00:08<00:00,  3.52it/s]
100%|██████████| 86/86 [00:32<00:00,  2.62it/s]


Epoch 3 train_loss: 0.8527


100%|██████████| 29/29 [00:07<00:00,  3.63it/s]
100%|██████████| 86/86 [00:37<00:00,  2.32it/s]


Epoch 4 train_loss: 0.7665


100%|██████████| 29/29 [00:08<00:00,  3.54it/s]
100%|██████████| 86/86 [00:34<00:00,  2.52it/s]


Epoch 5 train_loss: 0.6978


100%|██████████| 29/29 [00:08<00:00,  3.30it/s]
100%|██████████| 86/86 [00:58<00:00,  1.46it/s]


Epoch 6 train_loss: 0.5297


100%|██████████| 29/29 [00:08<00:00,  3.44it/s]
100%|██████████| 86/86 [00:53<00:00,  1.61it/s]


Epoch 7 train_loss: 0.3975


100%|██████████| 29/29 [00:08<00:00,  3.52it/s]
100%|██████████| 86/86 [00:53<00:00,  1.60it/s]


Epoch 8 train_loss: 0.3556


100%|██████████| 29/29 [00:08<00:00,  3.40it/s]
100%|██████████| 86/86 [00:53<00:00,  1.61it/s]


Epoch 9 train_loss: 0.3111


100%|██████████| 29/29 [00:09<00:00,  3.12it/s]
100%|██████████| 86/86 [00:57<00:00,  1.49it/s]


Epoch 10 train_loss: 0.2888


100%|██████████| 29/29 [00:10<00:00,  2.75it/s]


Получилось лучше, но не сильно)
На графиках есть намёки на переобучение, интересно увеличить аугментации/добавить регуляризации.

Здесь на изображении голубое - бейзлайн, файнтюн resnet последовательно головы/всей сети; розовое - кандидат, mobilenet_large в том же сетапе.
<img src="https://i.imgur.com/oWoTl4j.png" style="width:90%;"/>

Попробуем взять модель значительно больше - convnext_tiny - 28.6M параметров

In [63]:
writer = SummaryWriter('./logs/convnexttiny-fc_then_full')

model = torchvision.models.convnext_tiny(
    weights=torchvision.models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1
)
for param in model.parameters():
    param.requires_grad = False
model.classifier[2] = nn.Linear(model.classifier[2].in_features, num_classes)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(5):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

for param in model.parameters():
    param.requires_grad = True
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(5, 10):
    n_iter = epoch * len(train_loader.dataset)
    loss = train_epoch(model, criterion, optimizer, train_loader, n_iter, writer)
    print(f'Epoch {epoch+1} train_loss: {loss:.4f}')
    n_iter += len(train_loader.dataset)
    validate(model, test_loader, n_iter, writer)

100%|██████████| 86/86 [01:06<00:00,  1.28it/s]


Epoch 1 train_loss: 1.4550


100%|██████████| 29/29 [00:17<00:00,  1.62it/s]
100%|██████████| 86/86 [01:01<00:00,  1.41it/s]


Epoch 2 train_loss: 0.4672


100%|██████████| 29/29 [00:17<00:00,  1.63it/s]
100%|██████████| 86/86 [01:00<00:00,  1.42it/s]


Epoch 3 train_loss: 0.3845


100%|██████████| 29/29 [00:17<00:00,  1.63it/s]
100%|██████████| 86/86 [01:00<00:00,  1.41it/s]


Epoch 4 train_loss: 0.3347


100%|██████████| 29/29 [00:17<00:00,  1.62it/s]
100%|██████████| 86/86 [01:01<00:00,  1.41it/s]


Epoch 5 train_loss: 0.3047


100%|██████████| 29/29 [00:17<00:00,  1.61it/s]
100%|██████████| 86/86 [04:18<00:00,  3.01s/it]


Epoch 6 train_loss: 0.2924


100%|██████████| 29/29 [00:17<00:00,  1.62it/s]
100%|██████████| 86/86 [04:14<00:00,  2.95s/it]


Epoch 7 train_loss: 0.2434


100%|██████████| 29/29 [00:17<00:00,  1.65it/s]
100%|██████████| 86/86 [04:12<00:00,  2.94s/it]


Epoch 8 train_loss: 0.2015


100%|██████████| 29/29 [00:17<00:00,  1.62it/s]
100%|██████████| 86/86 [04:15<00:00,  2.97s/it]


Epoch 9 train_loss: 0.1885


100%|██████████| 29/29 [00:17<00:00,  1.62it/s]
100%|██████████| 86/86 [04:15<00:00,  2.97s/it]


Epoch 10 train_loss: 0.1896


100%|██████████| 29/29 [00:17<00:00,  1.65it/s]


В ~2 раза большая модель действительно дала значительный буст к качеству.

Здесь на изображении голубое - resnet, зеленое mobnet, белое - convnext в том же сетапе
<img src="https://i.imgur.com/dAMKDl3.png" style="width:90%;"/>

## 2. Эксперименты с моделями с тюнингом гиперпараметров  (4 балл)

- выбрать не менее двух моделей из предыдущего пункта
- подберите гиперпараметры (максимум 3 балла)
    - использование Grid Search / Random Search для оптимизации одного гиперпараметра – 0.5 балла
    - использование Grid Search / Random Search для оптимизации не менее двух гиперпараметров – 1 балл
    - использование другого метода оптимизации гиперпараметров – 1 балл за каждый
- при обучении трекать метрики с помощью wandb / clearml / mlflow или другого инструмента трекинга экспериментов (1 балл)
- приложить скриншоты метрик

Выберем mobnet и resnet модели для перебора гиперпараметров. Convnext тяжеловат и медленно учится.  

Будем перебирать learning_rate до разморозки и после, а также weight_decay.

Для простоты тестировать будем не на отдельном test сете, а на основном.  Чтобы ускорить процесс перебора, оставим только одну эпоху обучения головы и одну эпоху обучения всей сетки.

In [101]:
def train_mobnet(lr_head, lr_full, weight_decay, epochs_head=5, epochs_full=5):
    model = torchvision.models.mobilenet_v3_large(
        weights=torchvision.models.MobileNet_V3_Large_Weights.IMAGENET1K_V2
    )
    for param in model.parameters():
        param.requires_grad = False
    model.classifier[3] = nn.Linear(model.classifier[3].in_features, num_classes)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr_head, weight_decay=weight_decay)
    for epoch in range(epochs_head):
        train_epoch(model, criterion, optimizer, train_loader, n_iter, None)

    for param in model.parameters():
        param.requires_grad = True
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr_full, weight_decay=weight_decay)
    for epoch in range(epochs_head, epochs_head + epochs_full):
        train_epoch(model, criterion, optimizer, train_loader, n_iter, None)

    return validate(model, test_loader, n_iter, None)


def train_resnet(lr_head, lr_full, weight_decay, epochs_head=5, epochs_full=5):
    model = torchvision.models.resnet18(
        weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
    )
    for param in model.parameters():
        param.requires_grad = False
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr_head, weight_decay=weight_decay)
    for epoch in range(epochs_head):
        train_epoch(model, criterion, optimizer, train_loader, n_iter, None)

    for param in model.parameters():
        param.requires_grad = True
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr_full, weight_decay=weight_decay)
    for epoch in range(epochs_head, epochs_head + epochs_full):
        train_epoch(model, criterion, optimizer, train_loader, n_iter, None)

    return validate(model, test_loader, n_iter, None)

In [103]:
from sklearn.model_selection import ParameterSampler

param_grid = {
    'lr_head': np.logspace(start=-5, stop=-2, num=10),
    'lr_full': np.logspace(start=-5, stop=-2, num=10),
    'weight_decay': np.linspace(start=0.0, stop=0.3, num=10),
}

hparam_writer = SummaryWriter('./logs/mobnet-hparam_tuning')

for params in tqdm.tqdm(ParameterSampler(param_grid, n_iter=10, random_state=42)):
    metrics = train_mobnet(**params, epochs_head=1, epochs_full=1)
    hparam_writer.add_hparams(
        params,
        {
            'hparam/accuracy': metrics['accuracy'],
            'hparam/micro_f1': metrics['micro_f1'],
            'hparam/macro_f1': metrics['macro_f1'],
            'hparam/micro_precision': metrics['micro_precision'],
        }
    )

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 10/10 [15:02<00:00, 90.21s/it]


Посмотрим на результаты перебора в tensorboard:

Таблица
<img src="https://i.imgur.com/nLHAxjB.png" style="width:90%;"/>

Параллельные координаты
<img src="https://i.imgur.com/AyWZQ4q.png" style="width:90%;"/>


Как мы видим, даже за малое количество эпох получаются очень хорошие результаты при правильно выборе гиперпараметров. Интуитивно, что в топе находятся эксперименты с большими learning_rate-ами, это позволяет сделать большой прогресс за малое число эпох, а верхняя граница перебора learning rate при этом выставлена достаточно адекватно (не очень большим), чтобы обучение все ещё нормально сходилось, а не прыгало большими шагами по весам.

Попробуем сдвинуть перебираемый отрезок learning_rate-ов.

In [104]:
from sklearn.model_selection import ParameterSampler

param_grid = {
    'lr_head': np.logspace(start=-4, stop=-1, num=10),
    'lr_full': np.logspace(start=-4, stop=-1, num=10),
    'weight_decay': np.linspace(start=0.0, stop=0.3, num=10),
}

hparam_writer = SummaryWriter('./logs/mobnet-hparam_tuning-2')

for params in tqdm.tqdm(ParameterSampler(param_grid, n_iter=10, random_state=4242)):
    metrics = train_mobnet(**params, epochs_head=1, epochs_full=1)
    hparam_writer.add_hparams(
        params,
        {
            'hparam/accuracy': metrics['accuracy'],
            'hparam/micro_f1': metrics['micro_f1'],
            'hparam/macro_f1': metrics['macro_f1'],
            'hparam/micro_precision': metrics['micro_precision'],
        }
    )

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 10/10 [15:38<00:00, 93.84s/it]


Смотрим новые результаты в tensorboard:

Параллельные координаты
<img src="https://i.imgur.com/qTWDFba.png" style="width:90%;"/>

Подтверждаем интуицию, что для обучения головы нужен достаточно большой lr (видимо, оптимум приближается к, но не выше ~0.002). А для тюна полной модели нужен достаточно маленький, чтобы не портить хорошие pretrained фичи (оптимум лежит в [1e-5, 2e-4]).  
Weight decay при этом не сильно влияет на качество - ему достаточно быть не "совсем жёстким" (в нашем переборе <= 0.16).

Подвинем ещё границы перебора с обновлёнными знаниями, в этот раз переберём resnet18.

In [106]:
from sklearn.model_selection import ParameterSampler

param_grid = {
    'lr_head': np.logspace(start=-3.5, stop=-2.5, num=10),
    'lr_full': np.logspace(start=-5, stop=-3.5, num=10),
    'weight_decay': np.linspace(start=0.0, stop=0.2, num=10),
}

hparam_writer = SummaryWriter('./logs/resnet-hparam_tuning-3')

for params in tqdm.tqdm(ParameterSampler(param_grid, n_iter=10, random_state=424242)):
    metrics = train_resnet(**params, epochs_head=1, epochs_full=1)
    hparam_writer.add_hparams(
        params,
        {
            'hparam/accuracy': metrics['accuracy'],
            'hparam/micro_f1': metrics['micro_f1'],
            'hparam/macro_f1': metrics['macro_f1'],
            'hparam/micro_precision': metrics['micro_precision'],
        }
    )

100%|██████████| 10/10 [11:19<00:00, 67.93s/it]


Полные результаты отфильтровал по macro_f1 >= 0.75

Параллельные координаты  
<img src="https://i.imgur.com/IGVzdPr.png" style="width:90%;"/>

Таблица  
<img src="https://i.imgur.com/Dl41fir.png" style="width:90%;"/>

Как мы видим, все resnet-запуски оказались с около-топовыми метриками (0.8+). Значит, заданные интервалы оптимума параметров выбраны хорошо.

Попробуем обучить convnext с топ1 гиперпараметрами.

lr_head=0.0021544, lr_full=0.000046416, weight_decay=0.0

In [108]:
def train_convnext(lr_head, lr_full, weight_decay, epochs_head=5, epochs_full=5):
    model = torchvision.models.convnext_tiny(
        weights=torchvision.models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1
    )
    for param in model.parameters():
        param.requires_grad = False
    model.classifier[2] = nn.Linear(model.classifier[2].in_features, num_classes)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr_head, weight_decay=weight_decay)
    for epoch in range(epochs_head):
        train_epoch(model, criterion, optimizer, train_loader, n_iter, None)

    for param in model.parameters():
        param.requires_grad = True
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr_full, weight_decay=weight_decay)
    for epoch in range(epochs_head, epochs_head + epochs_full):
        train_epoch(model, criterion, optimizer, train_loader, n_iter, None)

    return validate(model, test_loader, n_iter, None)

metrics = train_convnext(lr_head=0.0021544, lr_full=0.000046416, weight_decay=0.0, epochs_head=1, epochs_full=1)
print(f'{metrics["macro_f1"] = }\t{metrics["micro_precision"] = }')

metrics["macro_f1"] = 0.9453548278400434	metrics["micro_precision"] = 0.9457236842105263


Получились примерно такие же метрики, как с моими дефолтными гиперами, но в 5 раз меньше эпох.

Потенциально можно выбить ещё больше качества, если учить больше эпох (в том числе можно дополнительно подобрать гиперпараметры). 

## 3. Итоговый выбор модели (1 балл)

- Проанализировать результаты обучения (0.5 балла)
- Выбрать модель, которая пойдёт "в прод", с обоснованием (0.5 балла)

- Результаты обучений проанализированы рядом с соответствующими обучениями
- Для выкатки в прод
    - я бы выбрал ConvNext, если ограничения по ресурсам/скорости позволяют, т.к. она даёт заметно лучшее качество
    - однако если требования к компактности/скорости/latency более жесткие, я бы выбрал mobilenet, т.к. она даёт лучшее качество из небольших эффективных моделей

## 4. Демо инференса модели  (2 балл)

- сделать простейший интерфейс для демо инференса модели (1 балл)
- подготовить и описать способ быстро запустить демо (1 балл)
    - необходимо выполнить более 2 действий для запуска демо – 0.5 балла
    - подготовлен скрипт, который позволяет запустить демо за 1 действие – 0.5 балла
    - запуск демо должен быть воспроизводимым на других устройствах (под управлением linux)

In [113]:
model = torchvision.models.mobilenet_v3_large(
    weights=torchvision.models.MobileNet_V3_Large_Weights.IMAGENET1K_V2
)
for param in model.parameters():
    param.requires_grad = False
model.classifier[3] = nn.Linear(model.classifier[3].in_features, num_classes)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0021544, weight_decay=0.0)
for epoch in range(1):
    train_epoch(model, criterion, optimizer, train_loader, n_iter, None)

for param in model.parameters():
    param.requires_grad = True
optimizer = torch.optim.AdamW(model.parameters(), lr=0.000046416, weight_decay=0.0)
for epoch in range(1, 2):
    train_epoch(model, criterion, optimizer, train_loader, n_iter, None)

metrics = validate(model, test_loader, n_iter, None)
print(f'{metrics["macro_f1"] = }')
torch.save(model, 'production-model.pt')

metrics["macro_f1"] = 0.9002829633133199


In [114]:
with open('label_to_class.json', 'w') as f:
    json.dump(main_dataset.label_to_class, f)

In [135]:
%%writefile web.py

import io
import json
import torch
import gradio as gr
from torchvision.transforms import v2 as vision_transforms_v2
from PIL import Image


def load_transforms():
    return vision_transforms_v2.Compose([
        vision_transforms_v2.ToImage(),

        vision_transforms_v2.ToDtype(torch.uint8, scale=True),
        vision_transforms_v2.Resize((224, 224), antialias=True),

        vision_transforms_v2.ToDtype(torch.float32, scale=True),
        vision_transforms_v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])


def load_class_names(filename):
    with open(filename, 'r') as f:
        return json.load(f)


def load_model(filename):
    model = torch.load(filename, weights_only=False, map_location='cpu')
    model.eval()
    return model


def get_prediction(image, transforms, model, label_to_class):
    image_input = transforms(image).unsqueeze(0)
    outputs = model(image_input)
    predicted = outputs.argmax(dim=1).cpu().numpy()
    label = predicted.item()
    return label, label_to_class[label]


def main():
    label_to_class = load_class_names('label_to_class.json')
    model = load_model('production-model.pt')
    transforms = load_transforms()

    fn = lambda image: get_prediction(image, transforms, model, label_to_class)

    interface = gr.Interface(
        fn=fn,
        inputs=gr.Image(type="pil"),
        outputs=[
            gr.Number(label="Class ID"),
            gr.Textbox(label="Class Name")
        ],
        title="Image Classification Demo",
        description="Upload an image to classify it using a trained model",
    )
    interface.launch(share=True)

if __name__ == '__main__':
    main()

Overwriting web.py


Чтобы запустить Web демонстрацию с интерфейсом, нужен один скрипт web.py:

In [136]:
!python web.py

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://a871fbedf17807ed27.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
^C
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a871fbedf17807ed27.gradio.live


Выглядит это так:

<img src="https://i.imgur.com/PZHI8w0.png" style="width:90%;"/>

(кстати, фотографии не было в train и ответ модели верный)  
(черипик, получается :D)