In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install wandb
!pip install catalyst



In [None]:
# Authenticate the Weights & Biases CLI for this runtime.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mdimaorekhov[0m (use `wandb login --relogin` to force relogin)


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision.models as models
from torchvision import transforms, datasets
from catalyst import dl
from catalyst.contrib.callbacks import WandbLogger
from catalyst.utils import set_global_seed
from dataclasses import dataclass
from tqdm.auto import tqdm
from pathlib import Path

In [None]:
@dataclass
class Config:
    """Hyperparameters and paths for fine-tuning the ResNet-18 teacher on CIFAR-10."""

    # Run name reported to the experiment tracker. The model tuned in this
    # notebook is torchvision's ResNet-18, so the name reflects that.
    experiment_name: str = "finetune-resnet18-on-cifar-10"

    # Train-time augmentation settings.
    flip_prob: float = 0.5          # passed to RandomHorizontalFlip
    rotation_degrees: float = 25.0  # passed to RandomRotation

    # Directory handed to the runner as `logdir` (relative path).
    logdir: str = 'drive/MyDrive/logdir_tune'

    # Controls how many of the final ResNet stages (`layer4`, `layer3`, ...)
    # are unfrozen in addition to the classifier head.
    n_conv_layers_to_tune: int = 1

    # Optimisation: peak learning rate of the one-cycle schedule and Adam's
    # weight decay.
    max_lr: float = 1e-4
    weight_decay: float = 1e-6

    # Training-loop limits: epoch budget, mini-batch size for both loaders and
    # the patience given to the early-stopping callback.
    num_epochs: int = 25
    batch_size: int = 32
    patience: int = 3

    # Seed passed to `set_global_seed`.
    seed: int = 21


# Build the default configuration and seed through Catalyst's helper.
# NOTE(review): presumably this seeds the Python, NumPy and PyTorch RNGs --
# confirm against the installed Catalyst version.
config = Config()
set_global_seed(config.seed)

In [None]:
# Per-channel normalisation with the usual ImageNet statistics, shared by both
# pipelines below.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training pipeline: resize, random flip/rotation augmentation, then tensor
# conversion and normalisation.
train_pipeline = [
    transforms.Resize(256),
    transforms.RandomHorizontalFlip(p=config.flip_prob),
    transforms.RandomRotation(degrees=config.rotation_degrees),
    transforms.ToTensor(),
    normalize,
]
train_transforms = transforms.Compose(train_pipeline)

# Evaluation pipeline: same preprocessing, no random augmentation.
eval_pipeline = [
    transforms.Resize(256),
    transforms.ToTensor(),
    normalize,
]
test_transforms = transforms.Compose(eval_pipeline)

I will first fine-tune the teacher on CIFAR-10, since all pretrained torchvision models are trained on ImageNet.

ImageNet itself is too large to handle with the computational resources I have.

In [None]:
# Fetch both CIFAR-10 splits into ./data, each with its own transform pipeline.
train = datasets.CIFAR10(root='data', train=True, transform=train_transforms, download=True)
test = datasets.CIFAR10(root='data', train=False, transform=test_transforms, download=True)

# Report the split sizes, one per line (train first, then test).
print(len(train), len(test), sep='\n')

# Number of target classes; used to size the classifier head and the accuracy metric.
N_CLASSES = 10

Files already downloaded and verified
Files already downloaded and verified
50000
10000


In [None]:
# Load torchvision's ResNet-18 with pretrained weights and put it in eval mode;
# the parts selected for tuning are switched back to train mode further down.
teacher = models.resnet18(pretrained=True).eval()

In [None]:
def freeze_module(module: nn.Module):
    """Turn off gradient tracking for every parameter of ``module``, in place."""
    module.requires_grad_(False)


def unfreeze_module(module: nn.Module):
    """Turn gradient tracking back on for every parameter of ``module``, in place."""
    module.requires_grad_(True)


# Freeze the whole pretrained network first; only the parts re-enabled below
# receive gradient updates.
freeze_module(teacher)

# Replace the pretrained classifier with a fresh head sized for this dataset.
# The input width is read from the layer being replaced instead of hard-coding
# 512, so the cell keeps working if the backbone is swapped.
teacher.fc = nn.Linear(teacher.fc.in_features, N_CLASSES)

# torchvision's ResNet exposes its residual stages as `layer1` .. `layer4`.
N_RESNET_STAGES = 4
if not 0 <= config.n_conv_layers_to_tune <= N_RESNET_STAGES:
    raise ValueError(
        f"n_conv_layers_to_tune must be in [0, {N_RESNET_STAGES}], "
        f"got {config.n_conv_layers_to_tune}"
    )

# Select the LAST `n_conv_layers_to_tune` stages (layer4, layer3, ...).
# The stop bound has to be `4 - n`; using `n` itself as the stop bound selects
# `4 - n` stages (three stages for n=1) rather than `n`.
conv_layers_to_tune = [
    getattr(teacher, f"layer{i}")
    for i in range(N_RESNET_STAGES, N_RESNET_STAGES - config.n_conv_layers_to_tune, -1)
]

# Re-enable gradients for the new head and the selected stages, and put only
# those modules into train mode.
# NOTE(review): the training runner may call `model.train()` on the whole model,
# which would also switch the frozen stages to train mode -- confirm.
for m in [teacher.fc] + conv_layers_to_tune:
    unfreeze_module(m)
    m.train()


In [None]:
# One DataLoader per stage; only the training split is shuffled.
loaders = {
    stage: DataLoader(dataset, batch_size=config.batch_size, shuffle=(stage == 'train'))
    for stage, dataset in (('train', train), ('valid', test))
}

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=teacher.parameters(),
    weight_decay=config.weight_decay,
)

# One-cycle learning-rate schedule spanning every training batch of every epoch.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer,
    max_lr=config.max_lr,
    epochs=config.num_epochs,
    steps_per_epoch=len(loaders['train']),
)

# Training callbacks: scheduler stepping in 'batch' mode, early stopping,
# top-1/3/5 accuracy and Weights & Biases logging of the run and its config.
callbacks = [
    dl.SchedulerCallback(mode='batch'),
    dl.EarlyStoppingCallback(config.patience),
    dl.AccuracyCallback(num_classes=N_CLASSES, topk_args=[1, 3, 5]),
    WandbLogger(
        project='dl-course',
        entity='dimaorekhov',
        group='distillation',
        name=config.experiment_name,
        config=dict(vars(config)),
    ),
]

In [None]:
# Create the log directory (and any missing parents) up front; a no-op when it
# already exists.
# NOTE(review): `config.logdir` is relative, so it resolves against the current
# working directory -- presumably /content on Colab, where Drive is mounted; confirm.
Path(config.logdir).absolute().mkdir(parents=True, exist_ok=True)

In [None]:
# Train on the GPU when one is available; otherwise fall back to the CPU
# instead of failing on a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
runner = dl.SupervisedRunner(device=device)

# Fine-tune the teacher with the objects assembled above, writing logs and
# checkpoints under `config.logdir`.
runner.train(
    model=teacher,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    callbacks=callbacks,
    num_epochs=config.num_epochs,
    verbose=True,
    logdir=config.logdir
)

[34m[1mwandb[0m: Currently logged in as: [33mdimaorekhov[0m (use `wandb login --relogin` to force relogin)


1/25 * Epoch (train):   0% 2/1563 [00:00<04:33,  5.71it/s, accuracy01=0.094, accuracy03=0.438, accuracy05=0.625, loss=2.299, lr=4.000e-06, momentum=0.950]


To get the last learning rate computed by the scheduler, please use `get_last_lr()`.



1/25 * Epoch (train): 100% 1563/1563 [03:19<00:00,  7.82it/s, accuracy01=0.875, accuracy03=1.000, accuracy05=1.000, loss=0.365, lr=8.151e-06, momentum=0.946]
1/25 * Epoch (valid): 100% 313/313 [00:28<00:00, 11.02it/s, accuracy01=0.938, accuracy03=1.000, accuracy05=1.000, loss=0.390]
[2020-11-30 10:55:46,143] 
1/25 * Epoch 1 (_base): lr=8.151e-06 | momentum=0.9457
1/25 * Epoch 1 (train): accuracy01=0.6608 | accuracy03=0.8787 | accuracy05=0.9438 | loss=1.1048 | lr=5.392e-06 | momentum=0.9485
1/25 * Epoch 1 (valid): accuracy01=0.8515 | accuracy03=0.9747 | accuracy05=0.9944 | loss=0.4889
2/25 * Epoch (train): 100% 1563/1563 [03:19<00:00,  7.82it/s, accuracy01=0.750, accuracy03=1.000, accuracy05=1.000, loss=0.534, lr=1.988e-05, momentum=0.933]
2/25 * Epoch (valid): 100% 313/313 [00:28<00:00, 11.05it/s, accuracy01=1.000, accuracy03=1.000, accuracy05=1.000, loss=0.092]
[2020-11-30 10:59:39,706] 
2/25 * Epoch 2 (_base): lr=1.988e-05 | momentum=0.9335
2/25 * Epoch 2 (train): accuracy01=0.8631 |

VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy01/train,0.95326
accuracy03/train,0.99602
accuracy05/train,0.99934
loss/train,0.13594
lr/train,0.0001
momentum/train,0.85022
accuracy01/valid,0.9369
accuracy03/valid,0.9942
accuracy05/valid,0.9984
loss/valid,0.19245


0,1
accuracy01/train,▁▆▇▇████
accuracy03/train,▁▇██████
accuracy05/train,▁▇██████
loss/train,█▃▂▂▁▁▁▁
lr/train,▁▂▃▄▆▇██
momentum/train,█▇▆▅▃▂▁▁
accuracy01/valid,▁▆██████
accuracy03/valid,▁▇▇███▇█
accuracy05/valid,▁▆▇███▇▇
loss/valid,█▃▁▁▁▁▁▁


Top best models:
drive/MyDrive/logdir_tune/checkpoints/train.5.pth	0.1842
