In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install wandb
!pip install catalyst



In [None]:
# Log in to Weights & Biases through its CLI so the logger callback can upload runs.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mdimaorekhov[0m (use `wandb login --relogin` to force relogin)


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision.models as models
from torchvision import transforms, datasets
from catalyst import dl
from catalyst.contrib.callbacks import WandbLogger
from catalyst.utils import set_global_seed
from dataclasses import dataclass
from tqdm.auto import tqdm
from pathlib import Path

from torchvision.models.resnet import ResNet, BasicBlock

In [None]:
@dataclass
class Config:
    """Hyperparameters and bookkeeping settings for a single training run."""

    # Run name reported to the experiment logger.
    experiment_name: str = "train-from-scratch"

    # Augmentation settings: horizontal-flip probability and rotation range.
    flip_prob: float = 0.5
    rotation_degrees: float = 25

    # Directory used for training logs and checkpoints.
    logdir: str = 'logdir_from_scratch'

    # Python list literal stored as a string; decoded by `to_dict`.
    student_layers: str = "[1, 1, 1, 1]"

    # Optimizer / learning-rate schedule settings.
    max_lr: float = 1e-3
    weight_decay: float = 1e-6

    # Training-loop settings.
    num_epochs: int = 100
    batch_size: int = 32
    patience: int = 2

    # Seed for random number generators.
    seed: int = 21

    def to_dict(self):
        """Return the configuration as a plain ``dict``.

        Fields that hold a Python literal encoded as a string
        (``student_layers`` and, if present, ``student_to_teacher_layers_map``)
        are decoded into the corresponding Python object; every other field
        is copied through unchanged.

        Returns:
            dict: mapping of field name to (possibly decoded) value.

        Raises:
            ValueError, SyntaxError: if a string-encoded field does not
                contain a valid Python literal.
        """
        # Function-scope import keeps this class usable on its own.
        import ast

        literal_fields = ("student_layers", "student_to_teacher_layers_map")
        # `ast.literal_eval` only accepts literals (lists, dicts, numbers, ...),
        # so a malformed or malicious string cannot execute arbitrary code the
        # way a bare `eval` would.
        return {
            key: ast.literal_eval(val) if key in literal_fields else val
            for key, val in self.__dict__.items()
        }


# Create the run configuration with its default values and pass the configured
# seed to catalyst's `set_global_seed` before any model or data objects are built.
config = Config()
set_global_seed(config.seed)

In [None]:
# Size passed to `transforms.Resize` in both pipelines.
RESIZE_BY = 256

# Per-channel normalization; these are the widely used ImageNet channel statistics.
normalize = transforms.Normalize(
    std=[0.229, 0.224, 0.225],
    mean=[0.485, 0.456, 0.406],
)

# Final steps shared by the train and test pipelines.
_tensor_and_normalize = [transforms.ToTensor(), normalize]

# Training pipeline: resize, random flip/rotation augmentation, then tensor + normalize.
train_transforms = transforms.Compose(
    [
        transforms.Resize(RESIZE_BY),
        transforms.RandomHorizontalFlip(p=config.flip_prob),
        transforms.RandomRotation(degrees=config.rotation_degrees),
    ]
    + _tensor_and_normalize
)

# Evaluation pipeline: same preprocessing without the random augmentations.
test_transforms = transforms.Compose(
    [transforms.Resize(RESIZE_BY)] + _tensor_and_normalize
)

I will first fine-tune the teacher on CIFAR10, since all pretrained torchvision models are trained on ImageNet.

ImageNet itself is too large to handle with the computational resources I have.

In [None]:
# Number of target classes in CIFAR10.
N_CLASSES = 10

# Both splits are stored under (and downloaded to, if missing) the same root directory.
DATA_ROOT = 'data'
train = datasets.CIFAR10(root=DATA_ROOT, train=True, transform=train_transforms, download=True)
test = datasets.CIFAR10(root=DATA_ROOT, train=False, transform=test_transforms, download=True)

# Report the split sizes: training set first, then test set.
for split in (train, test):
    print(len(split))

Files already downloaded and verified
Files already downloaded and verified
50000
10000


In [None]:
# Build the student network: a ResNet of BasicBlocks whose per-stage block
# counts come from the (string-encoded) `student_layers` config field.
student = ResNet(
    block=BasicBlock,
    layers=config.to_dict()["student_layers"],
    num_classes=N_CLASSES,
)

In [None]:
# Data loaders. Note that the 'valid' loader iterates over the CIFAR10 test split.
train_loader = DataLoader(train, batch_size=config.batch_size, shuffle=True)
valid_loader = DataLoader(test, batch_size=config.batch_size)
loaders = {'train': train_loader, 'valid': valid_loader}

# Loss and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(student.parameters(), weight_decay=config.weight_decay)

# One-cycle learning-rate schedule sized for the full run
# (num_epochs * batches per training epoch).
steps_per_epoch = len(train_loader)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=config.max_lr,
    epochs=config.num_epochs,
    steps_per_epoch=steps_per_epoch,
)

# Callbacks: scheduler stepping in 'batch' mode, early stopping, top-k accuracy
# metrics, and Weights & Biases logging with the run configuration attached.
callbacks = [
    dl.SchedulerCallback(mode='batch'),
    dl.EarlyStoppingCallback(config.patience),
    dl.AccuracyCallback(topk_args=[1, 3, 5], num_classes=N_CLASSES),
    WandbLogger(
        project='dl-course',
        entity='dimaorekhov',
        group='distillation',
        name=config.experiment_name,
        config=config.to_dict(),
    ),
]

In [None]:
# Make sure the log directory exists (parents are created as needed; no error if it already exists).
Path(config.logdir).mkdir(parents=True, exist_ok=True)

In [None]:
# Train the student with catalyst's supervised runner.
runner = dl.SupervisedRunner()
runner.train(
    # What is trained and on which data.
    model=student,
    loaders=loaders,
    # Optimization setup.
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    callbacks=callbacks,
    # Run length, output location and console verbosity.
    num_epochs=config.num_epochs,
    logdir=config.logdir,
    verbose=True,
)

[34m[1mwandb[0m: Currently logged in as: [33mdimaorekhov[0m (use `wandb login --relogin` to force relogin)


1/100 * Epoch (train):   0% 1/1563 [00:00<06:33,  3.97it/s, accuracy01=0.062, accuracy03=0.312, accuracy05=0.531, loss=2.359, lr=4.000e-05, momentum=0.950]


To get the last learning rate computed by the scheduler, please use `get_last_lr()`.



1/100 * Epoch (train): 100% 1563/1563 [04:27<00:00,  5.84it/s, accuracy01=0.562, accuracy03=0.812, accuracy05=0.875, loss=1.384, lr=4.263e-05, momentum=0.950]
1/100 * Epoch (valid): 100% 313/313 [00:34<00:00,  9.10it/s, accuracy01=0.562, accuracy03=0.938, accuracy05=0.938, loss=1.282]
[2020-11-30 12:43:20,734] 
1/100 * Epoch 1 (_base): lr=4.263e-05 | momentum=0.9497
1/100 * Epoch 1 (train): accuracy01=0.4335 | accuracy03=0.7667 | accuracy05=0.8985 | loss=1.5707 | lr=4.088e-05 | momentum=0.9499
1/100 * Epoch 1 (valid): accuracy01=0.5167 | accuracy03=0.8356 | accuracy05=0.9427 | loss=1.3331
2/100 * Epoch (train): 100% 1563/1563 [04:30<00:00,  5.79it/s, accuracy01=0.625, accuracy03=0.875, accuracy05=0.938, loss=1.220, lr=5.049e-05, momentum=0.949]
2/100 * Epoch (valid): 100% 313/313 [00:34<00:00,  9.10it/s, accuracy01=0.562, accuracy03=0.875, accuracy05=1.000, loss=1.031]
[2020-11-30 12:48:25,703] 
2/100 * Epoch 2 (_base): lr=5.049e-05 | momentum=0.9489
2/100 * Epoch 2 (train): accuracy01

VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy01/train,0.90704
accuracy03/train,0.98874
accuracy05/train,0.99822
loss/train,0.26476
lr/train,0.00097
momentum/train,0.85334
accuracy01/valid,0.8782
accuracy03/valid,0.9821
accuracy05/valid,0.9955
loss/valid,0.37839


0,1
accuracy01/train,▁▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██████
accuracy03/train,▁▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██████████
accuracy05/train,▁▄▅▆▆▆▇▇▇▇▇▇▇▇█████████████
loss/train,█▆▆▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁
lr/train,▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇▇▇███
momentum/train,█████▇▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▂▁▁▁
accuracy01/valid,▁▃▃▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇█▇█████
accuracy03/valid,▁▃▄▅▅▅▆▆▆▇▆▇▇▇▇▇███████████
accuracy05/valid,▁▃▄▅▅▆▆▆▇▇▇▇█▇▇████████▇███
loss/valid,█▆▆▅▄▅▄▄▄▃▃▃▂▂▂▂▂▂▂▂▁▂▁▂▁▁▁


Top best models:
logdir_from_scratch/checkpoints/train.25.pth	0.3744
