# Notebook pro trénink s destilací nad datasetem CIFAR100
V tomto notebooku je trénován MobileNetV2 nad datasetem CIFAR100; jako učitelský model je využíván ViT dotrénovaný (fine-tuned) nad stejným datasetem. 

MobileNetV2 je trénován ve třech variantách: s náhodnou inicializací, s tréninkem pouze klasifikační hlavy inicializovaného (předtrénovaného nad ImageNetem) modelu a s tréninkem celého, taktéž inicializovaného modelu. Tyto tři úlohy jsou trénovány běžným způsobem a také s pomocí destilace znalostí z výše zmíněného učitelského modelu.  

Při destilaci je využíváno předpočítaných logitů ze sešitu precompute_logits.

## Import knihoven a definice metod

In [1]:
from transformers import Trainer, EarlyStoppingCallback
from torch.utils.data import ConcatDataset, DataLoader
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Project helper returning the dataset-split selector; later cells read its
# TRAIN / EVAL / TEST members when building the datasets.
dataset_part = base.get_dataset_part()
# Dataset tag interpolated into the results/, logs/ and models/ paths below.
DATASET = "cifar100"

Inicializovaný MobileNetV2.

In [3]:
# Re-seed through the project helper before any dataset or model is created
# (presumably seeds torch/numpy/random — confirm in base.reset_seed).
base.reset_seed()

In [4]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice; the GPU name is only queried when CUDA was selected.
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


Provedení transformací nad datasetem.

In [5]:
# Base (non-augmenting) preprocessing shared by all three splits.
transform = base.base_transforms()

# Dataset directory; presumably CIFAR-100 stored together with the
# precomputed teacher logits (see the precompute_logits notebook).
logits_root = f"{os.path.expanduser('~')}/data/100-logits"

# Build the TRAIN / EVAL / TEST datasets, in that order, with identical settings.
# NOTE: `eval` shadows the Python builtin for the rest of the notebook; the name
# is kept because later cells pass it as `eval_dataset`.
train, eval, test = (
    base.CustomCIFAR100L(root=logits_root, dataset_part=split, transform=transform)
    for split in (dataset_part.TRAIN, dataset_part.EVAL, dataset_part.TEST)
)


In [6]:
# Augmenting transform pipeline from the project helpers.
augment_transform = base.aug_transforms()

# Same TRAIN split and dataset directory as `train`, but read through the
# augmenting transforms instead of the base ones.
train_aug = base.CustomCIFAR100L(root=f"{os.path.expanduser('~')}/data/100-logits", dataset_part=dataset_part.TRAIN, transform=augment_transform)

In [None]:
# Loaders for the inference benchmarks further down: batch size 1, fixed order.
# They read the plain dataset directory (~/data/100), not the "-logits" one.
train_part_cpu = base.CustomCIFAR100(root=f"{os.path.expanduser('~')}/data/100", train=True, transform=transform, device="cpu")
cpu_data_loader = DataLoader(train_part_cpu, batch_size=1, shuffle=False)
# NOTE(review): "cuda" is hard-coded here rather than reusing `device`;
# presumably this breaks on a CPU-only machine — confirm in CustomCIFAR100.
train_part_gpu = base.CustomCIFAR100(root=f"{os.path.expanduser('~')}/data/100", train=True, transform=transform, device="cuda")
gpu_data_loader = DataLoader(train_part_gpu, batch_size=1, shuffle=False)

In [7]:
# Drop augmented samples that disagree with their base counterpart (the helper's
# progress message says the comparison is based on the saved logits).
# The result is bound to a NEW name so that `train_aug` keeps the unfiltered
# dataset and re-running this cell does not filter already-filtered data.
train_aug_filtered = base.remove_diff_pred_class(train, train_aug, pytorch_dataset=True)
print(len(train_aug_filtered))
# Training set used by every experiment: base samples plus the kept augmented ones.
train_combo = ConcatDataset([train, train_aug_filtered])

Removing entries from augmented dataset that are different from the base one - based on saved logits:   0%|   …

25912


### Standardní trénink náhodně inicializovaného modelu. 

In [8]:
# Hyperparameters for the plain (non-distilled) run of the randomly initialised model.
# NOTE(review): output/logging paths use a literal "~" while other cells call
# os.path.expanduser — confirm get_training_args / TrainingArguments expands it.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/random-base_aug", logging_dir=f"~/logs/{DATASET}/random-base_aug", lr=0.0005, weight_decay=0.008, adam_beta1=.95, epochs=30)
# Randomly initialised MobileNetV2; 100 is presumably the number of output classes.
model = base.get_random_init_mobilenet(100)

In [9]:
# Standard Hugging Face Trainer: trains on base + filtered augmented data,
# validates on the EVAL split, and stops early after 5 evaluations without
# improvement (the monitored metric is configured in training_args).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)]
)

In [10]:
# Run the training loop configured above; the cell output is the per-epoch
# metrics table followed by the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0339,3.428468,0.1568,0.14121,0.1568,0.122924
2,3.2525,2.792037,0.272,0.268424,0.272,0.245007
3,2.7433,2.434695,0.3538,0.362975,0.3538,0.338309
4,2.3827,2.166278,0.4052,0.41154,0.4052,0.387232
5,2.1132,2.019456,0.4402,0.457229,0.4402,0.426739
6,1.8982,1.849174,0.4919,0.499138,0.4919,0.481684
7,1.7042,1.789123,0.5051,0.520143,0.5051,0.497635
8,1.5457,1.711955,0.5202,0.52986,0.5202,0.513841
9,1.3959,1.712025,0.5282,0.543003,0.5282,0.525432
10,1.2568,1.680816,0.54,0.542465,0.54,0.535072


TrainOutput(global_step=15450, training_loss=0.9973500474602659, metrics={'train_runtime': 3476.9151, 'train_samples_per_second': 568.711, 'train_steps_per_second': 4.444, 'total_flos': 4.2003592214092186e+18, 'train_loss': 0.9973500474602659, 'epoch': 30.0})

In [11]:
# Switch the trained model to inference mode; the cell output is the module repr.
model.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [12]:
# Score the model on the TEST split; returns the metrics dict displayed below.
trainer.evaluate(test)

{'eval_loss': 2.2956089973449707,
 'eval_accuracy': 0.5726,
 'eval_precision': 0.586915179454515,
 'eval_recall': 0.5726,
 'eval_f1': 0.5754304134536806,
 'eval_runtime': 12.5294,
 'eval_samples_per_second': 798.122,
 'eval_steps_per_second': 6.305,
 'epoch': 30.0}

In [13]:
# Persist the trained weights (state dict only).
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/random-base_aug.pth"
# torch.save does not create missing parent directories; make sure the target
# folder exists so a long training run is not lost on a fresh machine.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(model.state_dict(), weights_path)

In [None]:
# Project helper; presumably reports the model's parameter count (no output recorded).
base.count_parameters(model)

In [None]:
# Inference benchmark on the CPU using the batch-size-1 loader.
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
cpu_benchmark = base.BenchMarkRunner(model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

In [None]:
# Same benchmark on the GPU ("cuda" is hard-coded, so a CUDA device is required).
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
gpu_benchmark = base.BenchMarkRunner(model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

## Definice destilačního tréninku

Třída `base.DistilTrainer` (definovaná v modulu `base`) upravuje Hugging Face trenéra pro destilaci znalostí. Nově pracuje s logity uloženými v datasetu.

### Trénink náhodně inicializovaného modelu s pomocí destilace znalostí

In [14]:
# Re-seed before the distillation run so it starts from the same random state
# as the baseline (presumably seeds torch/numpy/random — confirm in base).
base.reset_seed()

In [15]:
# Fresh randomly initialised MobileNetV2 student; 100 is presumably the class count.
student_model = base.get_random_init_mobilenet(100)

In [16]:
# Distillation hyperparameters. remove_unused_columns=False presumably keeps the
# stored teacher logits in each batch; lambda_param / temp are presumably the
# distillation-loss weight and softmax temperature — confirm in base.
# NOTE(review): paths use a literal "~"; confirm it is expanded downstream.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/random-distill_aug", logging_dir=f"~/logs/{DATASET}/random-distill_aug", remove_unused_columns=False, epochs=30, lr=0.00047, weight_decay=0, adam_beta1=.9, lambda_param=1, temp=6)

In [17]:
# Project distillation trainer for the random-init student. No teacher model is
# passed in; per the notebook text it works with logits stored in the dataset.
# Early stopping triggers after 5 evaluations without improvement.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)]
)

In [18]:
# Run the distillation training; output is the per-epoch table and TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6684,2.517675,0.123,0.082273,0.123,0.072299
2,2.3075,2.147085,0.2269,0.200528,0.2269,0.1698
3,2.0403,1.916113,0.3095,0.329565,0.3095,0.262866
4,1.8401,1.738407,0.3673,0.396748,0.3673,0.323879
5,1.6847,1.627085,0.4166,0.435528,0.4166,0.377489
6,1.5685,1.536948,0.4434,0.463355,0.4434,0.410584
7,1.4674,1.449725,0.4776,0.511175,0.4776,0.451558
8,1.3782,1.377894,0.5053,0.525554,0.5053,0.483456
9,1.2957,1.331499,0.5125,0.54465,0.5125,0.494482
10,1.2275,1.271082,0.5348,0.544508,0.5348,0.516617


TrainOutput(global_step=15450, training_loss=1.1061525590520074, metrics={'train_runtime': 3500.6795, 'train_samples_per_second': 564.85, 'train_steps_per_second': 4.413, 'total_flos': 4.2003592214092186e+18, 'train_loss': 1.1061525590520074, 'epoch': 30.0})

In [19]:
# Switch the distilled student to inference mode; the output is the module repr.
student_model.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [20]:
# Score the distilled student on the TEST split. eval_loss comes from the
# distillation trainer — presumably not comparable to the baseline's loss.
trainer.evaluate(test)

{'eval_loss': 0.99275803565979,
 'eval_accuracy': 0.5856,
 'eval_precision': 0.6162042116757119,
 'eval_recall': 0.5856,
 'eval_f1': 0.5893476531603953,
 'eval_runtime': 13.9467,
 'eval_samples_per_second': 717.015,
 'eval_steps_per_second': 5.664,
 'epoch': 30.0}

In [21]:
# Persist the distilled student's weights (state dict only).
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/random-distill_aug.pth"
# torch.save does not create missing parent directories; make sure the target
# folder exists so a long training run is not lost on a fresh machine.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(student_model.state_dict(), weights_path)

In [None]:
# Project helper; presumably reports the model's parameter count (no output recorded).
base.count_parameters(student_model)

In [None]:
# Inference benchmark on the CPU using the batch-size-1 loader.
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

In [None]:
# Same benchmark on the GPU ("cuda" is hard-coded, so a CUDA device is required).
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

## Získání inicializovaného MobileNetV2 modelu

In [29]:
# Re-seed before the head-only baseline run
# (presumably seeds torch/numpy/random — confirm in base.reset_seed).
base.reset_seed()

In [30]:
# "Initialised" MobileNetV2 — per the notebook intro, pretrained on ImageNet;
# 100 is presumably the number of output classes.
model_pretrained = base.get_mobilenet(100)

In [31]:
# Show the architecture of the loaded model (plain-text module repr).
print(model_pretrained)

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [32]:
# Freeze the model via the project helper; per the notebook intro only the
# classification head is trained in this variant (exact layers — confirm in base).
model_pretrained = base.freeze_model(model_pretrained)

In [33]:
# Hyperparameters for the plain head-only run (same optimiser settings as the
# random-init baseline).
# NOTE(review): paths use a literal "~"; confirm it is expanded downstream.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/head-base_aug", logging_dir=f"~/logs/{DATASET}/head-base_aug", epochs=30, lr=0.0005, weight_decay=0.008, adam_beta1=.95)

In [34]:
# Standard Trainer for the frozen-backbone model; early stopping uses a
# patience of 4 evaluations here (the other baselines use 5).
trainer = Trainer(
    model=model_pretrained,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)

In [35]:
# Run the head-only training; output is the per-epoch table and TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9339,2.026314,0.5027,0.514339,0.5027,0.496137
2,2.1036,1.823858,0.5242,0.534446,0.5242,0.520287
3,1.9242,1.738237,0.542,0.55523,0.542,0.539177
4,1.8367,1.69554,0.5468,0.564201,0.5468,0.545267
5,1.7753,1.695191,0.5469,0.555206,0.5469,0.54309
6,1.728,1.690319,0.5497,0.559565,0.5497,0.544852
7,1.6962,1.663767,0.5515,0.566181,0.5515,0.549963
8,1.6691,1.663796,0.5532,0.557694,0.5532,0.547527
9,1.6507,1.668687,0.5453,0.556481,0.5453,0.543115
10,1.6295,1.648237,0.5535,0.560589,0.5535,0.550033


TrainOutput(global_step=8755, training_loss=1.7619355624910766, metrics={'train_runtime': 1398.9364, 'train_samples_per_second': 1413.474, 'train_steps_per_second': 11.044, 'total_flos': 2.380203558798557e+18, 'train_loss': 1.7619355624910766, 'epoch': 17.0})

In [36]:
# Switch the trained model to inference mode; the cell output is the module repr.
model_pretrained.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [37]:
# Score the model on the TEST split; returns the metrics dict displayed below.
trainer.evaluate(test)

{'eval_loss': 1.62684965133667,
 'eval_accuracy': 0.5585,
 'eval_precision': 0.5616395177290857,
 'eval_recall': 0.5585,
 'eval_f1': 0.5537885639858433,
 'eval_runtime': 12.7267,
 'eval_samples_per_second': 785.751,
 'eval_steps_per_second': 6.207,
 'epoch': 17.0}

In [38]:
# Persist the head-only model's weights (state dict only).
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/head-base_aug.pth"
# torch.save does not create missing parent directories; make sure the target
# folder exists so a long training run is not lost on a fresh machine.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(model_pretrained.state_dict(), weights_path)

In [None]:
# Project helper; presumably reports the model's parameter count (no output recorded).
base.count_parameters(model_pretrained)

In [None]:
# Inference benchmark on the CPU using the batch-size-1 loader.
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
cpu_benchmark = base.BenchMarkRunner(model_pretrained, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

In [None]:
# Same benchmark on the GPU ("cuda" is hard-coded, so a CUDA device is required).
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
gpu_benchmark = base.BenchMarkRunner(model_pretrained, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

### Trénink inicializovaného MobileNetV2

In [39]:
# Re-seed before the full fine-tuning baseline run
# (presumably seeds torch/numpy/random — confirm in base.reset_seed).
base.reset_seed()

In [40]:
# Fresh "initialised" MobileNetV2 (per the intro, ImageNet-pretrained); it is
# not frozen, so the whole network is trained. 100 is presumably the class count.
model_pretrained_whole = base.get_mobilenet(100)

In [41]:
# Hyperparameters for the plain full fine-tuning run; 20 epochs instead of the
# 30 used for the random-init and head-only runs.
# NOTE(review): paths use a literal "~"; confirm it is expanded downstream.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/pretrained-base_aug", logging_dir=f"~/logs/{DATASET}/pretrained-base_aug", epochs=20, lr=0.0005, weight_decay=0.008, adam_beta1=.95)

In [42]:
# Standard Trainer fine-tuning the whole pretrained network; early stopping
# triggers after 5 evaluations without improvement.
trainer = Trainer(
    model=model_pretrained_whole,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)]
)

In [43]:
# Run the full fine-tuning; output is the per-epoch table and TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4915,1.047975,0.6933,0.712134,0.6933,0.690917
2,0.7299,0.924017,0.7294,0.749419,0.7294,0.728627
3,0.4679,0.898334,0.7484,0.756741,0.7484,0.746663
4,0.3207,0.957441,0.7442,0.754756,0.7442,0.744441
5,0.2229,0.981622,0.7495,0.761695,0.7495,0.748991
6,0.1604,1.027975,0.7546,0.76552,0.7546,0.755144
7,0.1188,1.072139,0.7582,0.768324,0.7582,0.75914
8,0.0949,1.055918,0.762,0.770836,0.762,0.762276
9,0.0739,1.140815,0.7539,0.763926,0.7539,0.754559
10,0.0562,1.128205,0.7662,0.774019,0.7662,0.765329


TrainOutput(global_step=10300, training_loss=0.19342194788085604, metrics={'train_runtime': 2306.4903, 'train_samples_per_second': 571.535, 'train_steps_per_second': 4.466, 'total_flos': 2.800239480939479e+18, 'train_loss': 0.19342194788085604, 'epoch': 20.0})

In [44]:
# Switch the fine-tuned model to inference mode; the output is the module repr.
model_pretrained_whole.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [45]:
# Score the model on the TEST split; returns the metrics dict displayed below.
trainer.evaluate(test)

{'eval_loss': 1.2248632907867432,
 'eval_accuracy': 0.781,
 'eval_precision': 0.7858923387637283,
 'eval_recall': 0.7810000000000001,
 'eval_f1': 0.7803628896975415,
 'eval_runtime': 12.7768,
 'eval_samples_per_second': 782.671,
 'eval_steps_per_second': 6.183,
 'epoch': 20.0}

In [46]:
# Persist the fully fine-tuned model's weights (state dict only).
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/pretrained-base_aug.pth"
# torch.save does not create missing parent directories; make sure the target
# folder exists so a long training run is not lost on a fresh machine.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(model_pretrained_whole.state_dict(), weights_path)

In [None]:
# Project helper; presumably reports the model's parameter count (no output recorded).
base.count_parameters(model_pretrained_whole)

In [None]:
# Inference benchmark on the CPU using the batch-size-1 loader.
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
cpu_benchmark = base.BenchMarkRunner(model_pretrained_whole, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

In [None]:
# Same benchmark on the GPU ("cuda" is hard-coded, so a CUDA device is required).
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
gpu_benchmark = base.BenchMarkRunner(model_pretrained_whole, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

## Trénink s pomocí destilace znalostí inicializovaného MobileNetV2

### Trénink inicializovaného modelu - pouze klasifikační hlavy s pomocí destilace

In [59]:
# Re-seed before the head-only distillation run
# (presumably seeds torch/numpy/random — confirm in base.reset_seed).
base.reset_seed()

In [60]:
# Fresh "initialised" MobileNetV2 student (per the intro, ImageNet-pretrained);
# 100 is presumably the number of output classes.
student_model_pretrained = base.get_mobilenet(100)

In [61]:
# Freeze the student via the project helper; per the section heading only the
# classification head is trained here (exact layers — confirm in base).
student_model_pretrained = base.freeze_model(student_model_pretrained)

In [62]:
# Distillation hyperparameters, identical to the random-init distillation run.
# remove_unused_columns=False presumably keeps the stored teacher logits in each
# batch; lambda_param / temp are presumably the loss weight and temperature.
# NOTE(review): paths use a literal "~"; confirm it is expanded downstream.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/head-distill_aug", logging_dir=f"~/logs/{DATASET}/head-distill_aug", remove_unused_columns=False, epochs=30, lr=0.00047, weight_decay=0, adam_beta1=.9, lambda_param=1, temp=6)

In [63]:
# Project distillation trainer for the frozen-backbone student. Early stopping
# uses a patience of only 2 evaluations here, the shortest in the notebook.
trainer = base.DistilTrainer(
    student_model=student_model_pretrained,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 2)]
)

In [64]:
# Run the head-only distillation; output is the per-epoch table and TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2943,2.046931,0.4564,0.508589,0.4564,0.440878
2,2.0418,1.960814,0.4718,0.513312,0.4718,0.454191
3,1.9931,1.91881,0.4928,0.543451,0.4928,0.48234
4,1.9707,1.884658,0.5039,0.543614,0.5039,0.493852
5,1.9589,1.881358,0.4964,0.537423,0.4964,0.484253
6,1.95,1.890573,0.5,0.545802,0.5,0.491657


TrainOutput(global_step=3090, training_loss=2.034799974558809, metrics={'train_runtime': 904.3704, 'train_samples_per_second': 2186.449, 'train_steps_per_second': 17.084, 'total_flos': 8.400718442818437e+17, 'train_loss': 2.034799974558809, 'epoch': 6.0})

In [65]:
# Switch the distilled student to inference mode; the output is the module repr.
student_model_pretrained.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [66]:
# Score the distilled student on the TEST split. eval_loss comes from the
# distillation trainer — presumably not comparable to the baseline's loss.
trainer.evaluate(test)

{'eval_loss': 1.7282675504684448,
 'eval_accuracy': 0.4938,
 'eval_precision': 0.5313334269379325,
 'eval_recall': 0.4938,
 'eval_f1': 0.4817835637017797,
 'eval_runtime': 20.5763,
 'eval_samples_per_second': 485.995,
 'eval_steps_per_second': 3.839,
 'epoch': 6.0}

In [67]:
# Persist the head-only distilled student's weights (state dict only).
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/head-distill_aug.pth"
# torch.save does not create missing parent directories; make sure the target
# folder exists so a long training run is not lost on a fresh machine.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(student_model_pretrained.state_dict(), weights_path)

In [None]:
# Project helper; presumably reports the model's parameter count (no output recorded).
base.count_parameters(student_model_pretrained)

In [None]:
# Inference benchmark on the CPU using the batch-size-1 loader.
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
cpu_benchmark = base.BenchMarkRunner(student_model_pretrained, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

In [None]:
# Same benchmark on the GPU ("cuda" is hard-coded, so a CUDA device is required).
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
gpu_benchmark = base.BenchMarkRunner(student_model_pretrained, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

### Trénink inicializovaného modelu s pomocí destilace

In [68]:
# Re-seed before the full-model distillation run
# (presumably seeds torch/numpy/random — confirm in base.reset_seed).
base.reset_seed()

In [69]:
# Fresh "initialised" MobileNetV2 student (per the intro, ImageNet-pretrained);
# it is not frozen, so the whole network is trained. 100 is presumably the class count.
student_model_pretrained_whole = base.get_mobilenet(100)

In [70]:
# Distillation hyperparameters for full fine-tuning; 20 epochs, otherwise the
# same settings as the other distillation runs. remove_unused_columns=False
# presumably keeps the stored teacher logits in each batch; lambda_param / temp
# are presumably the loss weight and temperature — confirm in base.
# NOTE(review): paths use a literal "~"; confirm it is expanded downstream.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/pretrained-distill_aug", logging_dir=f"~/logs/{DATASET}/pretrained-distill_aug", remove_unused_columns=False, epochs=20, lr=0.00047, weight_decay=0, adam_beta1=.9, lambda_param=1, temp=6)

In [71]:
# Project distillation trainer for the fully trainable student. This is the
# only cell that moves the model to `device` explicitly; Module.to returns the
# same module, so `student_model_pretrained_whole` still refers to the trained
# object. Early stopping uses a patience of 3 evaluations.
trainer = base.DistilTrainer(
    student_model=student_model_pretrained_whole.to(device),
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [72]:
# Run the full-model distillation; output is the per-epoch table and TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1806,0.827808,0.7052,0.718642,0.7052,0.69994
2,0.6729,0.716642,0.736,0.748838,0.736,0.735044
3,0.5164,0.686679,0.7423,0.76122,0.7423,0.743089
4,0.4173,0.660962,0.7538,0.771644,0.7538,0.756705
5,0.3513,0.641553,0.7572,0.77587,0.7572,0.758899
6,0.304,0.634252,0.7596,0.779397,0.7596,0.761666
7,0.2668,0.625854,0.7567,0.772711,0.7567,0.75882
8,0.2404,0.612907,0.7602,0.775578,0.7602,0.761043
9,0.2192,0.596324,0.7668,0.779907,0.7668,0.76873
10,0.2002,0.586957,0.7699,0.783643,0.7699,0.771428


TrainOutput(global_step=9270, training_loss=0.3084165344732093, metrics={'train_runtime': 5285.1904, 'train_samples_per_second': 249.421, 'train_steps_per_second': 1.949, 'total_flos': 2.520215532845531e+18, 'train_loss': 0.3084165344732093, 'epoch': 18.0})

In [73]:
# Switch the student trained in THIS section to inference mode. The original
# cell called .eval() on `student_model_pretrained`, the head-only student from
# the previous experiment, leaving the model that is saved and benchmarked
# below untouched by this cell.
student_model_pretrained_whole.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [74]:
# Score the distilled student on the TEST split. eval_loss comes from the
# distillation trainer — presumably not comparable to the baseline's loss.
trainer.evaluate(test)

{'eval_loss': 0.4647216796875,
 'eval_accuracy': 0.7682,
 'eval_precision': 0.7798459346561731,
 'eval_recall': 0.7681999999999999,
 'eval_f1': 0.7698816149836225,
 'eval_runtime': 29.3726,
 'eval_samples_per_second': 340.453,
 'eval_steps_per_second': 2.69,
 'epoch': 18.0}

In [75]:
# Persist the fully trained distilled student's weights (state dict only).
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/pretrained-distill_aug.pth"
# torch.save does not create missing parent directories; make sure the target
# folder exists so a long training run is not lost on a fresh machine.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(student_model_pretrained_whole.state_dict(), weights_path)

In [None]:
# Project helper; presumably reports the model's parameter count (no output recorded).
base.count_parameters(student_model_pretrained_whole)

In [None]:
# Inference benchmark on the CPU using the batch-size-1 loader.
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
cpu_benchmark = base.BenchMarkRunner(student_model_pretrained_whole, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

In [None]:
# Same benchmark on the GPU ("cuda" is hard-coded, so a CUDA device is required).
# 1000 is presumably the number of benchmarked samples — confirm in BenchMarkRunner.
gpu_benchmark = base.BenchMarkRunner(student_model_pretrained_whole, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())