# Notebook pro trénink s destilací nad datasetem CIFAR100
V tomto notebooku je trénován MobileNetV2 nad datasetem CIFAR100; jako učitelský model je využíván ViT dotrénovaný (fine-tuned) nad stejným datasetem. 

MobileNetV2 je používán ve třech variantách: s náhodnou inicializací, s tréninkem pouze klasifikační hlavy inicializovaného (předtrénovaného nad ImageNetem) modelu MobileNetV2 a s tréninkem celého, taktéž inicializovaného modelu. Tyto tři úlohy jsou trénovány běžným způsobem a také s pomocí destilace znalostí z výše zmíněného učitelského modelu.  

Při destilaci je využíváno předpočítaných logitů ze sešitu precompute_logits.

## Import knihoven a definice metod

In [1]:
from transformers import Trainer, EarlyStoppingCallback
from torch.utils.data import ConcatDataset, DataLoader
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Helper object from `base`; later cells read its TRAIN / EVAL / TEST attributes to select a split.
dataset_part = base.get_dataset_part()
# Dataset tag interpolated into the results/, logs/ and models/ paths used by the cells below.
DATASET = "cifar100"

Nastavení seedu pro reprodukovatelnost a výběr výpočetního zařízení.

In [3]:
# Re-seed through the project helper before any dataset or model is built
# (presumably for reproducibility; the exact seeding lives in `base.reset_seed`).
base.reset_seed()

In [4]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice; the GPU branch also prints the name of CUDA device 0.
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


Provedení transformací nad datasetem.

In [5]:
# Preprocessing pipeline supplied by the project helper; shared by all three splits.
transform = base.base_transforms()

# Build the TRAIN / EVAL / TEST datasets from the same on-disk root; only `dataset_part` differs.
# The "100-logits" root presumably holds the precomputed teacher logits — confirm in `base`.
# NOTE: the name `eval` shadows Python's builtin `eval` for the rest of the notebook.
train, eval, test = (
    base.CustomCIFAR100L(
        root=f"{os.path.expanduser('~')}/data/100-logits",
        dataset_part=split,
        transform=transform,
    )
    for split in (dataset_part.TRAIN, dataset_part.EVAL, dataset_part.TEST)
)


In [6]:
# Augmenting transform pipeline from the project helper.
augment_transform = base.aug_transforms()

# Same root and TRAIN split as the base training set, but read through the augmenting transforms.
train_aug = base.CustomCIFAR100L(root=f"{os.path.expanduser('~')}/data/100-logits", dataset_part=dataset_part.TRAIN, transform=augment_transform)

In [7]:
# Two unshuffled, batch-size-1 loaders over the training data under ".../data/100",
# one built with device="cpu" and one with device="cuda"; they feed the inference benchmarks
# at the end of the notebook.
# NOTE(review): device="cuda" is hard-coded, so this presumably requires a GPU even though the
# device-selection cell has a CPU fallback — confirm how `base.CustomCIFAR100` uses `device`.
train_part_cpu = base.CustomCIFAR100(root=f"{os.path.expanduser('~')}/data/100", train=True, transform=transform, device="cpu")
cpu_data_loader = DataLoader(train_part_cpu, batch_size=1, shuffle=False)
train_part_gpu = base.CustomCIFAR100(root=f"{os.path.expanduser('~')}/data/100", train=True, transform=transform, device="cuda")
gpu_data_loader = DataLoader(train_part_gpu, batch_size=1, shuffle=False)

In [8]:
# Drop augmented samples via the project helper; per its progress message it removes entries of the
# augmented dataset that differ from the base one, judged by the saved logits.
# The result is bound to a NEW name instead of overwriting `train_aug`, so re-running this cell
# always filters the original augmented dataset rather than an already-filtered one.
train_aug_filtered = base.remove_diff_pred_class(train, train_aug, pytorch_dataset=True)
print(len(train_aug_filtered))
# Final training set: all base samples followed by the surviving augmented samples.
train_combo = ConcatDataset([train, train_aug_filtered])

Removing entries from augmented dataset that are different from the base one - based on saved logits:   0%|   …

25912


### Standardní trénink náhodně inicializovaného modelu. 

In [9]:
# Hyperparameters for the baseline run from random init: lr 5e-4, weight decay 8e-3,
# 5 warmup steps, at most 20 epochs.
# NOTE(review): the output/logging paths start with a literal "~" while other cells call
# os.path.expanduser — confirm `base.get_training_args` (or TrainingArguments) expands it.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/random-base_aug", logging_dir=f"~/logs/{DATASET}/random-base_aug", lr=0.0005, weight_decay=0.008, warmup_steps=5, epochs=20)
# MobileNetV2 from the random-init helper; 100 is presumably the number of CIFAR-100 classes.
model = base.get_random_init_mobilenet(100)

In [10]:
# Plain supervised baseline: stock Hugging Face Trainer on the combined (base + augmented) set,
# validated on the EVAL split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,  # the EVAL-split dataset defined earlier, not the Python builtin
    compute_metrics=base.compute_metrics,
    # Stop once the monitored metric has failed to improve for 3 consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [11]:
# Run training; the returned TrainOutput is the cell's displayed value.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0121,3.491575,0.1459,0.14565,0.1459,0.111652
2,3.2215,2.753546,0.2826,0.281917,0.2826,0.253578
3,2.7203,2.431204,0.3573,0.359042,0.3573,0.338813
4,2.3779,2.163932,0.4146,0.411753,0.4146,0.395408
5,2.1077,1.998699,0.4525,0.461214,0.4525,0.439678
6,1.8865,1.858586,0.4895,0.496367,0.4895,0.479106
7,1.6919,1.764659,0.514,0.522273,0.514,0.505659
8,1.5161,1.706783,0.5297,0.539177,0.5297,0.525424
9,1.3513,1.659283,0.5405,0.551045,0.5405,0.53819
10,1.1999,1.627684,0.5489,0.551294,0.5489,0.54523


TrainOutput(global_step=9270, training_loss=1.5059889956971202, metrics={'train_runtime': 2087.5688, 'train_samples_per_second': 631.471, 'train_steps_per_second': 4.934, 'total_flos': 2.520215532845531e+18, 'train_loss': 1.5059889956971202, 'epoch': 18.0})

In [12]:
# Switch the baseline model to inference mode; the returned module is shown as the cell output.
model.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [13]:
# Score the baseline on the TEST split; the returned metrics dict is displayed.
trainer.evaluate(test)

{'eval_loss': 1.7099978923797607,
 'eval_accuracy': 0.5647,
 'eval_precision': 0.5753105173095028,
 'eval_recall': 0.5647,
 'eval_f1': 0.5652796275462485,
 'eval_runtime': 13.7991,
 'eval_samples_per_second': 724.683,
 'eval_steps_per_second': 5.725,
 'epoch': 18.0}

In [14]:
# Persist the baseline weights. torch.save does not create missing parent directories,
# so make sure the target folder exists instead of failing only after the long training run.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/random-base_aug.pth")

## Definice destilačního tréninku

Třída `base.DistilTrainer` upravuje Hugging Face `Trainer` pro destilaci znalostí. Nově pracuje s logity uloženými v datasetu.

### Trénink náhodně inicializovaného modelu s pomocí destilace znalostí

In [15]:
# Re-seed before building the next model so this experiment starts from a freshly reset RNG state
# (exact seeding is defined in `base.reset_seed`).
base.reset_seed()

In [16]:
# Student for the distillation run: MobileNetV2 from the random-init helper
# (100 is presumably the number of CIFAR-100 classes).
student_model = base.get_random_init_mobilenet(100)

In [17]:
# Arguments for the distillation run from random init.
# NOTE(review): `lambda_param` and `temp` are presumably the distillation-loss weight and the softmax
# temperature, and `remove_unused_columns=False` presumably keeps the stored teacher logits in each
# batch — confirm in `base`.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/random-distill_aug", logging_dir=f"~/logs/{DATASET}/random-distill_aug", remove_unused_columns=False, epochs=20, lr=0.00055, lambda_param=.7, temp=6)

In [18]:
# Distillation trainer from `base` for the randomly initialised student; same data, metrics and
# early-stopping setup as the baseline run.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,  # the EVAL-split dataset defined earlier, not the Python builtin
    compute_metrics=base.compute_metrics,
    # Stop once the monitored metric has failed to improve for 3 consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [19]:
# Run the distillation training; the returned TrainOutput is the cell's displayed value.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0755,2.773163,0.1706,0.186867,0.1706,0.125151
2,2.5881,2.309701,0.2944,0.310505,0.2944,0.253519
3,2.2566,2.057659,0.3765,0.40082,0.3765,0.356979
4,2.0124,1.855225,0.4336,0.447542,0.4336,0.411304
5,1.8226,1.732531,0.4748,0.490015,0.4748,0.457242
6,1.6744,1.623069,0.502,0.511607,0.502,0.487226
7,1.5413,1.536804,0.5223,0.533391,0.5223,0.511381
8,1.428,1.465016,0.5494,0.554212,0.5494,0.540067
9,1.3209,1.443941,0.5549,0.567294,0.5549,0.54977
10,1.2284,1.378345,0.5711,0.5729,0.5711,0.565444


TrainOutput(global_step=10300, training_loss=1.370963890297899, metrics={'train_runtime': 2324.7587, 'train_samples_per_second': 567.044, 'train_steps_per_second': 4.431, 'total_flos': 2.800239480939479e+18, 'train_loss': 1.370963890297899, 'epoch': 20.0})

In [20]:
# Switch the distilled student to inference mode; the returned module is shown as the cell output.
student_model.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [21]:
# Score the distilled random-init student on the TEST split; the metrics dict is displayed.
trainer.evaluate(test)

{'eval_loss': 1.1964892148971558,
 'eval_accuracy': 0.5968,
 'eval_precision': 0.6129856346358608,
 'eval_recall': 0.5968,
 'eval_f1': 0.5981585844379341,
 'eval_runtime': 13.1559,
 'eval_samples_per_second': 760.115,
 'eval_steps_per_second': 6.005,
 'epoch': 20.0}

In [22]:
# Persist the distilled student's weights. torch.save does not create missing parent directories,
# so make sure the target folder exists instead of failing only after the long training run.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/random-distill_aug.pth")

## Získání inicializovaného MobileNetV2 modelu

In [23]:
# Re-seed before loading the pretrained model so this experiment starts from a freshly reset RNG state
# (exact seeding is defined in `base.reset_seed`).
base.reset_seed()

In [24]:
# MobileNetV2 from the project helper for initialised models (100 is presumably the class count).
model_pretrained = base.get_mobilenet(100)

In [25]:
# Dump the module tree to inspect the architecture.
print(model_pretrained)

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [26]:
# Freeze the model through the project helper for the head-only experiment
# (presumably leaves only the classifier trainable — confirm in `base.freeze_model`).
model_pretrained = base.freeze_model(model_pretrained)

In [27]:
# Hyperparameters for the head-only baseline: lr 1e-3, weight decay 5e-3, 20 warmup steps,
# at most 20 epochs.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/head-base_aug", logging_dir=f"~/logs/{DATASET}/head-base_aug", epochs=20, lr=0.001, weight_decay=0.005, warmup_steps=20)

In [28]:
# Head-only supervised baseline: stock Hugging Face Trainer on the frozen pretrained model.
trainer = Trainer(
    model=model_pretrained,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,  # the EVAL-split dataset defined earlier, not the Python builtin
    compute_metrics=base.compute_metrics,
    # Stop once the monitored metric has failed to improve for 3 consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [29]:
# Run head-only training; the returned TrainOutput is the cell's displayed value.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6526,1.845513,0.518,0.542144,0.518,0.513308
2,1.965,1.743547,0.5319,0.553089,0.5319,0.529704
3,1.8277,1.706149,0.5398,0.564763,0.5398,0.538563
4,1.7554,1.676541,0.5503,0.568118,0.5503,0.548967
5,1.7058,1.697658,0.5443,0.563179,0.5443,0.540595
6,1.6669,1.694501,0.5448,0.562332,0.5448,0.541697
7,1.6378,1.677576,0.5465,0.567442,0.5465,0.545399


TrainOutput(global_step=3605, training_loss=1.8873015891827756, metrics={'train_runtime': 561.7114, 'train_samples_per_second': 2346.828, 'train_steps_per_second': 18.337, 'total_flos': 9.800838183288177e+17, 'train_loss': 1.8873015891827756, 'epoch': 7.0})

In [30]:
# Switch the head-only model to inference mode; the returned module is shown as the cell output.
model_pretrained.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [31]:
# Score the head-only baseline on the TEST split; the metrics dict is displayed.
trainer.evaluate(test)

{'eval_loss': 1.673932433128357,
 'eval_accuracy': 0.5476,
 'eval_precision': 0.5683108863736596,
 'eval_recall': 0.5476000000000001,
 'eval_f1': 0.5470175473783051,
 'eval_runtime': 12.6112,
 'eval_samples_per_second': 792.946,
 'eval_steps_per_second': 6.264,
 'epoch': 7.0}

In [32]:
# Persist the head-only baseline weights. torch.save does not create missing parent directories,
# so make sure the target folder exists instead of failing only after the training run.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(model_pretrained.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/head-base_aug.pth")

### Trénink inicializovaného MobileNetV2

In [69]:
# Re-seed before the full fine-tuning experiment so it starts from a freshly reset RNG state
# (exact seeding is defined in `base.reset_seed`).
base.reset_seed()

In [70]:
# Fresh initialised MobileNetV2 for full fine-tuning; unlike the head-only run, it is not frozen.
model_pretrained_whole = base.get_mobilenet(100)

In [71]:
# Hyperparameters for full fine-tuning: lr 4e-4, weight decay 8e-3, 10 warmup steps,
# at most 20 epochs.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/pretrained-base_aug", logging_dir=f"~/logs/{DATASET}/pretrained-base_aug", epochs=20, lr=0.0004, weight_decay=0.008, warmup_steps=10)

In [72]:
# Full fine-tuning baseline: stock Hugging Face Trainer on the unfrozen pretrained model.
trainer = Trainer(
    model=model_pretrained_whole,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,  # the EVAL-split dataset defined earlier, not the Python builtin
    compute_metrics=base.compute_metrics,
    # Stop once the monitored metric has failed to improve for 3 consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [73]:
# Run full fine-tuning; the returned TrainOutput is the cell's displayed value.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5038,1.003019,0.7059,0.721646,0.7059,0.702884
2,0.6841,0.86437,0.7472,0.759304,0.7472,0.745554
3,0.4265,0.881176,0.7549,0.766942,0.7549,0.752936
4,0.2771,0.943901,0.748,0.762144,0.748,0.748892
5,0.1877,0.975006,0.7586,0.768252,0.7586,0.757383
6,0.1343,1.006056,0.754,0.761981,0.754,0.753194
7,0.0974,1.048559,0.7608,0.770917,0.7608,0.761292
8,0.0773,1.02975,0.7691,0.776869,0.7691,0.768439
9,0.0602,1.088217,0.7677,0.775303,0.7677,0.768304
10,0.0456,1.117871,0.7646,0.770101,0.7646,0.764286


TrainOutput(global_step=5665, training_loss=0.320696266288791, metrics={'train_runtime': 1266.9369, 'train_samples_per_second': 1040.494, 'train_steps_per_second': 8.13, 'total_flos': 1.5401317145167135e+18, 'train_loss': 0.320696266288791, 'epoch': 11.0})

In [74]:
# Switch the fully fine-tuned model to inference mode; the returned module is shown as the cell output.
model_pretrained_whole.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [75]:
# Score the fully fine-tuned baseline on the TEST split; the metrics dict is displayed.
trainer.evaluate(test)

{'eval_loss': 1.0523370504379272,
 'eval_accuracy': 0.7647,
 'eval_precision': 0.7730122282487011,
 'eval_recall': 0.7646999999999998,
 'eval_f1': 0.7644034733518786,
 'eval_runtime': 16.1603,
 'eval_samples_per_second': 618.801,
 'eval_steps_per_second': 4.889,
 'epoch': 11.0}

In [40]:
# Persist the fully fine-tuned baseline weights. torch.save does not create missing parent
# directories, so make sure the target folder exists instead of failing only after the training run.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(model_pretrained_whole.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/pretrained-base_aug.pth")

## Trénink s pomocí destilace znalostí inicializovaného MobileNetV2

### Trénink inicializovaného modelu - pouze klasifikační hlavy s pomocí destilace

In [41]:
# Re-seed before the head-only distillation experiment so it starts from a freshly reset RNG state
# (exact seeding is defined in `base.reset_seed`).
base.reset_seed()

In [42]:
# Initialised MobileNetV2 that will act as the head-only distillation student.
student_model_pretrained = base.get_mobilenet(100)

In [43]:
# Freeze the student through the project helper; the parameter count printed near the end of the
# notebook lists only classifier.weight and classifier.bias as trainable for this model.
student_model_pretrained = base.freeze_model(student_model_pretrained)

In [44]:
# Arguments for head-only distillation: lr 1.2e-3, weight decay 5e-3, 15 warmup steps, at most 20 epochs.
# NOTE(review): `lambda_param` and `temp` are presumably the distillation-loss weight and the softmax
# temperature, and `remove_unused_columns=False` presumably keeps the stored teacher logits in each
# batch — confirm in `base`.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/head-distill_aug", logging_dir=f"~/logs/{DATASET}/head-distill_aug", remove_unused_columns=False, epochs=20, lr=0.0012, weight_decay=.005, warmup_steps=15, lambda_param=.4, temp=5)

In [45]:
# Distillation trainer from `base` for the frozen (head-only) pretrained student.
trainer = base.DistilTrainer(
    student_model=student_model_pretrained,
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,  # the EVAL-split dataset defined earlier, not the Python builtin
    compute_metrics=base.compute_metrics,
    # Stop once the monitored metric has failed to improve for 3 consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [46]:
# Run head-only distillation; the returned TrainOutput is the cell's displayed value.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6643,2.116916,0.5236,0.542366,0.5236,0.515046
2,2.2442,2.05931,0.5349,0.54856,0.5349,0.527563
3,2.1757,2.038093,0.5399,0.565486,0.5399,0.535931
4,2.1414,2.010734,0.5536,0.572997,0.5536,0.549893
5,2.1193,2.00126,0.5519,0.565355,0.5519,0.546022
6,2.1012,2.023759,0.5451,0.567735,0.5451,0.544335
7,2.0887,1.991565,0.5524,0.569127,0.5524,0.549587


TrainOutput(global_step=3605, training_loss=2.2192619165005203, metrics={'train_runtime': 612.2123, 'train_samples_per_second': 2153.24, 'train_steps_per_second': 16.824, 'total_flos': 9.800838183288177e+17, 'train_loss': 2.2192619165005203, 'epoch': 7.0})

In [47]:
# Switch the head-only distilled student to inference mode; the returned module is shown as the cell output.
student_model_pretrained.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [48]:
# Score the head-only distilled student on the TEST split; the metrics dict is displayed.
trainer.evaluate(test)

{'eval_loss': 1.9305979013442993,
 'eval_accuracy': 0.5429,
 'eval_precision': 0.5620144425218659,
 'eval_recall': 0.5428999999999999,
 'eval_f1': 0.5398014717487571,
 'eval_runtime': 12.6319,
 'eval_samples_per_second': 791.645,
 'eval_steps_per_second': 6.254,
 'epoch': 7.0}

In [49]:
# Persist the head-only distilled student's weights. torch.save does not create missing parent
# directories, so make sure the target folder exists instead of failing only after the training run.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(student_model_pretrained.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/head-distill_aug.pth")

### Trénink inicializovaného modelu s pomocí destilace

In [62]:
# Re-seed before the full-model distillation experiment so it starts from a freshly reset RNG state
# (exact seeding is defined in `base.reset_seed`).
base.reset_seed()

In [63]:
# Initialised MobileNetV2 that will be distilled with all of its weights trainable (it is not frozen).
student_model_pretrained_whole = base.get_mobilenet(100)

In [64]:
# Arguments for full-model distillation: lr 5e-4, weight decay 5e-3, 5 warmup steps, at most 20 epochs.
# NOTE(review): `lambda_param` and `temp` are presumably the distillation-loss weight and the softmax
# temperature, and `remove_unused_columns=False` presumably keeps the stored teacher logits in each
# batch — confirm in `base`.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/pretrained-distill_aug", logging_dir=f"~/logs/{DATASET}/pretrained-distill_aug", remove_unused_columns=False, epochs=20, lr=0.0005, weight_decay=.005, warmup_steps=5, lambda_param=.6, temp=6.5)

In [65]:
# Distillation trainer from `base` for the unfrozen pretrained student.
trainer = base.DistilTrainer(
    # Unlike the other trainer cells, the model is moved to `device` explicitly here.
    student_model=student_model_pretrained_whole.to(device),
    args=training_args,
    train_dataset=train_combo,
    eval_dataset=eval,  # the EVAL-split dataset defined earlier, not the Python builtin
    compute_metrics=base.compute_metrics,
    # Stop once the monitored metric has failed to improve for 3 consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [66]:
# Run full-model distillation; the returned TrainOutput is the cell's displayed value.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3011,0.946234,0.7107,0.723578,0.7107,0.706476
2,0.7305,0.820093,0.7461,0.758291,0.7461,0.74426
3,0.5426,0.771098,0.7599,0.771935,0.7599,0.759221
4,0.4265,0.757994,0.7658,0.779472,0.7658,0.766924
5,0.3495,0.760088,0.7626,0.77294,0.7626,0.762716
6,0.2944,0.736535,0.7787,0.791073,0.7787,0.779681
7,0.256,0.748447,0.7689,0.781456,0.7689,0.770089
8,0.2282,0.708099,0.7815,0.791045,0.7815,0.781683
9,0.2059,0.71865,0.7755,0.786709,0.7755,0.776588
10,0.1873,0.686474,0.7862,0.792348,0.7862,0.786376


TrainOutput(global_step=6695, training_loss=0.3850760291101685, metrics={'train_runtime': 1496.6729, 'train_samples_per_second': 880.78, 'train_steps_per_second': 6.882, 'total_flos': 1.8201556626106614e+18, 'train_loss': 0.3850760291101685, 'epoch': 13.0})

In [67]:
# Put the model trained in this section into inference mode. This must be
# `student_model_pretrained_whole` (the fully fine-tuned distilled student), not the head-only
# `student_model_pretrained` from the previous section.
student_model_pretrained_whole.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [68]:
# Score the fully fine-tuned distilled student on the TEST split; the metrics dict is displayed.
trainer.evaluate(test)

{'eval_loss': 0.647940993309021,
 'eval_accuracy': 0.782,
 'eval_precision': 0.7897126741699193,
 'eval_recall': 0.782,
 'eval_f1': 0.7825311803661508,
 'eval_runtime': 12.2037,
 'eval_samples_per_second': 819.425,
 'eval_steps_per_second': 6.473,
 'epoch': 13.0}

In [57]:
# Persist the fully fine-tuned distilled student's weights. torch.save does not create missing parent
# directories, so make sure the target folder exists instead of failing only after the training run.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(student_model_pretrained_whole.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/pretrained-distill_aug.pth")

In [58]:
# Report model size and the per-module trainable parameter counts of the fully trainable student.
base.count_parameters(student_model_pretrained_whole)

model size: 9.103MB.
Total Trainable Params: 2351972.


Unnamed: 0,Modules,Parameters
0,mobilenet_v2.conv_stem.first_conv.convolution....,864
1,mobilenet_v2.conv_stem.first_conv.normalizatio...,32
2,mobilenet_v2.conv_stem.first_conv.normalizatio...,32
3,mobilenet_v2.conv_stem.conv_3x3.convolution.we...,288
4,mobilenet_v2.conv_stem.conv_3x3.normalization....,32
...,...,...
153,mobilenet_v2.conv_1x1.convolution.weight,409600
154,mobilenet_v2.conv_1x1.normalization.weight,1280
155,mobilenet_v2.conv_1x1.normalization.bias,1280
156,classifier.weight,128000


In [59]:
# Same report for the frozen student, for comparison with the fully trainable one.
base.count_parameters(student_model_pretrained)

model size: 9.103MB.
Total Trainable Params: 128100.


Unnamed: 0,Modules,Parameters
0,classifier.weight,128000
1,classifier.bias,100


In [60]:
# Time CPU inference of the fully fine-tuned distilled student on the batch-size-1 CPU loader.
# NOTE(review): the trailing 1000 is presumably the number of timed runs (the output reports
# "1000 runs") — confirm in `base.BenchMarkRunner`.
cpu_benchmark = base.BenchMarkRunner(student_model_pretrained_whole, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x708c319cbc70>
self.infer_speed_comp()
  28.52 ms
  1 measurement, 1000 runs , 4 threads


In [61]:
# Same benchmark on the GPU, using the loader built with device="cuda".
# NOTE(review): "cuda" is hard-coded, so this cell presumably fails on a CPU-only machine — confirm.
gpu_benchmark = base.BenchMarkRunner(student_model_pretrained_whole, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x708c44180250>
self.infer_speed_comp()
  8.88 ms
  1 measurement, 1000 runs , 4 threads
