# Notebook pro trénink s destilací nad datasetem CIFAR10
V tomto notebooku je trénován MobileNetV2 nad datasetem CIFAR10; jako učitelský model je využíván ViT dotrénovaný (fine-tuned) nad stejným datasetem.

MobileNetV2 je trénován ve třech variantách: s náhodnou inicializací, s tréninkem pouze klasifikační hlavy inicializovaného (předtrénovaného nad ImageNetem) modelu a s tréninkem celého, taktéž inicializovaného modelu. Tyto tři úlohy jsou trénovány běžným způsobem a také s pomocí destilace znalostí z výše zmíněného učitelského modelu.

Při destilaci je využíváno předpočítaných logitů ze sešitu precompute_logits.

## Import knihoven a definice metod

In [1]:
from transformers import Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader
import pandas as pd
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Fetch the dataset-part selector from the project helper module; its TEST, TRAIN and
# EVAL members are used below to choose the dataset splits.
dataset_part = base.get_dataset_part()

Resetování náhodného seedu pro replikovatelnost výsledků.

In [3]:
# Reset the random seed so that the results are reproducible.
base.reset_seed()

In [4]:
# Select the compute device: CUDA when a GPU is visible to PyTorch, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice; the GPU name is only queried when a GPU was actually selected.
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


Provedení transformací nad datasetem.

In [5]:
# Dataset identifier; interpolated into the results, logs and model paths below.
DATASET = "cifar10"

In [6]:
# Shared preprocessing pipeline supplied by the project helper module.
transform = base.base_transforms()

# The last train batch is used as the evaluation split.
# All three splits read from the same "10-logits" directory and are kept on the CPU;
# they are built in the order TEST, TRAIN, EVAL.
# NOTE(review): the name `eval` shadows the Python builtin for the rest of the notebook.
test, train, eval = (
    base.CustomCIFAR10L(
        root=f"{os.path.expanduser('~')}/data/10-logits",
        dataset_part=part,
        transform=transform,
        device="cpu",
    )
    for part in (dataset_part.TEST, dataset_part.TRAIN, dataset_part.EVAL)
)

In [7]:
# Sanity check: display the label of the first training sample.
train[0]["labels"]

tensor(6)

In [8]:
# Check the class distribution of the evaluation split --> good enough.
# (`eval` here is the evaluation dataset created above, not the Python builtin.)
df = pd.DataFrame(eval.labels)
print(df.value_counts())

0
5    1025
9    1022
3    1016
0    1014
1    1014
8    1003
4     997
6     980
7     977
2     952
Name: count, dtype: int64


In [9]:
# Two copies of the same CIFAR10 training batch, one per target device ("cpu" and "cuda").
# They are consumed by the inference benchmarks at the end of the notebook.
train_part_cpu, train_part_gpu = (
    base.CustomCIFAR10(
        root=f"{os.path.expanduser('~')}/data/10",
        train=True,
        batch=1,
        transform=transform,
        device=target,
    )
    for target in ("cpu", "cuda")
)

# One sample per step, fixed order.
cpu_data_loader, gpu_data_loader = (
    DataLoader(part, batch_size=1, shuffle=False)
    for part in (train_part_cpu, train_part_gpu)
)

### Standardní trénink náhodně inicializovaného modelu. 

In [10]:
# Reset the random seed before the baseline run of the randomly initialised model.
base.reset_seed()

In [11]:
# Hyper-parameters for the plain (non-distilled) run of the randomly initialised model.
# "~" is expanded explicitly instead of being passed through verbatim: Python path
# handling does not expand it, so a literal "~" would become a directory named "~" under
# the current working directory. This matches how the data and model paths are built.
training_args = base.get_training_args(
    output_dir=os.path.expanduser(f"~/results/{DATASET}/cifar10-random"),
    logging_dir=os.path.expanduser(f"~/logs/{DATASET}/cifar10-random"),
    lr=0.0006,
    weight_decay=0.005,
    warmup_steps=25,
    epochs=20,
)
# Randomly initialised MobileNetV2; 10 is presumably the number of CIFAR10 classes.
model = base.get_random_init_mobilenet(10)

In [13]:
# Plain Hugging Face trainer for the randomly initialised baseline.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [14]:
# Run the training; the output below is the per-epoch log followed by the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5951,1.237562,0.5594,0.574408,0.559382,0.547235
2,1.024,0.871471,0.6903,0.69635,0.690017,0.689767
3,0.8029,0.788958,0.735,0.736337,0.734403,0.730367
4,0.6685,0.600232,0.7909,0.793263,0.790918,0.789307
5,0.5511,0.581511,0.8035,0.810439,0.803278,0.804223
6,0.4698,0.589092,0.7944,0.808125,0.794034,0.794595
7,0.3867,0.567914,0.8181,0.822736,0.81861,0.815618
8,0.3216,0.507972,0.8315,0.835464,0.831681,0.831222
9,0.2603,0.663696,0.8082,0.830179,0.807736,0.807199
10,0.1964,0.535166,0.8404,0.842509,0.840813,0.838381


TrainOutput(global_step=6260, training_loss=0.3380684549149614, metrics={'train_runtime': 1569.3447, 'train_samples_per_second': 509.767, 'train_steps_per_second': 3.989, 'total_flos': 1.6160796868608e+18, 'train_loss': 0.3380684549149614, 'epoch': 20.0})

In [15]:
# Switch the model to evaluation mode; as the cell's last expression, the returned module
# is echoed as the cell output.
model.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [16]:
# Evaluate the trained baseline on the held-out test split; the metrics dict is shown below.
trainer.evaluate(test)

{'eval_loss': 0.7480902075767517,
 'eval_accuracy': 0.8519,
 'eval_precision': 0.8549866485308314,
 'eval_recall': 0.8519,
 'eval_f1': 0.8519241289796995,
 'eval_runtime': 12.6152,
 'eval_samples_per_second': 792.692,
 'eval_steps_per_second': 6.262,
 'epoch': 20.0}

In [17]:
# Persist the baseline weights. The target directory is created first because torch.save
# does not create missing parent directories.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/random-base.pth")

## Definice destilačního tréninku

Třída `base.DistilTrainer` upravuje trenéra (`Trainer`) z knihovny Hugging Face pro destilaci znalostí. Nově pracuje s logity uloženými v datasetu.

### Trénink náhodně inicializovaného modelu s pomocí destilace znalostí

In [21]:
# Reset the random seed before the distillation run of the randomly initialised model.
base.reset_seed()

In [22]:
# Fresh randomly initialised MobileNetV2 that acts as the student; 10 is presumably the
# number of CIFAR10 classes.
student_model = base.get_random_init_mobilenet(10)

In [23]:
# Hyper-parameters for the distillation run of the randomly initialised student.
# "~" is expanded explicitly: Python path handling does not expand it, so a literal "~"
# would become a directory named "~" under the current working directory.
# remove_unused_columns=False presumably keeps the precomputed teacher logits in the
# batches; lambda_param and temp are distillation settings interpreted by the `base` helpers.
training_args = base.get_training_args(
    output_dir=os.path.expanduser(f"~/results/{DATASET}/cifar10-random-KD"),
    logging_dir=os.path.expanduser(f"~/logs/{DATASET}/cifar10-random-KD"),
    remove_unused_columns=False,
    epochs=20,
    lr=0.00065,
    weight_decay=0.008,
    warmup_steps=5,
    lambda_param=.4,
    temp=6.5,
)

In [24]:
# Distillation trainer from the project helper module for the randomly initialised student.
# Only the student is passed in; the teacher signal is expected to come from the logits
# stored in the dataset (see the notebook introduction).
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [25]:
# Run the distillation training; the output below is the per-epoch log and the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2116,0.923582,0.5819,0.604312,0.58207,0.574919
2,0.8177,0.690882,0.7056,0.708314,0.705495,0.700665
3,0.6587,0.639006,0.7407,0.748375,0.739517,0.73784
4,0.5655,0.529884,0.7881,0.794764,0.787768,0.78878
5,0.4978,0.506643,0.8003,0.806032,0.800161,0.800127
6,0.4353,0.506764,0.8004,0.813745,0.800562,0.800548
7,0.3863,0.484542,0.8165,0.824948,0.817161,0.813032
8,0.3424,0.437639,0.841,0.844455,0.84126,0.840168
9,0.3075,0.485153,0.818,0.83535,0.81751,0.817941
10,0.2684,0.424161,0.8464,0.851865,0.846942,0.844988


TrainOutput(global_step=6260, training_loss=0.3607652871372601, metrics={'train_runtime': 1517.4651, 'train_samples_per_second': 527.195, 'train_steps_per_second': 4.125, 'total_flos': 1.6160796868608e+18, 'train_loss': 0.3607652871372601, 'epoch': 20.0})

In [26]:
# Switch the student to evaluation mode; the returned module is echoed as the cell output.
student_model.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [27]:
# Evaluate the distilled student on the held-out test split; the metrics dict is shown below.
trainer.evaluate(test)

{'eval_loss': 0.38627079129219055,
 'eval_accuracy': 0.8581,
 'eval_precision': 0.8618272904669393,
 'eval_recall': 0.8581000000000001,
 'eval_f1': 0.8588283097517347,
 'eval_runtime': 13.2198,
 'eval_samples_per_second': 756.439,
 'eval_steps_per_second': 5.976,
 'epoch': 20.0}

In [None]:
# Persist the distilled student weights. The target directory is created first because
# torch.save does not create missing parent directories.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/random-distill.pth")

## Získání inicializovaného MobileNetV2 modelu

In [11]:
# Reset the random seed before the head-only baseline run.
base.reset_seed()

In [12]:
# Initialised MobileNetV2 (pretrained on ImageNet per the notebook introduction);
# 10 is presumably the number of CIFAR10 classes.
model_pretrained = base.get_mobilenet(10)

In [13]:
# Print the module structure of the model for inspection.
print(model_pretrained)

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [14]:
# Freeze the model with the project helper; the intent stated in the notebook introduction
# is to train only the classification head.
model_pretrained = base.freeze_model(model_pretrained)

In [16]:
# Hyper-parameters for the head-only baseline run of the initialised model.
# "~" is expanded explicitly: Python path handling does not expand it, so a literal "~"
# would become a directory named "~" under the current working directory.
training_args = base.get_training_args(
    output_dir=os.path.expanduser(f"~/results/{DATASET}/cifar10-pretrained-head"),
    logging_dir=os.path.expanduser(f"~/logs/{DATASET}/cifar10-pretrained-head"),
    epochs=20,
    lr=0.0025,
    weight_decay=0.005,
    warmup_steps=10,
)

In [17]:
# Plain Hugging Face trainer for the frozen, initialised model (head-only baseline).
trainer = Trainer(
    model=model_pretrained,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [18]:
# Run the head-only training; the output below is the per-epoch log and the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8742,0.760877,0.7363,0.759207,0.736365,0.73745
2,0.7235,0.722264,0.7494,0.758672,0.748687,0.748584
3,0.6996,0.739107,0.7472,0.761697,0.746236,0.745191
4,0.6955,0.679236,0.7666,0.777536,0.766793,0.767195
5,0.6863,0.682635,0.7598,0.763131,0.759497,0.760271
6,0.6819,0.676907,0.7639,0.770916,0.76332,0.764351
7,0.672,0.699199,0.7634,0.768724,0.763841,0.759322


TrainOutput(global_step=2191, training_loss=0.7190116976342317, metrics={'train_runtime': 365.6388, 'train_samples_per_second': 2187.952, 'train_steps_per_second': 17.121, 'total_flos': 5.6562789040128e+17, 'train_loss': 0.7190116976342317, 'epoch': 7.0})

In [19]:
# Switch the model to evaluation mode; the returned module is echoed as the cell output.
model_pretrained.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [20]:
# Evaluate the head-only baseline on the held-out test split; the metrics dict is shown below.
trainer.evaluate(test)

{'eval_loss': 0.6907253265380859,
 'eval_accuracy': 0.7613,
 'eval_precision': 0.772018627546524,
 'eval_recall': 0.7613,
 'eval_f1': 0.7617999755057994,
 'eval_runtime': 13.1351,
 'eval_samples_per_second': 761.32,
 'eval_steps_per_second': 6.014,
 'epoch': 7.0}

In [21]:
# Persist the head-only baseline weights. The target directory is created first because
# torch.save does not create missing parent directories.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(model_pretrained.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/head-base.pth")

### Trénink inicializovaného MobileNetV2

In [22]:
# Reset the random seed before the full-model baseline run.
base.reset_seed()

In [23]:
# Initialised MobileNetV2 that is trained as a whole (it is not passed to freeze_model).
model_pretrained_whole = base.get_mobilenet(10)

In [24]:
# Hyper-parameters for the full-model baseline run of the initialised model.
# "~" is expanded explicitly: Python path handling does not expand it, so a literal "~"
# would become a directory named "~" under the current working directory.
training_args = base.get_training_args(
    output_dir=os.path.expanduser(f"~/results/{DATASET}/cifar10-pretrained"),
    logging_dir=os.path.expanduser(f"~/logs/{DATASET}/cifar10-pretrained"),
    epochs=20,
    lr=0.00045,
    weight_decay=0.008,
    warmup_steps=3,
)

In [25]:
# Plain Hugging Face trainer for the initialised model trained as a whole.
trainer = Trainer(
    model=model_pretrained_whole,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [26]:
# Run the full-model training; the output below is the per-epoch log and the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4026,0.316462,0.8966,0.906195,0.89659,0.897089
2,0.1696,0.262278,0.9182,0.923636,0.918248,0.919566
3,0.1087,0.267307,0.918,0.919754,0.918078,0.917876
4,0.0784,0.293605,0.9182,0.921895,0.918528,0.918707
5,0.0556,0.270866,0.9252,0.926872,0.925615,0.92513
6,0.0434,0.272729,0.9312,0.9323,0.931321,0.931164
7,0.0319,0.32534,0.9261,0.927471,0.926519,0.92584
8,0.0274,0.294779,0.9348,0.936635,0.935011,0.93485
9,0.0196,0.384792,0.9229,0.928752,0.922822,0.923312
10,0.0157,0.284001,0.9373,0.938128,0.937573,0.937226


TrainOutput(global_step=6260, training_loss=0.0494139636119905, metrics={'train_runtime': 1500.3325, 'train_samples_per_second': 533.215, 'train_steps_per_second': 4.172, 'total_flos': 1.6160796868608e+18, 'train_loss': 0.0494139636119905, 'epoch': 20.0})

In [27]:
# Switch the model to evaluation mode; the returned module is echoed as the cell output.
model_pretrained_whole.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [28]:
# Evaluate the full-model baseline on the held-out test split; the metrics dict is shown below.
trainer.evaluate(test)

{'eval_loss': 0.2871420085430145,
 'eval_accuracy': 0.9483,
 'eval_precision': 0.949222638232329,
 'eval_recall': 0.9483,
 'eval_f1': 0.9482737554304345,
 'eval_runtime': 12.3137,
 'eval_samples_per_second': 812.103,
 'eval_steps_per_second': 6.416,
 'epoch': 20.0}

In [29]:
# Persist the full-model baseline weights. The target directory is created first because
# torch.save does not create missing parent directories.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(model_pretrained_whole.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/pretrained-base.pth")

## Trénink s pomocí destilace znalostí inicializovaného MobileNetV2

### Trénink inicializovaného modelu - pouze klasifikační hlavy s pomocí destilace

In [30]:
# Reset the random seed before the head-only distillation run.
base.reset_seed()

In [31]:
# Initialised MobileNetV2 used as the student for head-only distillation.
student_model_pretrained = base.get_mobilenet(10)

In [32]:
# Freeze the student with the project helper; the parameter count printed near the end of
# the notebook lists only classifier.weight and classifier.bias as trainable for this model.
student_model_pretrained = base.freeze_model(student_model_pretrained)

In [33]:
# Hyper-parameters for the head-only distillation run of the initialised student.
# "~" is expanded explicitly: Python path handling does not expand it, so a literal "~"
# would become a directory named "~" under the current working directory.
# remove_unused_columns=False presumably keeps the precomputed teacher logits in the
# batches; lambda_param and temp are distillation settings interpreted by the `base` helpers.
training_args = base.get_training_args(
    output_dir=os.path.expanduser(f"~/results/{DATASET}/cifar10-pretrained-head-KD"),
    logging_dir=os.path.expanduser(f"~/logs/{DATASET}/cifar10-pretrained-head-KD"),
    remove_unused_columns=False,
    epochs=20,
    lr=0.00065,
    weight_decay=.008,
    lambda_param=.6,
    temp=3.5,
)

In [34]:
# Distillation trainer from the project helper module for the frozen, initialised student.
trainer = base.DistilTrainer(
    student_model=student_model_pretrained,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [35]:
# Run the head-only distillation; the output below is the per-epoch log and the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7389,0.641624,0.729,0.751844,0.728723,0.729765
2,0.6297,0.617656,0.7447,0.750206,0.74417,0.743413
3,0.6163,0.617649,0.7513,0.757247,0.750454,0.7501
4,0.6119,0.594144,0.7598,0.76775,0.759944,0.759688
5,0.6083,0.606135,0.7471,0.756662,0.746875,0.749526
6,0.6067,0.592881,0.761,0.764229,0.760641,0.760734
7,0.6039,0.603333,0.7601,0.761047,0.760031,0.755976
8,0.6008,0.606672,0.7566,0.766292,0.756417,0.756099
9,0.6023,0.60094,0.7585,0.763569,0.758112,0.758541


TrainOutput(global_step=2817, training_loss=0.6243223678485951, metrics={'train_runtime': 470.0682, 'train_samples_per_second': 1701.881, 'train_steps_per_second': 13.317, 'total_flos': 7.2723585908736e+17, 'train_loss': 0.6243223678485951, 'epoch': 9.0})

In [36]:
# Switch the student to evaluation mode; the returned module is echoed as the cell output.
student_model_pretrained.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [37]:
# Evaluate the head-only distilled student on the held-out test split; metrics are shown below.
trainer.evaluate(test)

{'eval_loss': 0.5939173102378845,
 'eval_accuracy': 0.762,
 'eval_precision': 0.7652446763292213,
 'eval_recall': 0.7619999999999999,
 'eval_f1': 0.7621432153880755,
 'eval_runtime': 13.2693,
 'eval_samples_per_second': 753.619,
 'eval_steps_per_second': 5.954,
 'epoch': 9.0}

In [38]:
# Persist the head-only distilled weights. The target directory is created first because
# torch.save does not create missing parent directories.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(student_model_pretrained.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/head-distill.pth")

### Trénink inicializovaného modelu s pomocí destilace

In [39]:
# Reset the random seed before the full-model distillation run.
base.reset_seed()

In [40]:
# Initialised MobileNetV2 used as the student that is distilled as a whole (not frozen).
student_model_pretrained_whole = base.get_mobilenet(10)

In [41]:
# Hyper-parameters for the full-model distillation run of the initialised student.
# "~" is expanded explicitly: Python path handling does not expand it, so a literal "~"
# would become a directory named "~" under the current working directory.
# remove_unused_columns=False presumably keeps the precomputed teacher logits in the
# batches; lambda_param and temp are distillation settings interpreted by the `base` helpers.
training_args = base.get_training_args(
    output_dir=os.path.expanduser(f"~/results/{DATASET}/cifar10-pretrained-KD"),
    logging_dir=os.path.expanduser(f"~/logs/{DATASET}/cifar10-pretrained-KD"),
    remove_unused_columns=False,
    epochs=20,
    lr=0.00055,
    weight_decay=.005,
    lambda_param=.3,
    temp=2.5,
)

In [None]:
# Distillation trainer from the project helper module for the initialised student
# trained as a whole.
trainer = base.DistilTrainer(
    student_model=student_model_pretrained_whole,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [43]:
# Run the full-model distillation; the output below is the per-epoch log and the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4238,0.395326,0.8857,0.899943,0.885494,0.886858
2,0.2603,0.311617,0.9169,0.921539,0.917047,0.917589
3,0.2137,0.282169,0.929,0.929931,0.929109,0.928981
4,0.1883,0.270789,0.9312,0.934023,0.931358,0.931556
5,0.1689,0.256467,0.9362,0.937201,0.93647,0.93611
6,0.1597,0.267777,0.9309,0.933351,0.930871,0.931282
7,0.1488,0.272582,0.9323,0.934309,0.932945,0.931894
8,0.1447,0.247279,0.9422,0.944623,0.942323,0.942457
9,0.1421,0.265452,0.9366,0.93978,0.936683,0.936808
10,0.1377,0.236977,0.9469,0.948278,0.947181,0.946943


TrainOutput(global_step=4382, training_loss=0.17989203114859975, metrics={'train_runtime': 1050.1591, 'train_samples_per_second': 761.789, 'train_steps_per_second': 5.961, 'total_flos': 1.13125578080256e+18, 'train_loss': 0.17989203114859975, 'epoch': 14.0})

In [44]:
# Switch the student to evaluation mode; the returned module is echoed as the cell output.
student_model_pretrained_whole.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [45]:
# Evaluate the fully distilled student on the held-out test split; metrics are shown below.
trainer.evaluate(test)

{'eval_loss': 0.2267916053533554,
 'eval_accuracy': 0.9489,
 'eval_precision': 0.9489642665551571,
 'eval_recall': 0.9489000000000001,
 'eval_f1': 0.9487862446825236,
 'eval_runtime': 12.5707,
 'eval_samples_per_second': 795.499,
 'eval_steps_per_second': 6.284,
 'epoch': 14.0}

In [46]:
# Persist the fully distilled student weights. The target directory is created first because
# torch.save does not create missing parent directories.
os.makedirs(f"{os.path.expanduser('~')}/models/{DATASET}", exist_ok=True)
torch.save(student_model_pretrained_whole.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/pretrained-distill.pth")

In [47]:
# Report the model size and the trainable parameters of the fully trained student.
base.count_parameters(student_model_pretrained_whole)

model size: 8.663MB.
Total Trainable Params: 2236682.


Unnamed: 0,Modules,Parameters
0,mobilenet_v2.conv_stem.first_conv.convolution....,864
1,mobilenet_v2.conv_stem.first_conv.normalizatio...,32
2,mobilenet_v2.conv_stem.first_conv.normalizatio...,32
3,mobilenet_v2.conv_stem.conv_3x3.convolution.we...,288
4,mobilenet_v2.conv_stem.conv_3x3.normalization....,32
...,...,...
153,mobilenet_v2.conv_1x1.convolution.weight,409600
154,mobilenet_v2.conv_1x1.normalization.weight,1280
155,mobilenet_v2.conv_1x1.normalization.bias,1280
156,classifier.weight,12800


In [48]:
# Same report for the frozen student; only the classifier parameters are listed as trainable.
base.count_parameters(student_model_pretrained)

model size: 8.663MB.
Total Trainable Params: 12810.


Unnamed: 0,Modules,Parameters
0,classifier.weight,12800
1,classifier.bias,10


In [49]:
# Inference-speed benchmark on the CPU using the single-sample CPU loader; the printed
# measurement reports 1000 runs, matching the last argument.
cpu_benchmark = base.BenchMarkRunner(student_model_pretrained_whole, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x7218ade95b40>
self.infer_speed_comp()
  22.01 ms
  1 measurement, 1000 runs , 4 threads


In [50]:
# Same benchmark on the GPU using the single-sample CUDA loader.
# NOTE(review): "cuda" is hard-coded here, so this cell presumably requires a GPU — confirm.
gpu_benchmark = base.BenchMarkRunner(student_model_pretrained_whole, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x7218adeaa5f0>
self.infer_speed_comp()
  8.72 ms
  1 measurement, 1000 runs , 4 threads
