In [2]:
import optuna

def objective(trial):
    """Toy objective used to exercise the pruner.

    Samples a single hyperparameter ``x`` in [0, 10] and, for each of 30
    simulated epochs, reports ``(x - 2) ** 2 / (epoch + 1)`` as the
    intermediate validation loss.

    Args:
        trial: The Optuna trial; used to sample ``x``, report intermediate
            values and ask whether the trial should be pruned.

    Returns:
        The loss reported at the final epoch.

    Raises:
        optuna.TrialPruned: As soon as ``trial.should_prune()`` returns true.
    """
    total_epochs = 30

    # The hyperparameter is fixed for the whole trial, so sample it once
    # instead of re-querying the trial on every epoch.
    x = trial.suggest_float("x", 0, 10)
    base_loss = (x - 2) ** 2

    for epoch in range(total_epochs):
        val_loss = base_loss / (epoch + 1)
        trial.report(val_loss, step=epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return val_loss

# Configure logging BEFORE the study runs. Set afterwards, the DEBUG messages
# would only show up if the cell were executed a second time.
optuna.logging.set_verbosity(optuna.logging.DEBUG)
optuna.logging.enable_propagation()

# max_resource matches the 30 epochs simulated inside `objective`.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=2,
    max_resource=30,
    reduction_factor=3,
    bootstrap_count=2,
)

study = optuna.create_study(direction="minimize", pruner=pruner)
study.optimize(objective, n_trials=200)

# Dump, per trial, the attributes stored by the pruner and every reported value.
for trial in study.trials:
    print(f"Trial {trial.number}: Rungs = {trial.system_attrs}, Intermediate Values = {trial.intermediate_values}")


[I 2025-01-04 19:01:18,996] A new study created in memory with name: no-name-3d77e9b8-cc6d-45a3-a019-3422d8c82f77
[D 2025-01-04 19:01:18,998] Hyperband has 3 brackets
[D 2025-01-04 19:01:18,999] 0th bracket is selected
[D 2025-01-04 19:01:18,999] 0th bracket is selected
[D 2025-01-04 19:01:19,000] 0th bracket is selected
[I 2025-01-04 19:01:19,001] Trial 0 pruned. 
[D 2025-01-04 19:01:19,001] 1th bracket is selected
[D 2025-01-04 19:01:19,002] 1th bracket is selected
[D 2025-01-04 19:01:19,002] 1th bracket is selected
[D 2025-01-04 19:01:19,003] 1th bracket is selected
[D 2025-01-04 19:01:19,003] 1th bracket is selected
[D 2025-01-04 19:01:19,003] 1th bracket is selected
[D 2025-01-04 19:01:19,004] 1th bracket is selected
[I 2025-01-04 19:01:19,004] Trial 1 pruned. 
[D 2025-01-04 19:01:19,005] 1th bracket is selected
[D 2025-01-04 19:01:19,005] 1th bracket is selected
[D 2025-01-04 19:01:19,006] 1th bracket is selected
[D 2025-01-04 19:01:19,006] 1th bracket is selected
[D 2025-01-04 1

Trial 0: Rungs = {'completed_rung_0': 6.012135160800031}, Intermediate Values = {0: 18.03640548240009, 1: 9.018202741200046, 2: 6.012135160800031}
Trial 1: Rungs = {'completed_rung_0': 6.933282990494645}, Intermediate Values = {0: 48.53298093346251, 1: 24.266490466731256, 2: 16.17766031115417, 3: 12.133245233365628, 4: 9.706596186692503, 5: 8.088830155577085, 6: 6.933282990494645}
Trial 2: Rungs = {'completed_rung_0': 1.0036679246114466}, Intermediate Values = {0: 7.025675472280127, 1: 3.5128377361400633, 2: 2.3418918240933757, 3: 1.7564188680700317, 4: 1.4051350944560252, 5: 1.1709459120466879, 6: 1.0036679246114466}
Trial 3: Rungs = {'completed_rung_0': 0.005282862501171592}, Intermediate Values = {0: 0.10037438752226024, 1: 0.05018719376113012, 2: 0.03345812917408675, 3: 0.02509359688056506, 4: 0.020074877504452047, 5: 0.016729064587043374, 6: 0.01433919821746575, 7: 0.01254679844028253, 8: 0.011152709724695583, 9: 0.010037438752226024, 10: 0.009124944320205476, 11: 0.00836453229352

In [7]:
# Install Transformers together with its `torch` extra (the recorded output shows `accelerate` being pulled in).
%pip install transformers[torch]

Collecting accelerate>=0.26.0 (from transformers[torch])
  Obtaining dependency information for accelerate>=0.26.0 from https://files.pythonhosted.org/packages/2c/92/48aec3736ca778ffe5fa68e19e3c18917cba4de43fa46fe6176cccafe267/accelerate-1.0.1-py3-none-any.whl.metadata
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.0.1-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
from huggingface_hub import notebook_login, HfFolder
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from evaluate import load
import numpy as np
from torch.utils.data import DataLoader
import evaluate

2024-11-11 20:14:18.172618: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-11 20:14:18.200435: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from datasets import load_dataset

# Load the "beans" dataset; its splits are indexed by name ("train", "validation", "test") further down.
dataset = load_dataset("beans")

In [3]:
from transformers import AutoImageProcessor
# Image processor loaded from the same checkpoint as the teacher model.
teacher_processor = AutoImageProcessor.from_pretrained("merve/beans-vit-224")


def process(examples):
    """Run the teacher's image processor over one batch of examples.

    Args:
        examples: Batch handed over by ``dataset.map``; only its ``"image"``
            entry is read.

    Returns:
        The processor output for those images, unchanged.
    """
    return teacher_processor(examples["image"])


# Apply the preprocessing to every split, one batch at a time.
processed_datasets = dataset.map(process, batched=True)

In [4]:
# Show the processed training split (feature names and row count).
print(processed_datasets["train"])

Dataset({
    features: ['image_file_path', 'image', 'labels', 'pixel_values'],
    num_rows: 1034
})


In [4]:
from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.nn.functional as F


class ImageDistilTrainer(Trainer):
    """``Trainer`` subclass that trains a student model against a teacher.

    The loss blends two terms:

    * the student's own loss on the true labels (``student_output.loss``), and
    * the KL divergence between the temperature-softened teacher and student
      distributions, multiplied by ``temperature ** 2``.

    ``lambda_param`` weights the distillation term and ``1 - lambda_param``
    weights the true-label term.
    """

    def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None,  *args, **kwargs):
        """Set up the trainer around ``student_model`` and prepare the teacher.

        Args:
            teacher_model: Model whose logits provide the soft targets. Required.
            student_model: Model being trained; forwarded to ``Trainer`` as ``model``.
            temperature: Divisor applied to both sets of logits before the softmax. Required.
            lambda_param: Weight of the distillation term in the final loss. Required.
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.

        Raises:
            ValueError: If ``teacher_model``, ``temperature`` or ``lambda_param``
                is left at ``None``.
        """
        # Fail early with a clear message instead of an AttributeError/TypeError
        # deep inside the first training step.
        if teacher_model is None:
            raise ValueError("teacher_model is required for distillation")
        if temperature is None or lambda_param is None:
            raise ValueError("temperature and lambda_param must both be set")

        super().__init__(model=student_model, *args, **kwargs)
        self.teacher = teacher_model
        self.student = student_model
        self.loss_function = nn.KLDivLoss(reduction="batchmean")
        # Use the device the Trainer resolved from its arguments so the teacher
        # always sits where the input batches are placed.
        self.teacher.to(self.args.device)
        self.teacher.eval()
        self.temperature = temperature
        self.lambda_param = lambda_param

    def compute_loss(self, student, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the blended true-label / distillation loss for one batch.

        Args:
            student: The model handed in by ``Trainer`` (possibly a wrapped
                version of the student); this is the module that is called.
            inputs: Keyword arguments passed to both student and teacher.
                They need to include the labels, because the true-label term
                is read from ``student_output.loss``.
            return_outputs: When true, also return the student's outputs.
            num_items_in_batch: Accepted for compatibility with ``Trainer``
                versions that pass it; not used here.

        Returns:
            ``loss`` or ``(loss, student_output)`` depending on ``return_outputs``.
        """
        # Call the model the Trainer passed in rather than the raw reference
        # kept on `self`, so any wrapping applied by the Trainer is respected.
        student_output = student(**inputs)

        # The teacher only provides targets; no gradients are needed for it.
        with torch.no_grad():
            teacher_output = self.teacher(**inputs)

        # Compute soft targets for teacher and student
        soft_teacher = F.softmax(teacher_output.logits / self.temperature, dim=-1)
        soft_student = F.log_softmax(student_output.logits / self.temperature, dim=-1)

        # Compute the loss
        distillation_loss = self.loss_function(soft_student, soft_teacher) * (self.temperature ** 2)

        # Compute the true label loss
        student_target_loss = student_output.loss

        # Calculate final loss
        loss = (1. - self.lambda_param) * student_target_loss + self.lambda_param * distillation_loss
        return (loss, student_output) if return_outputs else loss

In [5]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the training arguments defined later set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
from transformers import AutoModelForImageClassification, MobileNetV2Config, MobileNetV2ForImageClassification
repo_name = "DistilationTest"

# Training configuration for the distilled student: evaluate and checkpoint
# once per epoch, keep the checkpoint with the best accuracy, and push each
# saved checkpoint to the Hub repository named above.
training_args = TrainingArguments(
    output_dir="my-awesome-model",
    num_train_epochs=30,
    # Mixed precision needs a CUDA device; fall back to full precision
    # elsewhere instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    seed=42,
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)

# Number of classes, read from the label feature of the training split.
num_labels = len(processed_datasets["train"].features["labels"].names)

# initialize models
teacher_model = AutoModelForImageClassification.from_pretrained(
    "merve/beans-vit-224",
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

# training MobileNetV2 from scratch
student_config = MobileNetV2Config()
student_config.num_labels = num_labels
student_model = MobileNetV2ForImageClassification(student_config)

In [16]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class is
            the argmax of ``predictions`` along axis 1.

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_classes, references=references)
    return {"accuracy": result["accuracy"]}

In [17]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

# Distillation run: student and teacher plus the two loss hyperparameters,
# followed by the regular Trainer arguments.
trainer = ImageDistilTrainer(
    teacher_model=teacher_model,
    student_model=student_model,
    temperature=5,
    lambda_param=0.5,
    args=training_args,
    data_collator=data_collator,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    tokenizer=teacher_processor,
    compute_metrics=compute_metrics,
)

In [18]:
# Run the distillation training configured above (30 epochs per training_args).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.883,0.779409,0.406015
2,0.8415,0.756176,0.511278
3,0.7763,0.70821,0.56391
4,0.7487,0.691835,0.473684
5,0.682,0.645402,0.601504
6,0.6323,0.631559,0.571429
7,0.614,0.596645,0.661654
8,0.5677,0.598456,0.586466
9,0.5386,0.601588,0.601504
10,0.5164,0.624563,0.616541


No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=510, training_loss=0.4610541250191483, metrics={'train_runtime': 2622.7706, 'train_samples_per_second': 11.827, 'train_steps_per_second': 0.194, 'total_flos': 6.24122679527424e+16, 'train_loss': 0.4610541250191483, 'epoch': 30.0})

In [19]:
# Put the trained student in evaluation mode; the cell output is the module's printed structure.
student_model.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [20]:
# Evaluate the distilled student on the held-out test split.
# NOTE(review): eval_loss here presumably comes from ImageDistilTrainer.compute_loss
# (the blended loss), so it may not be comparable with the baseline's eval_loss — confirm.
trainer.evaluate(processed_datasets["test"])

{'eval_loss': 0.5066142082214355,
 'eval_accuracy': 0.6875,
 'eval_runtime': 9.7214,
 'eval_samples_per_second': 13.167,
 'eval_steps_per_second': 0.206,
 'epoch': 30.0}

In [25]:
# Baseline for comparison: the same MobileNetV2 architecture trained from
# scratch on the true labels only, without a teacher.
base_config = MobileNetV2Config()
base_config.num_labels = num_labels
base_model = MobileNetV2ForImageClassification(base_config)

# NOTE: this rebinds `repo_name`, which the distillation cell also defines.
repo_name = "DistilationTestComparison"

# Same schedule, batch sizes, seed and learning rate as the distillation run;
# nothing is pushed to the Hub here.
argsv2 = TrainingArguments(
    output_dir="my-awesome-model-base-v2",
    num_train_epochs=30,
    # Mixed precision needs a CUDA device; fall back to full precision
    # elsewhere instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    seed=42,
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)

trainer_base = Trainer(
    model=base_model,
    args=argsv2,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    compute_metrics=compute_metrics,
)

In [26]:
# Train the baseline model (no teacher) for the 30 epochs set in argsv2.
trainer_base.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.2578,1.105976,0.345865
2,1.1867,1.080333,0.398496
3,1.1758,1.079312,0.398496
4,1.1529,1.048753,0.406015
5,1.0698,1.016128,0.503759
6,1.0252,0.972632,0.548872
7,0.9349,1.025278,0.593985
8,0.9105,1.022724,0.56391
9,0.8164,1.049483,0.601504
10,0.7062,1.254436,0.654135


TrainOutput(global_step=510, training_loss=0.4669638855784547, metrics={'train_runtime': 2060.3369, 'train_samples_per_second': 15.056, 'train_steps_per_second': 0.248, 'total_flos': 6.24122679527424e+16, 'train_loss': 0.4669638855784547, 'epoch': 30.0})

In [27]:
# Put the trained baseline in evaluation mode; the cell output is the module's printed structure.
base_model.eval()

MobileNetV2ForImageClassification(
  (mobilenet_v2): MobileNetV2Model(
    (conv_stem): MobileNetV2Stem(
      (first_conv): MobileNetV2ConvLayer(
        (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (conv_3x3): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
        (normalization): BatchNorm2d(32, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
        (activation): ReLU6()
      )
      (reduce_1x1): MobileNetV2ConvLayer(
        (convolution): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (normalization): BatchNorm2d(16, eps=0.001, momentum=0.997, affine=True, track_running_stats=True)
      )
    )
    (layer): ModuleList(
      (0): MobileNetV2InvertedResidual(
        (expand_1x1): MobileNe

In [28]:
# Evaluate the baseline on the same held-out test split used for the distilled student.
trainer_base.evaluate(processed_datasets["test"])

{'eval_loss': 1.299200177192688,
 'eval_accuracy': 0.59375,
 'eval_runtime': 7.5802,
 'eval_samples_per_second': 16.886,
 'eval_steps_per_second': 0.264,
 'epoch': 30.0}