# MODEL PRUNING

In [1]:
from transformers import AutoTokenizer

main_model = "textattack/bert-base-uncased-SST-2"

tokenizer = AutoTokenizer.from_pretrained(main_model)

dataset_id="glue"
dataset_config="sst2"

from datasets import load_dataset

dataset = load_dataset(dataset_id,dataset_config)
dataset



DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [2]:
def process(examples):
    tokenized_inputs = tokenizer(
        examples["sentence"], truncation=True, max_length=512
    )
    return tokenized_inputs

tokenized_datasets = dataset.map(process, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label","labels")

tokenized_datasets["test"].features

{'sentence': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [3]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from huggingface_hub import HfFolder

# create label2id, id2label dicts for nice outputs for the model
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

pruned_model = AutoModelForSequenceClassification.from_pretrained(
    main_model,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)



In [4]:
!pip install torch-pruning

from transformers import AutoTokenizer, BertModel, BertForSequenceClassification
import torch
from transformers.models.bert.modeling_bert import BertSelfAttention
import torch_pruning as tp

#tokenizer = AutoTokenizer.from_pretrained(main_model)
#model = BertModel.from_pretrained(main_model)
#print(model)
hf_inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
example_inputs = {'input_ids': hf_inputs['input_ids'], 'token_type_ids': hf_inputs['token_type_ids'], 'attention_mask': hf_inputs['attention_mask']}

#outputs = model(**example_inputs)
#last_hidden_states = outputs.last_hidden_state

imp = tp.importance.MagnitudeImportance(p=2, group_reduction="mean")
base_macs, base_params = tp.utils.count_ops_and_params(pruned_model, example_inputs)
num_heads = {}

# All heads should be pruned simultaneously, so we group channels by head.
for m in pruned_model.modules():
    if isinstance(m, BertSelfAttention):
        num_heads[m.query] = m.num_attention_heads
        num_heads[m.key] = m.num_attention_heads
        num_heads[m.value] = m.num_attention_heads

pruner = tp.pruner.MetaPruner(
    pruned_model, 
    example_inputs, 
    global_pruning=False, # If False, a uniform pruning ratio will be assigned to different layers.
    importance=imp, # importance criterion for parameter selection
    iterative_steps=1, # the number of iterations to achieve target pruning ratio
    pruning_ratio=0.5,
    num_heads=num_heads,
    prune_head_dims=False,
    prune_num_heads=True,
    head_pruning_ratio=0.5,
    output_transform=lambda out: out.logits.sum(),
    ignored_layers=[pruned_model.bert.pooler],
)

for g in pruner.step(interactive=True):
    #print(g)
    g.prune()

# Modify the attention head size and all head size after pruning
for m in pruned_model.modules():
    if isinstance(m, BertSelfAttention):
        print("Num heads: %d, head size: %d =>"%(m.num_attention_heads, m.attention_head_size))
        m.num_attention_heads = pruner.num_heads[m.query]
        m.attention_head_size = m.query.out_features // m.num_attention_heads
        m.all_head_size = m.query.out_features
        print("Num heads: %d, head size: %d"%(m.num_attention_heads, m.attention_head_size))
        print()
        
for m in pruned_model.modules():
    if isinstance(m, BertForSequenceClassification):
        m.classifier = torch.nn.Linear(m.classifier.in_features, 2)  # Set out_features = 2 for binary classification
        print(f"Modified classifier output to {m.classifier.out_features}")

print(pruned_model)
test_output = pruned_model(**example_inputs)
pruned_macs, pruned_params = tp.utils.count_ops_and_params(pruned_model, example_inputs)
print("Base MACs: %f M, Pruned MACs: %f M"%(base_macs/1e6, pruned_macs/1e6))
print("Base Params: %f M, Pruned Params: %f M"%(base_params/1e6, pruned_params/1e6))

# Calculate the difference in parameters before and after pruning
params_diff = base_params - pruned_params

# Print the comparison
print("Difference in Params: %f M"%(params_diff/1e6))



You should consider upgrading via the '/Users/christophknaden/venv/bin/python3 -m pip install --upgrade pip' command.[0m
Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Num heads: 12, head size: 64 =>
Num heads: 6, head size: 64

Modified classifier output to 2
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Emb

In [None]:
print(teacher_model)

# Prune Basic Training example

In [15]:

# Evalutation METRICS
%pip install evaluate
%pip install scikit-learn

from evaluate import load
import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Load the accuracy metric
accuracy_metric = load("accuracy")

# Define metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    return {
        "accuracy": acc["accuracy"],
    }


#if torch.backends.mps.is_available():
#    device = torch.device("mps")
#else:
#    device = torch.device("cpu")
device = torch.device("cpu")
print(f"Using device: {device}")

%pip show torch


training_args = TrainingArguments(
    output_dir="bla",
    num_train_epochs=2,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    fp16=False,
    learning_rate=6e-5,
    seed=33,
    # logging & evaluation strategies
    logging_dir=f"bla/logs",
    logging_strategy="epoch", # to get more information to TB
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard"
)

# define data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


basic_trainer = Trainer (
    pruned_model,
    training_args,
    train_dataset=tokenized_datasets["train"].select(range(10000)),
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/Users/christophknaden/venv/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/Users/christophknaden/venv/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Using device: cpu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: torch
Version: 2.5.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /Users/christophknaden/venv/lib/python3.9/site-packages
Requires: filelock, fsspec, typing-extensions, networkx, jinja2, sympy
Required-by: torchvision, torchaudio, torch-pruning, accelerate
Note: you may need to restart the kernel to use updated packages.


  basic_trainer = Trainer (


In [16]:
basic_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5606,0.561419,0.733945
2,0.4018,0.486198,0.77867


TrainOutput(global_step=158, training_loss=0.4812428679647325, metrics={'train_runtime': 211.1762, 'train_samples_per_second': 94.708, 'train_steps_per_second': 0.748, 'total_flos': 128378198111040.0, 'train_loss': 0.4812428679647325, 'epoch': 2.0})

# Add Header and test pruned model

In [17]:
import torch.nn as nn

# Define a simple classification head
class SentimentClassifier(nn.Module):
    def __init__(self, hidden_size, num_labels):
        super(SentimentClassifier, self).__init__()
        self.fc = nn.Linear(hidden_size, num_labels)
    
    def forward(self, x):
        return self.fc(x)

# Perform sentiment analysis
test_sentence = "The movie was bad."
test_inputs = tokenizer(test_sentence, return_tensors="pt")

# Forward pass through the pruned BERT model
with torch.no_grad():
    outputs = pruned_model(**test_inputs, output_hidden_states=True)
    cls_embedding = outputs.hidden_states[-1][:, 0, :]  # Extract [CLS] token embedding

# Create classification head with adjusted hidden size
hidden_size = cls_embedding.shape[-1]#outputs.last_hidden_state.shape[-1]  # Match pruned model's hidden size
num_labels = num_labels  # Binary sentiment classification
classification_head = SentimentClassifier(hidden_size, num_labels)



# Pass through the updated classification head
logits = classification_head(cls_embedding)
probabilities = torch.nn.functional.softmax(logits, dim=-1)
predicted_class = torch.argmax(probabilities, dim=-1)

# Output sentiment prediction
predicted_probability = probabilities[0, predicted_class].item()  # Get probability of predicted class
predicted_sentiment = 'Positive' if predicted_class.item() == 1 else 'Negative'
print(f"Predicted sentiment: {predicted_sentiment} (Probability: {predicted_probability:.4f})")

RuntimeError: Placeholder storage has not been allocated on MPS device!

Welcome to our end-to-end task-specific knowledge distilattion Text-Classification example using Transformers, PyTorch & Amazon SageMaker. Distillation is the process of training a small "student" to mimic a larger "teacher". In this example, we will use [BERT-base](https://huggingface.co/textattack/bert-base-uncased-SST-2) as Teacher and [BERT-Tiny](https://huggingface.co/google/bert_uncased_L-2_H-128_A-2) as Student. We will use [Text-Classification](https://huggingface.co/tasks/text-classification) as task-specific knowledge distillation task and the [Stanford Sentiment Treebank v2 (SST-2)](https://paperswithcode.com/dataset/sst) dataset for training.


They are two different types of knowledge distillation, the Task-agnostic knowledge distillation (right) and the Task-specific knowledge distillation (left). In this example we are going to use the Task-specific knowledge distillation.

![knowledge-distillation](./imgs/knowledge-distillation.png)
_Task-specific distillation (left) versus task-agnostic distillation (right). Figure from FastFormers by Y. Kim and H. Awadalla [arXiv:2010.13382]._


In Task-specific knowledge distillation a "second step of distillation" is used to "fine-tune" the model on a given dataset. This idea comes from the [DistilBERT paper](https://arxiv.org/pdf/1910.01108.pdf) where it was shown that a student performed better than simply finetuning the distilled language model:

> We also studied whether we could add another step of distillation during the adaptation phase by fine-tuning DistilBERT on SQuAD using a BERT model previously fine-tuned on SQuAD as a teacher for an additional term in the loss (knowledge distillation). In this setting, there are thus two successive steps of distillation, one during the pre-training phase and one during the adaptation phase. In this case, we were able to reach interesting performances given the size of the model:79.8 F1 and 70.4 EM, i.e. within 3 points of the full model.

If you are more interested in those topics you should defintely read:
* [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108)
* [FastFormers: Highly Efficient Transformer Models for Natural Language Understanding](https://arxiv.org/abs/2010.13382)

Especially the [FastFormers paper](https://arxiv.org/abs/2010.13382) contains great research on what works and doesn't work when using knowledge distillation.

---

Huge thanks to [Lewis Tunstall](https://www.linkedin.com/in/lewis-tunstall/) and his great [Weeknotes: Distilling distilled transformers](https://lewtun.github.io/blog/weeknotes/nlp/huggingface/transformers/2021/01/17/wknotes-distillation-and-generation.html#fn-1)


## Installation

In [None]:
%pip install "pytorch==1.10.1"
%pip install transformers datasets tensorboard --upgrade
#!sudo apt-get install git-lfs
!brew install git-lfs


This example will use the [Hugging Face Hub](https://huggingface.co/models) as remote model versioning service. To be able to push our model to the Hub, you need to register on the [Hugging Face](https://huggingface.co/join).
If you already have an account you can skip this step.
After you have an account, we will use the `notebook_login` util from the `huggingface_hub` package to log into our account and store our token (access key) on the disk.

In [None]:
from huggingface_hub import login

login()  # displays a widget in a notebook, a prompt in terminal otherwise


## Setup & Configuration

In this step we will define global configurations and paramters, which are used across the whole end-to-end fine-tuning proccess, e.g. `teacher` and `studen` we will use.

In this example, we will use [BERT-base](textattack/bert-base-uncased-SST-2) as Teacher and [BERT-Tiny](https://huggingface.co/google/bert_uncased_L-2_H-128_A-2) as Student. Our Teacher is already fine-tuned on our dataset, which makes it easy for us to directly start the distillation training job rather than fine-tuning the teacher first to then distill it afterwards.

_**IMPORTANT**: This example will only work with a `Teacher` & `Student` combination where the Tokenizer is creating the same output._

Additionally, describes the [FastFormers: Highly Efficient Transformer Models for Natural Language Understanding](https://arxiv.org/abs/2010.13382) paper an additional phenomenon.
> In our experiments, we have observed that dis-
tilled models do not work well when distilled to a
different model type. Therefore, we restricted our
setup to avoid distilling RoBERTa model to BERT
or vice versa. The major difference between the
two model groups is the input token (sub-word) em-
bedding. We think that different input embedding
spaces result in different output embedding spaces,
and knowledge transfer with different spaces does
not work well

In [None]:
student_id = "google/bert_uncased_L-2_H-128_A-2"
teacher_id = "textattack/bert-base-uncased-SST-2"

# name for our repository on the hub
repo_name = "tiny-bert-sst2-distilled_combined"

Below are some checks to make sure the `Teacher` & `Student` are creating the same output.

In [None]:
from transformers import AutoTokenizer

# init tokenizer
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

# sample input
sample = "This is a basic example, with different words to test."

# assert results
assert teacher_tokenizer(sample) == student_tokenizer(sample), "Tokenizers haven't created the same output"


## Dataset & Pre-processing

As Dataset we will use the [Stanford Sentiment Treebank v2 (SST-2)](https://paperswithcode.com/dataset/sst) a text-classification for `sentiment-analysis`, which is included in the [GLUE benchmark](https://gluebenchmark.com/). The dataset is based on the dataset introduced by Pang and Lee (2005) and consists of 11,855 single sentences extracted from movie reviews. It was parsed with the Stanford parser and includes a total of 215,154 unique phrases from those parse trees, each annotated by 3 human judges. It uses the two-way (positive/negative) class split, with only sentence-level labels.


In [None]:
dataset_id="glue"
dataset_config="sst2"

To load the `sst2` dataset, we use the `load_dataset()` method from the 🤗 Datasets library.


In [None]:
from datasets import load_dataset

dataset = load_dataset(dataset_id,dataset_config)
dataset

### Pre-processing & Tokenization

To distill our model we need to convert our "Natural Language" to token IDs. This is done by a 🤗 Transformers Tokenizer which will tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary). If you are not sure what this means check out [chapter 6](https://huggingface.co/course/chapter6/1?fw=tf) of the Hugging Face Course.

We are going to use the tokenizer of the `Teacher`, but since both are creating same output you could also go with the `Student` tokenizer.


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(teacher_id)

Additionally we add the `truncation=True` and `max_length=512` to align the length and truncate texts that are bigger than the maximum size allowed by the model.

In [None]:
def process(examples):
    tokenized_inputs = tokenizer(
        examples["sentence"], truncation=True, max_length=512
    )
    return tokenized_inputs

tokenized_datasets = dataset.map(process, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label","labels")

tokenized_datasets["test"].features

## Distilling the model using `PyTorch` and `DistillationTrainer`


Now that our `dataset` is processed, we can distill it. Normally, when fine-tuning a transformer model using PyTorch you should go with the `Trainer-API`. The [Trainer](https://huggingface.co/docs/transformers/v4.16.1/en/main_classes/trainer#transformers.Trainer) class provides an API for feature-complete training in PyTorch for most standard use cases.

In our example we cannot use the `Trainer` out-of-the-box, since we need to pass in two models, the `Teacher` and the `Student` and compute the loss for both. But we can subclass the `Trainer` to create a `DistillationTrainer` which will take care of it and only overwrite the [compute_loss](https://github.com/huggingface/transformers/blob/c4ad38e5ac69e6d96116f39df789a2369dd33c21/src/transformers/trainer.py#L1962) method as well as the `init` method. In addition to this we also need to subclass the `TrainingArguments` to include the our distillation hyperparameters.


In [None]:
%%writefile custom_args.py
from transformers import TrainingArguments

class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature


In [None]:
import importlib
import custom_args

importlib.reload(custom_args)
from custom_args import DistillationTrainingArguments


In [None]:
%pip install torch torchvision torchaudio
from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.nn.functional as F

#class DistillationTrainingArguments(TrainingArguments):
#    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
#        super().__init__(*args, **kwargs)
#
#        self.alpha = alpha
#        self.temperature = temperature

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, accelerator=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model

        self.accelerator = accelerator if accelerator else Accelerator()
        # Place teacher model on the correct device using the accelerator
        self.teacher = self.accelerator.prepare(self.teacher)
        
        # place teacher on same device as student
        #self._move_model_to_device(self.teacher,self.model.device)
        self.teacher.eval()

def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    # compute student output
    outputs_student = model(**inputs)
    student_loss = outputs_student.loss

    # compute teacher output
    with torch.no_grad():
        outputs_teacher = self.teacher(**inputs)

    # assert size
    assert outputs_student.logits.size() == outputs_teacher.logits.size()

    # Soften probabilities and compute distillation loss
    loss_function = nn.KLDivLoss(reduction="batchmean")
    loss_logits = (loss_function(
        F.log_softmax(outputs_student.logits / self.args.temperature, dim=-1),
        F.softmax(outputs_teacher.logits / self.args.temperature, dim=-1)) * (self.args.temperature ** 2))

    # Return weighted student loss
    loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
    return (loss, outputs_student) if return_outputs else loss


### Hyperparameter Definition, Model Loading

In [None]:
%pip install transformers[torch]
%pip install "accelerate==1.2.1"

from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from huggingface_hub import HfFolder

# create label2id, id2label dicts for nice outputs for the model
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# define training args
training_args = DistillationTrainingArguments(
    output_dir=repo_name,
    num_train_epochs=2,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    fp16=False,
    learning_rate=6e-5,
    seed=33,
    # logging & evaluation strategies
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch", # to get more information to TB
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    # distilation parameters
    alpha=0.5,
    temperature=4.0
    )


# define data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# define model
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# define student model
#student_model = AutoModelForSequenceClassification.from_pretrained(
#    student_id,
#    num_labels=num_labels,
#    id2label=id2label,
#    label2id=label2id,
#)

### Evaluation metric

we can create a `compute_metrics` function to evaluate our model on the test set. This function will be used during the training process to compute the `accuracy` & `f1` of our model.

In [None]:
%pip install evaluate
%pip install scikit-learn

from evaluate import load
import numpy as np

# Load the accuracy metric
accuracy_metric = load("accuracy")

# Define metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    return {
        "accuracy": acc["accuracy"],
    }


In [None]:
from transformers import TrainingArguments

class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature


## Training

Start training with calling `trainer.train`

In [None]:
#if torch.backends.mps.is_available():
#    device = torch.device("mps")
#else:
#    device = torch.device("cpu")
device = torch.device("cpu")
print(f"Using device: {device}")

%pip show torch


In [None]:
from accelerate import Accelerator
accelerator = Accelerator(cpu=True)
trainer = DistillationTrainer(
    pruned_model,
    training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"].select(range(10000)),
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    accelerator=accelerator
)

start training using the `DistillationTrainer`.

In [None]:
trainer.train()

## Hyperparameter Search for Distillation parameter `alpha` & `temperature` with optuna

The parameter `alpha` & `temparature` in the `DistillationTrainer` can also be used when doing Hyperparamter search to maxizime our "knowledge extraction". As Hyperparamter Optimization framework are we using [Optuna](https://optuna.org/), which has a integration into the `Trainer-API`. Since we the `DistillationTrainer` is a sublcass of the `Trainer` we can use the `hyperparameter_search` without any code changes.


In [None]:
%pip install optuna

To do Hyperparameter Optimization using `optuna` we need to define our hyperparameter space. In this example we are trying to optimize/maximize the `num_train_epochs`, `learning_rate`, `alpha` & `temperature` for our `student_model`.

In [None]:
def hp_space(trial):
    return {
      "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 2),
      "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-3 ,log=True),
      "alpha": trial.suggest_float("alpha", 0, 1),
      "temperature": trial.suggest_int("temperature", 2, 30),
      }

To start our Hyperparmeter search we just need to call `hyperparameter_search` provide our `hp_space` and number of trials to run.

In [None]:
def student_init():
    return pruned_model
#    return AutoModelForSequenceClassification.from_pretrained(
#        student_id,
#        num_labels=num_labels,
#        id2label=id2label,
#        label2id=label2id
#    )

trainer = DistillationTrainer(
    model_init=student_init,
    args=training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    accelerator=accelerator
)
best_run = trainer.hyperparameter_search(
    n_trials=2,
    direction="maximize",
    hp_space=hp_space
)

print(best_run)

Since optuna is just finding the best hyperparameters we need to fine-tune our model again using the best hyperparamters from the `best_run`.

In [None]:
# overwrite initial hyperparameters with from the best_run
for k,v in best_run.hyperparameters.items():
    setattr(training_args, k, v)

# Define a new repository to store our distilled model
best_model_ckpt = "tiny-bert-best"
training_args.output_dir = best_model_ckpt

We have overwritten the default Hyperparameters with the one from our `best_run` and can start the training now.

In [None]:
# Create a new Trainer with optimal parameters
optimal_trainer = DistillationTrainer(
    pruned_model,
    training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

optimal_trainer.train()


# save best model, metrics and create model card
trainer.create_model_card(model_name=training_args.hub_model_id)
trainer.push_to_hub()

In [None]:
from huggingface_hub import HfApi

whoami = HfApi().whoami()
username = whoami['name']

print(f"https://huggingface.co/{username}/{repo_name}")

In [None]:
print(pruned_model.config.id2label)


In [None]:
from transformers import pipeline
import torch

# Assuming teacher_model and tokenizer are already loaded

# Initialize the classifier pipeline with CPU device
device = torch.device("cpu")
classifier = pipeline("text-classification", model=teacher_model, tokenizer=tokenizer, device=-1)  # -1 means CPU

# Ensure the model's id2label mapping is correct
teacher_model.config.id2label = {0: 'negative', 1: 'positive'}
teacher_model.config.label2id = {'negative': 0, 'positive': 1}

# Example prompts
prompts = [
    "This movie was amazing!",
    "I hated the ending.",
    "The acting was mediocre.",
    "It was an okay film.",
    "A truly captivating experience!"
]

# Process each prompt and print results
for prompt in prompts:
    # Run classification
    result = classifier(prompt)[0]  # Get the result for the first prompt
    
    # Check the output result
    print(f"Prompt: {prompt}")
    print(f"Raw Result: {result}")
    
    # Ensure the label is correctly mapped
    label = result['label']
    
    # Convert to the correct label if needed (this step may be redundant)
    if isinstance(label, str):  # If it's a string, print it directly
        print(f"Label: {label}, Score: {result['score']}\n")
    else:  # If it's an index, use id2label to convert
        label = teacher_model.config.id2label[label]
        print(f"Label: {label}, Score: {result['score']}\n")


In [None]:
from transformers import pipeline
import torch

# Assuming student_model and tokenizer are already loaded

# Initialize the classifier pipeline with CPU device
device = torch.device("cpu")
classifier = pipeline("text-classification", model=pruned_model, tokenizer=tokenizer, device=-1)  # -1 means CPU

# Ensure the model's id2label mapping is correct
pruned_model.config.id2label = {0: 'negative', 1: 'positive'}
pruned_model.config.label2id = {'negative': 0, 'positive': 1}

# Example prompts
prompts = [
    "This movie was amazing!",
    "I hated the ending.",
    "The acting was mediocre.",
    "It was an okay film.",
    "A truly captivating experience!"
]

# Process each prompt and print results
for prompt in prompts:
    # Run classification
    result = classifier(prompt)[0]  # Get the result for the first prompt
    
    # Check the output result
    print(f"Prompt: {prompt}")
    print(f"Raw Result: {result}")
    
    # Ensure the label is correctly mapped
    label = result['label']
    
    # Convert to the correct label if needed (this step may be redundant)
    if isinstance(label, str):  # If it's a string, print it directly
        print(f"Label: {label}, Score: {result['score']}\n")
    else:  # If it's an index, use id2label to convert
        label = pruned_model.config.id2label[label]
        print(f"Label: {label}, Score: {result['score']}\n")


## Results

We were able to achieve a `accuracy` of 0.8337, which is a very good result for our model. Our distilled `Tiny-Bert` has 96% less parameters than the teacher `bert-base` and runs ~46.5x faster while preserving over 90% of BERT’s performances as measured on the SST2 dataset.

| model | Parameter | Speed-up | Accuracy |
|------------|-----------|----------|----------|
| BERT-base  | 109M      | 1x       | 93%      |
| tiny-BERT  | 4M        | 46.5x    | 83%      |

_Note: The [FastFormers paper](https://arxiv.org/abs/2010.13382) uncovered that the biggest boost in performance is observerd when having 6 or more layers in the student. The [google/bert_uncased_L-2_H-128_A-2](https://huggingface.co/google/bert_uncased_L-2_H-128_A-2) we used only had 2, which means when changing our student to, e.g. `distilbert-base-uncased` we should better performance in terms of accuracy._

If you are now planning to implement and add task-specific knowledge distillation to your models. I suggest to take a look at the [sagemaker-distillation](https://github.com/philschmid/knowledge-distillation-transformers-pytorch-sagemaker/blob/master/sagemaker-distillation.ipynb), which shows how to run task-specific knowledge distillation on Amazon SageMaker. For the example i created a script deriving this notebook to make it as easy as possible to use for you. You only need to define your `teacher_id`, `student_id` as well as your `dataset` config to run task-specific knowledge distillation for `text-classification`.

```python
from sagemaker.huggingface import HuggingFace

# hyperparameters, which are passed into the training job
hyperparameters={
    'teacher_id':'textattack/bert-base-uncased-SST-2',           
    'student_id':'google/bert_uncased_L-2_H-128_A-2',           
    'dataset_id':'glue',           
    'dataset_config':'sst2',             
    # distillation parameter
    'alpha': 0.5,
    'temparature': 4,
    # hpo parameter
    "run_hpo": True,
    "n_trials": 100,            
}

# create the Estimator
huggingface_estimator = HuggingFace(..., hyperparameters=hyperparameters)

# start knwonledge distillation training
huggingface_estimator.fit()
```