# Transformers inference acceleration

> 以基于bert的意图识别模型为例

In [2]:
import numpy as np
from pathlib import Path
from transformers import pipeline, AutoModelForSequenceClassification
from time import perf_counter
from datasets import load_dataset
import evaluate, torch

bert_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
pipe = pipeline("text-classification", model=bert_ckpt)

query = """Hey, I'd like to rent a vehicle from Nov 1st to Nov 15th
in
Paris and I need a 15 passenger van"""
result = pipe(query)
print(result)


class PerformanceBenchmark:

    def __init__(self, pipeline, dataset, optim_type="BERTbaseline"):
        self.pipeline = pipeline
        self.dataset = dataset
        self.optim_type = optim_type

    def compute_accuracy(self):
        preds, labels = [], []
        for example in self.dataset:
            pred = self.pipeline(example["text"])[0]["label"]
        label = example["intent"]
        preds.append(intents.str2int(pred))
        labels.append(label)
        accuracy = accuracy_score.compute(predictions=preds,
                                          references=labels)
        print(f"Accuracy on test set - {accuracy['accuracy']:.3f}")
        return accuracy

    def compute_size(self):
        state_dict = self.pipeline.model.state_dict()
        tmp_path = Path("model.pt")
        torch.save(state_dict, tmp_path)
        # Calculate size in megabytes
        size_mb = Path(tmp_path).stat().st_size / (1024 * 1024)
        # Delete temporary file
        tmp_path.unlink()
        print(f"Model size (MB) - {size_mb:.2f}")
        return {"size_mb": size_mb}

    def time_pipeline(self, query="What is the pin number for myaccount?"):
        latencies = []
        # Warmup
        for _ in range(10):
            _ = self.pipeline(query)
        # Timed run
        for _ in range(100):
            start_time = perf_counter()
        _ = self.pipeline(query)
        latency = perf_counter() - start_time
        latencies.append(latency)
        # Compute run statistics
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        print(f"Average latency (ms) - {time_avg_ms:.2f} +\-{time_std_ms: .2f}")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.time_pipeline())
        metrics[self.optim_type].update(self.compute_accuracy())
        return metrics


clinc = load_dataset("clinc_oos", "plus")

sample = clinc["test"][42]
print(sample)

intents = clinc["test"].features["intent"]
intents.int2str(sample["intent"])

accuracy_score = evaluate.load("accuracy")
print(accuracy_score)

list(pipe.model.state_dict().items())[42]
# torch.save(pipe.model.state_dict(), "model.pt")

pb = PerformanceBenchmark(pipe, clinc['test'])
perf_metrics = pb.run_benchmark()
print(perf_metrics)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'car_rental', 'score': 0.5490034818649292}]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'text': 'transfer $100 from my checking to saving account', 'intent': 133}
EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 

KeyboardInterrupt: 

In [3]:
!pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Collecting scikit-learn
  Downloading https://mirrors.aliyun.com/pypi/packages/a4/62/92e9cec3deca8b45abf62dd8f6469d688b3f28b9c170809fcc46f110b523/scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading https://mirrors.aliyun.com/pypi/packages/93/4a/50c436de1353cce8b66b26e49a687f10b91fe7465bf34e4565d810153003/scipy-1.10.1-cp38-cp38-macosx_12_0_arm64.whl (28.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting joblib>=1.1.1 (from scikit-learn)
  Downloading https://mirrors.aliyun.com/pypi/packages/91/29/df4b9b42f2be0b623cbd5e2140cafcaa2bef0759a00b7b70104dcfe2fb51/joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting threa

## Knowledge distillation 知识蒸馏

核心思路是：在真实标签的基础上，再加上一份来自教师模型的“软概率分布”。这份分布可以给学生模型提供额外的信息。

### Knowledge distillation for Fine-Tuning

举个例子，比如我们有个 BERT-base 分类器（老师），它对多个意图都给出了较高的概率。这可能说明这些意图在特征空间中彼此靠得很近。那我们就可以训练学生模型去“模仿”这些概率分布。

通过这种方式，我们希望把老师模型里学到的一些**“暗知识”（dark knowledge）**提炼出来给学生。所谓“暗知识”就是那些单靠真实标签是学不到的信息。
<br/><br/>
<img src="./imgs/student_teacher_soften_max.png" width="400"/>
<br/><br/>

*KL(Kullback–Leibler)* : 衡量学生老师模型分别生成的logits 概率差异，计算公式如下：
<br/><br/>

$D_KL(p, q) = \sum_{i}pi(x)log\frac{p_i(x)}{q_i(x)}$

<br/><br/>可用于定义知识蒸馏的loss <br/><br/>
$L_{KD} = T^2D_{KL}$

在知识蒸馏里我们会用温度 T 来“软化”老师模型的输出概率，但这样会让梯度变小（大概变成原来的 1/T²），所以我们引入一个 T² 的归一化系数来把梯度调回合适的范围。温度 T 越高，
softmax 输出的概率分布就越“平”，反之越尖锐，而这个平滑度直接影响训练时梯度的大小，所以引入 T² 是为了让蒸馏过程中的训练更稳定有效。
<br/><br/>
以分类任务为例，只是蒸馏的全过程如下: <br/><br/>
<img src="./imgs/knowledge_distill_classify_task.png" width="400"/>

### Knowledge distillation for PreTraining


In [14]:
#Creating a Knowledge Distillation Trainer
from transformers import TrainingArguments
print(torch.backends.mps.is_available())

class DistillationTrainingArguments(TrainingArguments):

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature


import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer
import json


class DistillationTrainer(Trainer):

    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        print("inputs", inputs)
        outputs_stu = model(**inputs)
        # extract cross-entropy loss and logits from student

        loss_ce = outputs_stu.loss
        logits_su = outputs_stu.logits

        #extract logits from teacher
        with torch.no_grad():
            outputs_teacher = self.teacher_model(**inputs)
            logits_teacher = outputs_teacher.logits

        # soften probs and compute distillation loss
        loss_fct = nn.KLDivLoss(reduction="batchmean")  # 使用KL公式计算diff
        loss_kd = self.args.temperature ** 2 * loss_fct(F.log_softmax(logits_su / self.args.temperature, dim=-1),
                                                        F.log_softmax(logits_teacher / self.args.temperature, dim=-1))
        loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd

        return (loss, outputs_stu) if return_outputs else loss


from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

stu_ckpt = "distilbert/distilbert-base-uncased"
stu_tokenizer = AutoTokenizer.from_pretrained(stu_ckpt)


def tokenize_text(batch):
    return stu_tokenizer(batch["text"], truncation=True)


clic_enc = clinc.map(tokenize_text, batched=True, remove_columns=["text"])
print('clic_enc:', clic_enc)
clic_enc = clic_enc.rename_column("intent", "labels")


def compute_metrics(pred):
    preds, labels = pred
    preds = np.argmax(preds, axis=-1)
    return accuracy_score.compute(predictions=preds, references=labels)


batch_size = 48
finetuned_ckpt = "../models/distilbert-base-uncased-finetuned-clinc"
student_training_args = DistillationTrainingArguments(
    output_dir=finetuned_ckpt, evaluation_strategy="epoch",
    num_train_epochs=5, learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, alpha=1,
    weight_decay=0.01, no_cuda=True,
    push_to_hub=False)

id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id
print("intents:", clinc["test"].features['intent'].num_classes)
num_labels = intents.num_classes
stu_config = (AutoConfig.from_pretrained(stu_ckpt, num_labels=num_labels, label2id=label2id, id2label=id2label))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device:', device)

def stu_init():
    return AutoModelForSequenceClassification.from_pretrained(stu_ckpt, config=stu_config).to(device)


# fine tune
teacher_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
teacher_model = (AutoModelForSequenceClassification.from_pretrained(teacher_ckpt, num_labels=num_labels).to(device))

distillbert_trainer = DistillationTrainer(model_init=stu_init, teacher_model=teacher_model, args=student_training_args,
                                          train_dataset=clic_enc['train'], eval_dataset=clic_enc['validation'], 
                                          compute_metrics=compute_metrics, tokenizer=stu_tokenizer)

print('training begin...........')
distillbert_trainer.train()

True
clic_enc: DatasetDict({
    train: Dataset({
        features: ['intent', 'input_ids', 'attention_mask'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['intent', 'input_ids', 'attention_mask'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['intent', 'input_ids', 'attention_mask'],
        num_rows: 5500
    })
})
intents: 151
device: cpu


  super().__init__(*args, **kwargs)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training begin...........
inputs {'labels': tensor([124,  58,   2,  68, 109, 133, 106, 148,  94,  50,  10,  49,  95,  99,
        140, 142,  23, 135,  50,  79, 127,  56,  76,   6,   1, 148, 143,  94,
        105, 103,  30,  42,  33, 106,  85,   5,  21,  64,  58, 133,   5, 149,
         23,  93, 110, 130, 107, 147]), 'input_ids': tensor([[  101,  4067,  2017,  ...,     0,     0,     0],
        [  101,  2425,  1996,  ...,     0,     0,     0],
        [  101,  1045,  2123,  ...,     0,     0,     0],
        ...,
        [  101,  2017,  2064,  ...,     0,     0,     0],
        [  101,  2008,  2097,  ...,     0,     0,     0],
        [  101,  3531, 20703,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,,0.722903
2,0.000000,,0.85129
3,0.000000,,0.891613


inputs {'labels': tensor([ 27, 148,  20,  67, 127, 100,  11,  94,  99,  42,  80,  58, 103,  95,
         42, 112,  35, 141,  79, 128,  42, 127, 137,   1,  45,  65,  50,  31,
        137,  68,  84,  83,  72,  77,  20,  43,  14, 102, 105, 121,  79,  52,
        137, 104,  69, 131,  17,  68]), 'input_ids': tensor([[  101,  2064,  2017,  2425,  2033,  2054,  1996,  3815,  2006,  2026,
          3622,  2615,  3021,  2003,   102,     0,     0,     0,     0],
        [  101,  2129,  2172, 10885,  2031,  1045,  2525,  2109,  2023,  2095,
           102,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2071,  2017, 13184,  2026,  4070,  2085,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 17542,  1996, 11079,  2012,  9724,  3871,  2005,  6838,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1045,  2342,  2000,  2344,  2062, 14148,  2005,  2026, 18833,
        

Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


inputs {'labels': tensor([ 96,   9,  42,  56,  96,  15,  93,  88,  69,  67, 105,  23, 143,  46,
        147,  19,  24,  78, 123,  14,  51,  49, 106, 146, 110,  14,  43,  78,
        113, 136, 112, 148,  88, 120, 146,  18,  89,  29, 129, 136,   8, 122,
         67,   2,   3,  37,  97,  81]), 'input_ids': tensor([[  101,  2040,  2719,  2017,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2515,  6901,  3748,  4777,  2202, 17829,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2129,  2146,  2079,  7318, 15210,  2202,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2006,  2054,  2154,  2079,  1045,  2031,  2000,  3477,  2026,
         13926, 15687,  3021,   102,     0,     0,     0,     0,     0,     0],
        [  101,  2040,  3833,  2017,   102,     0,     0,  

KeyboardInterrupt: 

## Quantization 量化

## Pruning 剪枝

## Graph optimization 图优化