In [None]:
# Optional environment setup: uncomment and run the lines below if the
# required packages are not already installed in this kernel's environment.
# %pip install datasets transformers onnx onnxruntime -q
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install onnx onnxruntime

We use a small distilled BERT model from Microsoft, `microsoft/xtremedistil-l6-h256-uncased`, as our pre-trained model and fine-tune it on the emotion classification task.
See https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased for details.

In [8]:
# Hugging Face Hub identifier of the pre-trained checkpoint used throughout the notebook.
model_name = "microsoft/xtremedistil-l6-h256-uncased"

In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Fetch the emotion dataset and the tokenizer that matches the checkpoint.
dataset = load_dataset("emotion")
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )


# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
# Training uses the "train" split; the "test" split serves as the evaluation set.
full_train_dataset, full_eval_dataset = (
    tokenized_datasets[split_name] for split_name in ("train", "test")
)

In [11]:
import torch

# Use the first CUDA device when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [5]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained checkpoint with a 6-label sequence-classification head
# and place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import numpy as np

# `datasets.load_metric` is deprecated and is no longer available in newer
# `datasets` releases. Use it when it can be imported (unchanged behaviour);
# otherwise fall back to a small local accuracy metric exposing the same
# `compute(predictions=..., references=...)` call used later in the notebook.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None


class _LocalAccuracy:
    """Minimal stand-in for the "accuracy" metric, implemented with numpy."""

    def compute(self, predictions, references):
        """Return ``{"accuracy": fraction of predictions equal to references}``.

        Raises:
            ValueError: if `predictions` and `references` differ in shape.
        """
        predicted = np.asarray(predictions)
        expected = np.asarray(references)
        if predicted.shape != expected.shape:
            raise ValueError(
                f"predictions shape {predicted.shape} does not match "
                f"references shape {expected.shape}"
            )
        return {"accuracy": float(np.mean(predicted == expected))}


metric = load_metric("accuracy") if load_metric is not None else _LocalAccuracy()


def compute_metrics(eval_pred):
    """Compute accuracy for a `(logits, labels)` pair.

    The predicted class of each example is the index of its largest logit
    along the last axis. Returns the dictionary produced by `metric.compute`.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [10]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for fine-tuning; the model is evaluated at the end of every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=128,
    num_train_epochs=24,
    learning_rate=3e-05,
    eval_strategy="epoch",
)

# Wire the model, the data splits and the accuracy callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
)

In [11]:
# Fine-tune the model using the Trainer configured in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.248516,0.6055
2,No log,0.928577,0.7095
3,No log,0.76504,0.756
4,1.084500,0.663393,0.7945
5,1.084500,0.574755,0.846
6,1.084500,0.51274,0.8685
7,1.084500,0.461245,0.8815
8,0.563000,0.420511,0.89
9,0.563000,0.383724,0.894
10,0.563000,0.354415,0.899


TrainOutput(global_step=3000, training_loss=0.45425546264648436, metrics={'train_runtime': 753.726, 'train_samples_per_second': 509.469, 'train_steps_per_second': 3.98, 'total_flos': 1417466806272000.0, 'train_loss': 0.45425546264648436, 'epoch': 24.0})

In [12]:
# Evaluate the fine-tuned model on the evaluation dataset given to the Trainer.
trainer.evaluate()

{'eval_loss': 0.23133312165737152,
 'eval_accuracy': 0.923,
 'eval_runtime': 3.0208,
 'eval_samples_per_second': 662.067,
 'eval_steps_per_second': 82.758,
 'epoch': 24.0}

Export the fine-tuned PyTorch model to ONNX format for serving with ONNX Runtime Web.

In [13]:
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

In [14]:
# Bundle the fine-tuned model and its tokenizer into a text-classification pipeline.
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

In [15]:
# Move the model to the CPU ahead of the ONNX export in the next cell.
model = model.to("cpu")

In [17]:
# Export the pipeline's PyTorch model to a single ONNX file (opset 14, no external data files).
onnx_convert.convert_pytorch(
    pipeline,
    opset=14,
    output=Path("classifier.onnx"),
    use_external_format=False,
)

Using framework PyTorch: 2.3.0+cu121
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']


In [19]:
from onnxruntime.quantization import quantize_dynamic, QuantType
# Write a dynamically quantized copy of the exported model, using unsigned 8-bit weights.
quantize_dynamic(
    "classifier.onnx",
    "classifier_int8.onnx",
    weight_type=QuantType.QUInt8,
)



Evaluate accuracy using ONNX Runtime inference, validating the exported ONNX models (original and int8-quantized) against the PyTorch evaluation result.

In [12]:
import onnxruntime as ort

In [13]:
# Open one inference session for the exported model and one for its quantized copy.
session, session_int8 = (
    ort.InferenceSession(model_file)
    for model_file in ("classifier.onnx", "classifier_int8.onnx")
)

In [14]:
import numpy as np

In [15]:
# Build one array per model input from the matching column of the evaluation set.
input_feed = {
    input_name: np.array(full_eval_dataset[input_name])
    for input_name in ("input_ids", "attention_mask", "token_type_ids")
}

In [17]:
import numpy as np

# Cast every array in `input_feed` to int64 before handing it to ONNX Runtime.
input_feed_converted = dict(
    (input_name, np.array(input_values, dtype=np.int64))
    for input_name, input_values in input_feed.items()
)

# Run both sessions on the same inputs and keep the single requested output of each.
out = session.run(['output_0'], input_feed_converted)[0]
out_int8 = session_int8.run(['output_0'], input_feed_converted)[0]


In [19]:
# `out` and `out_int8` already hold the results of running both sessions on
# `input_feed_converted` (computed in the preceding cell), so the full
# evaluation-set inference is not repeated here. Instead, check that each
# model returned one row of scores per evaluation example before the scores
# are turned into predictions.
num_eval_examples = len(input_feed_converted["input_ids"])
assert out.shape[0] == num_eval_examples, "ONNX output does not cover every evaluation example"
assert out_int8.shape == out.shape, "quantized and original ONNX outputs differ in shape"

In [20]:
# The predicted class of each example is the index of its highest score.
predictions, predictions_int8 = (
    scores.argmax(axis=-1) for scores in (out, out_int8)
)

In [23]:
# Accuracy of the exported (non-quantized) ONNX model on the evaluation set.
metric.compute(
    references=full_eval_dataset['label'],
    predictions=predictions,
)

{'accuracy': 0.923}

In [24]:
# Accuracy of the int8-quantized ONNX model on the evaluation set.
metric.compute(
    references=full_eval_dataset['label'],
    predictions=predictions_int8,
)

{'accuracy': 0.7855}