In [1]:
# %pip install datasets==2.18.0 transformers onnx onnxruntime -q
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install onnx==1.16.1 # pinned to 1.16.1 to avoid a DLL error
# !pip install onnxruntime

We use the small distilled BERT model from Microsoft as our pre-trained model which we fine-tune on the emotion classification task.
See https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased for details.

In [2]:
# Hugging Face Hub id of the pre-trained checkpoint that the rest of the notebook fine-tunes.
model_name = "microsoft/xtremedistil-l6-h256-uncased"

In [3]:
# Load the "emotion" dataset through the Hugging Face `datasets` library.
from datasets import load_dataset
dataset = load_dataset("emotion")

from transformers import AutoTokenizer
# Initialize the pretrained tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples, max_length=128):
    """Tokenize the "text" field of a batch of examples to a fixed length.

    Every sequence is padded up to, or truncated down to, exactly
    ``max_length`` tokens (``padding="max_length"``, ``truncation=True``).

    Args:
        examples: Batch of examples indexable by ``"text"`` (for instance the
            batch handed over by ``dataset.map(..., batched=True)``).
        max_length: Target sequence length in tokens. Defaults to 128, the
            value this notebook previously hard-coded, so existing callers
            such as ``dataset.map(tokenize_function, batched=True)`` behave
            exactly as before.

    Returns:
        The object returned by the module-level ``tokenizer`` for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
# Run `tokenize_function` over `dataset`, handing it batches of examples (batched=True).
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [4]:
# Split used to fine-tune the model.
full_train_dataset = tokenized_datasets["train"]
# Split used for evaluation.
# NOTE(review): the "validation" split is not used anywhere in this notebook; confirm
# that evaluating on "test" while training is intended.
full_eval_dataset = tokenized_datasets["test"]

In [5]:
import torch
# Prefer the first CUDA device when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [29]:
from transformers import AutoModelForSequenceClassification
# Load the pre-trained checkpoint named by `model_name` for sequence classification.
# num_labels=6 is presumably the number of emotion classes in the dataset; TODO confirm.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
# Move the model to the device chosen earlier ("cuda:0" or "cpu").
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import numpy as np
from datasets import load_metric

# Loading this metric emits a warning that `trust_remote_code=True` will become
# mandatory in the next major `datasets` release, so pass it explicitly.
# NOTE(review): `load_metric` appears to be deprecated in `datasets`; consider
# migrating to the separate `evaluate` package. Confirm before upgrading.
metric = load_metric("accuracy", trust_remote_code=True)
# Accuracy callback: turns raw logits into class ids and scores them against the labels.
def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` on the arg-max class ids and the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [30]:
from transformers import TrainingArguments
# Fine-tuning configuration: one epoch, with an evaluation pass every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=128,
    num_train_epochs=1,
    learning_rate=3e-05,
    eval_strategy="epoch",
)
from transformers import Trainer
# Bundle the model, both dataset splits and the accuracy callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
)

In [31]:
trainer.train()

                                                 
100%|██████████| 125/125 [02:43<00:00,  1.31s/it]

{'eval_loss': 1.4835593700408936, 'eval_accuracy': 0.583, 'eval_runtime': 14.8081, 'eval_samples_per_second': 135.062, 'eval_steps_per_second': 16.883, 'epoch': 1.0}
{'train_runtime': 163.1689, 'train_samples_per_second': 98.058, 'train_steps_per_second': 0.766, 'train_loss': 1.607395751953125, 'epoch': 1.0}





TrainOutput(global_step=125, training_loss=1.607395751953125, metrics={'train_runtime': 163.1689, 'train_samples_per_second': 98.058, 'train_steps_per_second': 0.766, 'total_flos': 59061116928000.0, 'train_loss': 1.607395751953125, 'epoch': 1.0})

In [10]:
trainer.evaluate()

  0%|          | 0/250 [00:00<?, ?it/s]

100%|██████████| 250/250 [00:04<00:00, 56.65it/s]


{'eval_loss': 1.440544605255127,
 'eval_accuracy': 0.59,
 'eval_runtime': 4.4392,
 'eval_samples_per_second': 450.531,
 'eval_steps_per_second': 56.316,
 'epoch': 1.0}

Export PyTorch model to ONNX format for serving with ONNX Runtime Web

In [11]:
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

In [12]:
# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline on the CPU;
# this pipeline object is what is handed to the ONNX converter.
pipeline = transformers.pipeline("text-classification",model=model,tokenizer=tokenizer,device='cpu')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
# Put the model on the CPU before exporting it.
# NOTE(review): the pipeline was already created with device='cpu', so this may be redundant; confirm.
model = model.to("cpu")

In [17]:
# Export the pipeline's PyTorch model to "classifier.onnx" with ONNX opset 14.
# use_external_format=False presumably keeps the weights inside the single .onnx file; confirm.
onnx_convert.convert_pytorch(pipeline, opset=14, output=Path("classifier.onnx"), use_external_format=False)

Using framework PyTorch: 2.4.0+cu118
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']


In [18]:
from onnxruntime.quantization import quantize_dynamic, QuantType
# Dynamically quantize classifier.onnx and write the result to classifier_int8.onnx,
# using unsigned 8-bit integers (QUInt8) as the weight type.
quantize_dynamic("classifier.onnx", "classifier_int8.onnx",
                 weight_type=QuantType.QUInt8)



Evaluate accuracy using ONNX-Runtime inference - validate PyTorch inference versus ONNX-Runtime

In [19]:
import onnxruntime as ort

In [20]:
# One ONNX Runtime session per exported file: the original export ...
session = ort.InferenceSession("classifier.onnx")
# ... and its quantized counterpart, so the two can be scored side by side.
session_int8 = ort.InferenceSession("classifier_int8.onnx")

In [21]:
import numpy as np

In [22]:
# Collect the three tokenizer outputs of the evaluation split as NumPy arrays, keyed by
# the input names reported by the ONNX export.
input_feed = {
    column: np.array(full_eval_dataset[column])
    for column in ("input_ids", "attention_mask", "token_type_ids")
}

In [23]:
import numpy as np

# Cast every input array in `input_feed` to int64 before handing it to the sessions.
input_feed_converted = dict(
    (name, np.array(array, dtype=np.int64)) for name, array in input_feed.items()
)

# Score the evaluation inputs with both models; 'output_0' is the only requested output.
out = session.run(['output_0'], input_feed_converted)[0]
out_int8 = session_int8.run(['output_0'], input_feed_converted)[0]


In [24]:
# out = session.run(input_feed=input_feed_converted,output_names=['output_0'])[0]
# out_int8 = session_int8.run(input_feed=input_feed_converted,output_names=['output_0'])[0]

In [25]:
# Predicted class id is the index of the highest score along the last axis, per model.
predictions, predictions_int8 = (
    np.argmax(scores, axis=-1) for scores in (out, out_int8)
)

In [26]:
metric.compute(predictions=predictions, references=full_eval_dataset['label'])

{'accuracy': 0.59}

In [27]:
metric.compute(predictions=predictions_int8, references=full_eval_dataset['label'])

{'accuracy': 0.575}