## HuggingFace to ONNX

<img src= "https://frenzy86.s3.eu-west-2.amazonaws.com/python/huggingonnx.png" width=1000>


reference to this article:
https://www.philschmid.de/optimizing-transformers-with-optimum

In [1]:
!pip install transformers[onnx] torch -q
!pip install onnx -q
!pip install accelerate -U -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.5/84.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.8/455.8 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import warnings
warnings.filterwarnings('ignore')

transformers.onnx enables you to convert model checkpoints to an ONNX graph by leveraging configuration objects. That way you don’t have to provide the complex configuration for dynamic_axes etc.

## From PyTorch--(complicated)

In [3]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# load model and tokenizer
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
dummy_model_input = tokenizer("This is a sample", return_tensors="pt")

# export
torch.onnx.export(
                    model,
                    tuple(dummy_model_input.values()),
                    f="torch-model.onnx",
                    input_names=['input_ids', 'attention_mask'],
                    output_names=['logits'],
                    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'},
                                'attention_mask': {0: 'batch_size', 1: 'sequence'},
                                'logits': {0: 'batch_size', 1: 'sequence'}},
                    do_constant_folding=True,
                    opset_version=13,
                    )


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

## Exporting our checkpoint with the transformers.onnx (all in one with tokenizer)

In [4]:
from pathlib import Path
import transformers
from transformers.onnx import FeaturesManager
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

# load model and tokenizer
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# load config
model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(model)
onnx_config = model_onnx_config(model.config)

# export
transformers.onnx.export(
                        preprocessor=tokenizer,
                        model=model,
                        config=onnx_config,
                        opset=13,
                        output=Path("modelwithtokenizer1.onnx")
                        )

(['input_ids', 'attention_mask'], ['last_hidden_state'])

## mini inference

In [6]:
import onnxruntime
import numpy as np

onnx_model_path = "modelwithtokenizer1.onnx"
session = onnxruntime.InferenceSession(onnx_model_path)

text = "This is an example sentence for inference."

# Tokenize input text
inputs = tokenizer(text, return_tensors="pt")
# Get model inputs
inputs_onnx = {k: v.numpy() for k, v in inputs.items()}
outputs_onnx = session.run(None, inputs_onnx)
print(np.argmax(outputs_onnx[0]))

1


# Export with Optimum (transformer pipeline)
Optimum Inference includes methods to convert vanilla Transformers models to ONNX using the ORTModelForXxx classes. To convert your Transformers model to ONNX you simply have to pass from_transformers=True to the from_pretrained() method and your model will be loaded and converted to ONNX leveraging the transformers.onnx package under the hood.

In [7]:
!pip install optimum[onnxruntime] -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/471.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m471.0/471.6 kB[0m [31m19.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/453.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m453.7/453.7 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

The best part about the conversion with Optimum is that you can immediately use the model to run predictions or load it inside a pipeline.

In [8]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForSequenceClassification.from_pretrained(
                                                           model_id, # onnx checkpoint
                                                           export=True
                                                           )

# save onnx checkpoint and tokenizer
onnx_path = Path("test")
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

Framework not specified. Using pt to export the model.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.

***** Exporting submodel 1/1: DistilBertForSequenceClassification *****
Using framework PyTorch: 2.4.1+cu121


('test/tokenizer_config.json',
 'test/special_tokens_map.json',
 'test/vocab.txt',
 'test/added_tokens.json',
 'test/tokenizer.json')

In [10]:
from transformers import pipeline

vanilla_clf = pipeline("text-classification", model=model, tokenizer=tokenizer)
vanilla_clf("Could you assist me in finding my lost card?")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'NEGATIVE', 'score': 0.9986515641212463}]

<img src= "https://frenzy86.s3.eu-west-2.amazonaws.com/python/safetensor.png" width=1000>

# Safetensors: a simple, safe and faster way to store and distribute tensors.

Safetensors is a new simple format for storing tensors safely (as opposed to pickle) and that is still fast (zero-copy). Safetensors is really fast

safetensors and ONNX serve different purposessafetensors is a simple, safe, and fast file format for storing and loading tensors. It is a secure alternative to Python’s pickle utility, which is not secure and may contain malicious code that can be executed.

In summary, safetensors is used for storing and loading tensors in a safe and fast way, while ONNX is used for sharing models between different deep learning frameworks. Same applies for other model sharing frameworks.

https://medium.com/@mandalsouvik/safetensors-a-simple-and-safe-way-to-store-and-distribute-tensors-d9ba1931ba04

In [None]:
import torch
from safetensors.torch import save_file

tensors = {
            "embedding": torch.zeros((2, 2)),
            "attention": torch.zeros((2, 3))
            }
save_file(tensors, "model.safetensors")

In [None]:
from safetensors import safe_open

tensors = {}
with safe_open("model.safetensors", framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k) # loads the full tensor given a key
print(tensors)

{'attention': tensor([[0., 0., 0.],
        [0., 0., 0.]], device='cuda:0'), 'embedding': tensor([[0., 0.],
        [0., 0.]], device='cuda:0')}


## Lazy loading

Lazy loading is the ability to load only some tensors, or part of tensors for a given file. This is possible with safetensors.

Lazy loading is really important in cases when we have a large file containing many key and value pairs. This can be a metadata cache for large dataset. If we can load the value for single keys indivisually it will be memory efficient and faster else we will have to load the full file into memory to inspect any of the key.

In [None]:
#do not run
# from safetensors.torch import load_model, save_model

# save_model(model, "model1.safetensors")
# # Instead of save_file(model.state_dict(), "model.safetensors")

# load_model(model, "model1.safetensors")
# # Instead of model.load_state_dict(load_file("model.safetensors"))

##PyTorch Operations
Load a model in PyTorch.



In [None]:
from torchvision.models import resnet18

model_pt = resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 162MB/s]


Save the state_dict to safetensor, and load them back to a new model.



In [None]:
from safetensors.torch import load_model, save_model

# save the state dict
save_model(model_pt, "resnet18.safetensors")

# load the model without weights
model_st = resnet18(pretrained=False)
load_model(model_st, "resnet18.safetensors")

(set(), [])