# Frameworks

Links:
- https://huggingface.co/docs/transformers/model_doc/t5
- https://huggingface.co/docs/transformers/serialization#onnx
- https://github.com/NVIDIA/TensorRT/tree/main/demo/HuggingFace


In [None]:
# Shell escape (not Python): print nvidia-smi's report of the GPUs on this machine.
!nvidia-smi

In [None]:
# Environment switches for the tokenizers library and the CUDA runtime.
# They are set in their own cell, ahead of the cell that imports torch and
# transformers.
import os

os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_MODULE_LOADING": "LAZY",
    }
)

In [None]:
# Helpers
import subprocess as sp
import os
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, set_seed

# Seed first so repeated runs of the notebook start from the same RNG state.
set_seed(1234)

# The prompt that is fed to every backend in the cells below.
input_string = (
    "translate english to french: "
    "where can I find the bathroom?"
)

# Tokenizer for the "t5-small" checkpoint, reused by the later cells.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [None]:
# PyTorch
with torch.no_grad():
    pytorch_model = T5ForConditionalGeneration.from_pretrained("t5-small").to("cuda")
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    pytorch_tokens = pytorch_model.generate(input_ids, max_new_tokens=16)
    pytorch_str = tokenizer.decode(pytorch_tokens[0], skip_special_tokens=True)
    print(f"Pytorch output: {pytorch_str}")
    
    %timeit pytorch_model.generate(input_ids)

# cleanup
del pytorch_model
del pytorch_tokens
del pytorch_str
torch.cuda.empty_cache()

In [None]:
# Shell escape (not Python): export the "t5-small" checkpoint to ONNX with the
# Optimum CLI. The output directory t5_small_onnx/ is what the ONNX cell loads.
!optimum-cli export onnx --model=t5-small t5_small_onnx/

In [None]:
from optimum.onnxruntime import ORTModelForSeq2SeqLM

onnx_model = ORTModelForSeq2SeqLM.from_pretrained("./t5_small_onnx", provider="CUDAExecutionProvider")

inputs = tokenizer(input_string, return_tensors="pt").to("cuda")
onnx_tokens = onnx_model.generate(**inputs)
onnx_str = tokenizer.decode(onnx_tokens[0], skip_special_tokens=True)
print(f"ONNX output: {onnx_str}")

%timeit -n 25 onnx_model.generate(**inputs)

# cleanup
# del onnx_model
# print_used_memory() # ensure none remains

For more detail, see NVIDIA's TensorRT demo notebook for T5: https://github.com/NVIDIA/TensorRT/blob/main/demo/HuggingFace/notebooks/t5.ipynb