In [None]:
import torch
import torch.utils.benchmark as benchmark
import transformers

# Load two independent copies of the same checkpoint: one is left where
# from_pretrained puts it, the other is moved to the GPU below.
t5_cpu = transformers.T5ForConditionalGeneration.from_pretrained(
    "cointegrated/rut5-base-multitask",
    resume_download=True,
)

t5_cuda = transformers.T5ForConditionalGeneration.from_pretrained(
    "cointegrated/rut5-base-multitask",
    resume_download=True,
)
# Requires a CUDA-capable device; this call raises if none is available.
t5_cuda = t5_cuda.cuda()

In [None]:
# Random token ids used to smoke-test every backend: a batch of one sequence
# with 32 tokens, ids drawn uniformly from [0, 30000).
# NOTE(review): 30000 is presumably the checkpoint's vocabulary size — confirm
# against the model config.
input_cpu = torch.randint(high=30000, size=(1, 32), dtype=torch.int64)
# Copy the *same* ids to the GPU instead of drawing a second random tensor, so
# the CPU and CUDA backends are exercised on identical input.
input_cuda = input_cpu.cuda()

# Smoke test: make sure generation runs on both devices. Only the last
# expression's result is displayed by the notebook.
t5_cpu.generate(input_cpu, do_sample=True, num_beams=4, max_new_tokens=20)
t5_cuda.generate(input_cuda, do_sample=True, num_beams=4, max_new_tokens=20)

In [None]:
from optimum.onnxruntime import ORTModelForSeq2SeqLM


# Build ONNX Runtime wrappers for the same checkpoint, one per execution
# provider.
# NOTE(review): from_transformers=True presumably triggers an ONNX export of
# the PyTorch checkpoint — confirm against the installed optimum version.
t5_ort_cpu = ORTModelForSeq2SeqLM.from_pretrained(
    "cointegrated/rut5-base-multitask",
    from_transformers=True,
    provider="CPUExecutionProvider",
)
t5_ort_cuda = ORTModelForSeq2SeqLM.from_pretrained(
    "cointegrated/rut5-base-multitask",
    from_transformers=True,
    provider="CUDAExecutionProvider",
)

# Smoke test both wrappers with the inputs created in an earlier cell.
t5_ort_cpu.generate(input_cpu, do_sample=True, num_beams=4, max_new_tokens=20)
t5_ort_cuda.generate(input_cuda, do_sample=True, num_beams=4, max_new_tokens=20)

In [None]:
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# Same checkpoint again, this time requesting the TensorRT execution provider.
t5_ort_trt = ORTModelForSeq2SeqLM.from_pretrained(
    "cointegrated/rut5-base-multitask",
    from_transformers=True,
    provider="TensorrtExecutionProvider",
)

# Smoke test; the TensorRT-backed model is fed the CUDA-resident input.
t5_ort_trt.generate(input_cuda, do_sample=True, num_beams=4, max_new_tokens=20)

In [None]:
from tqdm.auto import tqdm

# Benchmark `generate` for every backend at several input lengths and print a
# comparison table (rows: seq_len, columns: backend).

# (model, column title, needs CUDA-resident input). The list is loop-invariant,
# so it is built once; being a list (unlike a bare zip) it has a length, which
# lets tqdm show a total and an ETA for the inner loop. The explicit device
# flag replaces substring matching on the display title.
backends = [
    (t5_cpu, "PyTorch (CPU)", False),
    (t5_cuda, "PyTorch (CUDA)", True),
    (t5_ort_cpu, "ORT (CPU)", False),
    (t5_ort_cuda, "ORT (CUDA)", True),
    (t5_ort_trt, "ORT-TRT", True),
]

results = []
for seq_len in tqdm([2, 8, 32, 64], desc="seq_len"):
    # Draw the random token ids once per length and copy them to the GPU, so
    # every backend is timed on exactly the same input.
    input_cpu = torch.randint(high=30000, size=(1, seq_len), dtype=torch.int64)
    input_cuda = input_cpu.cuda()
    for model, description, needs_cuda in tqdm(backends, desc="backend"):
        input_tensor = input_cuda if needs_cuda else input_cpu
        # Untimed first call so one-off initialisation cost is not measured.
        model.generate(input_tensor, do_sample=True, num_beams=4, max_new_tokens=20)  # warmup
        # NOTE(review): do_sample=True makes the timed workload stochastic;
        # consider deterministic decoding if lower-variance timings are needed.
        timer = benchmark.Timer(
            stmt="model.generate(input_tensor, do_sample=True, num_beams=4, max_new_tokens=20)",
            globals={'model': model, 'input_tensor': input_tensor},
            num_threads=8,
            label="label",  # NOTE(review): placeholder table header — consider a descriptive title
            sub_label=f"seq_len={seq_len}",
            description=description,
        )
        # Collect measurements for at least 10 seconds per (backend, seq_len).
        results.append(timer.blocked_autorange(min_run_time=10))

compare = benchmark.Compare(results)
compare.colorize()
compare.print()