In [1]:
! pip install tokenizer sentencepiece
! nvidia-smi

[0mWed Mar 15 10:10:01 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A10G         On   | 00000000:00:1E.0 Off |                    0 |
|  0%   28C    P0    42W / 300W |      0MiB / 22731MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Pr

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import time
import torch._dynamo as torchdynamo
import torch
from kernl.model_optimization import optimize_model

In [3]:
# Generative models produce many distinct graphs, so the default TorchDynamo
# cache size is raised to leave room for all of them.
torchdynamo.config.cache_size_limit = 512

model_name = "t5-small"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq checkpoint, switch it to eval mode and move it to the GPU.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).eval().cuda()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
# Encode the translation prompt as PyTorch tensors, padded so the sequence length
# is a multiple of 8, and move the whole encoding to the GPU.
# Note: `input_ids` holds the full encoding; the token ids themselves are read
# further down as input_ids["input_ids"].
prompt = "translate English to French: The house in the woods is wonderful, can we buy it ?"
input_ids = tokenizer(prompt, padding=True, pad_to_multiple_of=8, return_tensors="pt").to("cuda")

In [5]:
# Baseline latency of the unoptimized model: 10 untimed generations first, then a
# single timed generation bracketed by CUDA synchronizations.
# min_length and max_length are both 22 for every call.
baseline_generate_kwargs = {
    "inputs": input_ids["input_ids"],
    "min_length": 22,
    "max_length": 22,
}

with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=True):
    # Untimed warmup runs.
    for _warmup_step in range(10):
        output = model.generate(**baseline_generate_kwargs)

    # Drain pending GPU work so it is not attributed to the timed call.
    torch.cuda.synchronize()
    t_begin = time.perf_counter()
    output = model.generate(**baseline_generate_kwargs)
    # Wait for the GPU to finish before reading the clock.
    torch.cuda.synchronize()
    latency_baseline = time.perf_counter() - t_begin

    print(latency_baseline)
    decoded_baseline = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(decoded_baseline)

0.1443374139998923
La maison dans les bois est merveilleuse, pouvons-nous l'acheter? 


In [6]:
# Run kernl's optimize_model on the encoder first, then on the decoder.
# The return values are not used.
# NOTE(review): presumably optimize_model patches the module in place, so
# re-running this cell may wrap the modules a second time - confirm.
for submodule in (model.encoder, model.decoder):
    optimize_model(submodule)

In [7]:
# warmup (IRL, encoder and decoder should be warmed each on their own)
# First generation after optimize_model; the recorded run of this cell took
# roughly 233 s, so this one call dominates the notebook's runtime.
with torch.inference_mode(), torch.autocast(dtype=torch.bfloat16, cache_enabled=True, device_type="cuda"):
    # Drain any pending GPU work so it is not counted in the measurement.
    torch.cuda.synchronize()
    start = time.perf_counter()
    model.generate(inputs=input_ids["input_ids"], min_length=22, max_length=22)
    # CUDA work is queued asynchronously: wait for the GPU to finish before
    # reading the clock, as the other timing cells in this notebook do.
    torch.cuda.synchronize()
    print(time.perf_counter() - start)

  super(CUDAGraph, self).capture_end()


233.45941423799923


In [8]:
# Latency of the optimized model, measured like the baseline: 10 untimed
# generations, then one timed generation bracketed by CUDA synchronizations.
# Requires `latency_baseline` from the baseline benchmark cell.
optimized_generate_kwargs = {
    "inputs": input_ids["input_ids"],
    "min_length": 22,
    "max_length": 22,
}

with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=True):
    # Untimed warmup runs; their results are discarded.
    for _warmup_step in range(10):
        model.generate(**optimized_generate_kwargs)

    # Drain pending GPU work so it is not attributed to the timed call.
    torch.cuda.synchronize()
    t_begin = time.perf_counter()
    output = model.generate(**optimized_generate_kwargs)
    # Wait for the GPU to finish before reading the clock.
    torch.cuda.synchronize()
    latency_optimized = time.perf_counter() - t_begin

    print(latency_optimized)
    # Ratio of baseline latency to optimized latency.
    print(f"{latency_baseline/latency_optimized:.1f}x speedup")
    decoded_optimized = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(decoded_optimized)

0.05253051799991226
2.7x speedup
La maison dans les bois est merveilleuse, pouvons-nous l'acheter? 
