In [1]:
! pip install tokenizer sentencepiece
! nvidia-smi

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Wed Oct 12 09:00:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.85.02    Driver Version: 510.85.02    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:03:00.0  On |                  N/A |
| 36%   44C    P5    45W / 350W |    313MiB / 24576MiB |     29%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import time
import torchdynamo
import torch
from typing import List
from kernl.optimizer.dynamo_backend import dynamo_backend_ofi
from kernl.implementations.cuda_graph import cuda_graphs_wrapper

In [3]:
# default cache size needs to be increased to store the many graphs with generative models
torchdynamo.config.cache_size_limit = 512

model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
input_ids = tokenizer(
    "Translate in English: c'est beaucoup plus rapide avec ces optimisations !",
    return_tensors="pt",
    pad_to_multiple_of=8,
    padding=True,
).to("cuda")

In [5]:
with torch.inference_mode(), torch.autocast(dtype=torch.float16, cache_enabled=True, device_type="cuda"):
    for _ in range(3):
        output = model.generate(
            inputs=input_ids["input_ids"],
            min_length=100,
            max_length=100,
        )
    start = time.time()
    output = model.generate(
        inputs=input_ids["input_ids"],
        min_length=100,
        max_length=100,
    )
    print(time.time() - start)
    print(tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))

0.7469151020050049
c'est beaucoup plus rapide avec ces optimisations!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 


In [6]:
from torchdynamo.optimizations import BACKENDS

model.encoder.forward2 = model.encoder.forward
model.decoder.forward2 = model.decoder.forward


def compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
    compiled = BACKENDS["cudagraphs"](gm, example_inputs)
    return compiled


def run_encoder(*args, **kwargs):
    with torchdynamo.optimize(compiler):
        return model.encoder.forward2(*args, **kwargs)


def run_decoder(*args, **kwargs):
    with torchdynamo.optimize(compiler):
        return model.decoder.forward2(*args, **kwargs)

In [7]:
with torch.inference_mode(), torch.autocast(dtype=torch.float16, cache_enabled=True, device_type="cuda"):
    out1 = run_decoder(**input_ids)
    out2 = model.decoder(**input_ids)
    assert torch.allclose(out1.last_hidden_state, out2.last_hidden_state, atol=1e-1)
    out3 = run_encoder(**input_ids)
    out4 = model.encoder(**input_ids)
    assert torch.allclose(out3.last_hidden_state, out4.last_hidden_state, atol=1e-1)

In [8]:
model.encoder.forward = run_encoder
model.decoder.forward = run_decoder

In [9]:
# warmup
with torch.inference_mode(), torch.autocast(dtype=torch.float16, cache_enabled=True, device_type="cuda"):
    start = time.time()
    model.generate(
        inputs=input_ids["input_ids"],
        min_length=100,
        max_length=100,
    )
    print(time.time() - start)

341.7478229999542


In [10]:
with torch.inference_mode(), torch.autocast(dtype=torch.float16, cache_enabled=True, device_type="cuda"):
    start = time.time()
    output = model.generate(
        inputs=input_ids["input_ids"],
        min_length=100,
        max_length=100,
    )
    print(time.time() - start)
    print(tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))

0.178056001663208
c'est beaucoup plus rapide avec ces optimisations!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
