## Install all requirements.

In [1]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
You should consider upgrading via the '/4tb_first/kalgin/projects/demo-gpt-j-6B-tensorrt-int8/.venv/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

## Define seq2seq model.
### All important information with TensorRT initialization you can find in `utils/trt_model.py`.

All our engines have 57 inputs and 57 outputs (1+2\*n_attn_layers). First input is the list of tokens ids while other inputs are context. If you want to send empty context you should create empty tensor with the shape -> (1, 16, **0**, 256) for every context input. First output is the logits while other outputs are context. Our engines always return only new values of the context, so before send context to the next step you have to manually concatenate previous context values with new context values (see code snippet below).

**Inputs names and shapes**:

| | Name | Shape
|---:|:-------------|:-----------|
| 1 | `input_ids` | `(1, sequence_length)` |
| 2 | `history_key_0` | `(1, 16, history_length, 256)` |
| 3 | `history_value_0` | `(1, 16, history_length, 256)` |
| 4 | `history_key_1` | `(1, 16, history_length, 256)` |
| 5 | `history_value_1` | `(1, 16, history_length, 256)` |
| ... |
| 56 | `history_key_27` | `(1, 16, history_length, 256)` |
| 57 | `history_value_27` | `(1, 16, history_length, 256)` |

**Outputs names and shapes**:

| | Name | Shape
|---:|:-------------|:-----------|
| 1 | `logits` | `(1, sequance_length, 50400)` |
| 2 | `out_history_key_0` | `(1, 16, sequence_length, 256)` |
| 3 | `out_history_value_0` | `(1, 16, sequence_length, 256)` |
| 4 | `out_history_key_1` | `(1, 16, sequence_length, 256)` |
| 5 | `out_history_value_1` | `(1, 16, sequence_length, 256)` |
| ... |
| 56 | `out_history_key_27` | `(1, 16, sequence_length, 256)` |
| 57 | `out_history_value_27` | `(1, 16, sequence_length, 256)` |


`sequence_length` - dynamic axis which value must be in the following range \[1, 512\]

`history_length` - dynamic axis which value must be in the following range \[0, 512\]

In [1]:
import torch

from pathlib import Path
from typing import Union

from utils.trt_model import TrtModel


class TrtSeq2SeqModel:
    def __init__(self, path_to_engine: Union[Path, str]):
        self._model = TrtModel(str(path_to_engine))

    @property
    def batch_size(self) -> int:
        return self._model.binding_shape('input_ids')[0]

    def generate(self, input_ids: torch.Tensor, generate_len: int, return_logit: bool = False) -> torch.Tensor:
        input_ids = input_ids.contiguous()

        input_tensors = {'input_ids': input_ids}
        for name in self._model.inputs:
            if name.startswith('history') and 'profile' not in name:
                # add empty context for the first run
                input_tensors[name] = torch.empty(
                    size=(self.batch_size, 16, 0, 256),
                    dtype=self._model.binding_dtype(name),
                    device='cuda',
                )

        result = []
        output_tensors = None
        for _ in range(generate_len):
            output_tensors = self._model.run(input_tensors=input_tensors, output_tensors_cache=output_tensors)

            logits = output_tensors['logits']
            next_id = logits[:, -1, :].argmax(dim=-1, keepdims=True).to(torch.int32)
            result.append(logits.clone() if return_logit else next_id)

            # concatenate previous context values with new context values
            input_tensors['input_ids'] = next_id
            for name, new_value in output_tensors.items():
                if name.startswith('out_history_') and 'profile' not in name:
                    name = name[4:]
                    input_tensors[name] = torch.cat((input_tensors[name], new_value), dim=-2)

            model._model.context.set_optimization_profile_async(1)

        dim = -2 if return_logit else -1
        result = torch.cat(result, dim=dim)

        return result

  from .autonotebook import tqdm as notebook_tqdm


## Download prebuilt engine and initialize seq2seq model.

**Currently you can find prebuilt engines only for next GPUs:**
* RTX 2080 Ti
* RTX 3080 Ti
* RTX 4090

**ONNX model and build script will be published later.**

In [2]:
from utils.engine import get_engine

path_to_engine = '/4tb_first/kalgin/projects/gptj_e2e_example/out_single_onnx/2prof_fq_gptj_i8f32.engine' #get_engine()
model = TrtSeq2SeqModel(path_to_engine=path_to_engine)

[03/31/2023-04:47:52] [TRT] [I] Loaded engine size: 11658 MiB
[03/31/2023-04:47:55] [TRT] [V] Trying to load shared library libcudnn.so.8
[03/31/2023-04:47:55] [TRT] [V] Loaded shared library libcudnn.so.8
[03/31/2023-04:47:55] [TRT] [V] Using cuDNN as plugin tactic source
[03/31/2023-04:47:56] [TRT] [V] Using cuDNN as core library tactic source
[03/31/2023-04:47:56] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +1593, GPU +428, now: CPU 16489, GPU 12658 (MiB)
[03/31/2023-04:47:56] [TRT] [W] TensorRT was linked against cuDNN 8.6.0 but loaded cuDNN 8.3.2
[03/31/2023-04:47:56] [TRT] [V] Deserialization required 3605501 microseconds.
[03/31/2023-04:47:56] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +15, GPU +11619, now: CPU 15, GPU 11619 (MiB)
[03/31/2023-04:47:56] [TRT] [V] Trying to load shared library libcudnn.so.8
[03/31/2023-04:47:56] [TRT] [V] Loaded shared library libcudnn.so.8
[03/31/2023-04:47:56] [TRT] [V] Using cuDNN as plugin tactic sourc

## Seq2seq example.

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6B')

input_text = 'Hello world!'

input_ids = tokenizer(input_text, return_tensors='pt')
input_ids = input_ids['input_ids'].to(device='cuda', dtype=torch.int32)
generated_ids = model.generate(input_ids, generate_len=100)
(generated_ids,) = generated_ids
generated_text = tokenizer.decode(generated_ids)

print(input_text + generated_text)

TypeError: set_optimization_profile_async(): incompatible function arguments. The following argument types are supported:
    1. (self: tensorrt.tensorrt.IExecutionContext, profile_index: int, stream_handle: int) -> bool

Invoked with: <tensorrt.tensorrt.IExecutionContext object at 0x7fe0fe7cf530>, 1

## Accuracy validation.

In [5]:
from utils.test import test_acc


def predict_next_id(input_ids: torch.Tensor) -> torch.Tensor:
    input_ids = input_ids.to(device='cuda', dtype=torch.int32)
    result = model.generate(input_ids, generate_len=1)
    result = result.detach().cpu()
    
    return result


test_acc(predict_next_id, verbose=True);

Found cached dataset lambada (/home/igor_kalgin/.cache/huggingface/datasets/lambada/plain_text/1.1.0/9f7bada20233bfec7d1d888d179c81442d504fb3d0dd97cddeba020b19924373)
Loading cached shuffled indices for dataset at /home/igor_kalgin/.cache/huggingface/datasets/lambada/plain_text/1.1.0/9f7bada20233bfec7d1d888d179c81442d504fb3d0dd97cddeba020b19924373/cache-9e3acb0e95629ff1.arrow
Loading cached processed dataset at /home/igor_kalgin/.cache/huggingface/datasets/lambada/plain_text/1.1.0/9f7bada20233bfec7d1d888d179c81442d504fb3d0dd97cddeba020b19924373/cache-998a74ae77022345.arrow
Acc = 78.85%: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4869/4869 [02:37<00:00, 30.89it/s]


Final acc = 78.85%


In [1]:
from transformers import AutoModelForCausalLM
from utils.test import test_acc
import torch

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path='EleutherAI/gpt-j-6B',
    revision='main',
    low_cpu_mem_usage=True,
).eval()
model.config.pad_token_id = model.config.eos_token_id
model = model.to('cuda') #, dtype=torch.float16)

def predict_next_id(input_ids: torch.Tensor) -> torch.Tensor:
    max_length = input_ids.shape[1] + 1
    with torch.no_grad():
        output_ids = model.generate(input_ids.cuda(), max_length=max_length, do_sample=False)
        (output_ids,) = output_ids
        
    return output_ids[-1].detach().cpu()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
test_acc(predict_next_id, verbose=True)

Found cached dataset lambada (/home/igor_kalgin/.cache/huggingface/datasets/lambada/plain_text/1.1.0/9f7bada20233bfec7d1d888d179c81442d504fb3d0dd97cddeba020b19924373)
Loading cached shuffled indices for dataset at /home/igor_kalgin/.cache/huggingface/datasets/lambada/plain_text/1.1.0/9f7bada20233bfec7d1d888d179c81442d504fb3d0dd97cddeba020b19924373/cache-9e3acb0e95629ff1.arrow
Loading cached processed dataset at /home/igor_kalgin/.cache/huggingface/datasets/lambada/plain_text/1.1.0/9f7bada20233bfec7d1d888d179c81442d504fb3d0dd97cddeba020b19924373/cache-998a74ae77022345.arrow
  0%|                                                                                                                                                                            | 0/4869 [00:01<?, ?it/s]


RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
# 79.17