In [46]:
import math
import os
import time
from timeit import default_timer as timer

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as fn
import torch.optim as optim
import torchtext
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
from torchtext.vocab import build_vocab_from_iterator

from pos_encoding import PositionalEmbedding

In [47]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [48]:
# Record the installed Docker version (the commented-out cells in this notebook pull and run the Triton image with Docker).
!docker -v

Docker version 20.10.24, build 297e128


In [49]:
# Record the Python interpreter version of this environment.
!python --version

Python 3.9.16


In [50]:
# Record the conda version of this environment.
!conda --version

conda 22.11.1


In [None]:
# !docker pull nvcr.io/nvidia/tritonserver:23.01-py3

In [51]:
class Encoder(nn.Module):
    """Transformer-encoder sequence classifier.

    Token indices are embedded, passed through ``PositionalEmbedding`` and a
    stack of ``nn.TransformerEncoderLayer`` blocks, averaged over dimension 0
    and projected to ``num_classes`` logits.

    Args:
        vocab_size: number of rows of the token embedding table.
        embed_size: embedding width, also used as ``d_model``; must be a
            multiple of ``num_heads``.
        num_heads: attention heads in every encoder layer.
        num_layers: number of stacked encoder layers.
        dim_ffd: hidden width of each layer's feed-forward sub-network.
        num_classes: number of output logits.
        dropout: dropout probability handed to the encoder layers.

    Raises:
        AssertionError: if ``embed_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, dim_ffd, num_classes, dropout=0.1):
        super().__init__()

        # Keep the hyper-parameters around as plain attributes.
        self.vocab_size, self.embed_size = vocab_size, embed_size
        self.num_heads, self.num_layers = num_heads, num_layers
        self.dim_ffd, self.num_classes = dim_ffd, num_classes
        self.dropout = dropout

        # Token lookup followed by the project's positional embedding module.
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_embed = PositionalEmbedding(max_len=2000, embed_size=embed_size)

        assert embed_size % num_heads == 0, "number of heads must divide evenly into embedding size"

        # The prototype layer stays registered as ``self.layer`` so the
        # module tree (and the ``layer.*`` state-dict keys) do not change.
        self.layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=dim_ffd,
            dropout=dropout,
        )
        self.encoder = nn.TransformerEncoder(self.layer, num_layers=num_layers)

        # Classification head on top of the pooled representation.
        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, x):
        """Return class logits for a tensor of token indices.

        The encoder layers are built with the default ``batch_first=False``,
        so dimension 0 is treated as the sequence axis and is the one that is
        averaged away -- presumably ``x`` is (seq_len, batch); confirm
        against the caller.
        """
        embedded = self.embedding(x)
        encoded = self.encoder(self.positional_embed(embedded))
        pooled = encoded.mean(dim=0)
        return self.fc(pooled)

In [4]:
# Restore the saved vocabulary object. torch.load unpickles the file, so only
# load vocab.pth from a trusted source.
vocab = torch.load('vocab.pth')

In [5]:
# Special-token ids -- presumably the vocabulary indices of the padding and
# unknown tokens; confirm against how `vocab` was built.
PAD = 0
UNK = 1

# Architecture settings handed to Encoder when the model is built.
VOCAB_SIZE = len(vocab)   # one embedding row per vocabulary entry
EMB_SIZE = 512            # embedding width / d_model
N_HEADS = 8               # attention heads per layer
FFN_HID_DIM = 512         # feed-forward hidden width
NUM_ENCODER_LAYERS = 6    # stacked encoder layers
NUM_CLASSES = 4           # output logits

# Not referenced by the cells shown here -- presumably carried over from the
# training notebook.
BATCH_SIZE = 64
EPOCHS = 10

In [55]:
# Build the classifier with the notebook's hyper-parameters. It is left on the
# CPU here; no device move happens in this cell.
model = Encoder(
    vocab_size=VOCAB_SIZE,
    embed_size=EMB_SIZE,
    num_heads=N_HEADS,
    num_layers=NUM_ENCODER_LAYERS,
    dim_ffd=FFN_HID_DIM,
    num_classes=NUM_CLASSES,
)

# The loss targets are class labels in [0, NUM_CLASSES), not token ids.
# Passing ignore_index=PAD (PAD == 0) would silently exclude every sample whose
# label is class 0 from the loss, so no ignore_index is set.
loss_fn = nn.CrossEntropyLoss()

# Adam over all model parameters; its state is restored from the checkpoint
# by load_checkpoint.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [None]:
# Create the directory that will hold version 1 of the "transformer-encoder"
# model (layout: models/<model-name>/<version>/). exist_ok=True makes the cell
# safe to re-run.
import os
directory = 'models/transformer-encoder/1'
os.makedirs(directory, exist_ok=True)

In [56]:
def load_checkpoint(checkpoint):
    """Restore the notebook-level ``model`` and ``optimizer`` from a checkpoint.

    Args:
        checkpoint: mapping whose ``'state_dict'`` entry holds the model
            weights and whose ``'optimizer'`` entry holds the optimizer state.
    """
    print("Loading checkpoint.....")

    # Weights first, optimizer state second.
    model_state = checkpoint['state_dict']
    model.load_state_dict(model_state)

    optimizer_state = checkpoint['optimizer']
    optimizer.load_state_dict(optimizer_state)

In [57]:
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint that was saved from a GPU be restored on
# a machine without CUDA; the model is still on the CPU at this point.
load_checkpoint(torch.load('my_model.pth.tar', map_location='cpu'))

Loading checkpoint.....


In [58]:
# Show the module tree of the restored model as a sanity check.
print(model)

Encoder(
  (embedding): Embedding(110934, 512)
  (positional_embed): PositionalEmbedding()
  (layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=512, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
     

In [59]:
# Switch to inference mode and move the model to the device selected in DEVICE
# (GPU when available, otherwise CPU) instead of unconditionally calling
# .cuda(), which raises on hosts without CUDA.
model = model.eval().to(DEVICE)

In [None]:
# Keep the original pickled copy of the module in the working directory.
torch.save(model, 'model.pt')

# Triton's PyTorch backend serves a TorchScript file named model.pt located in
# the numbered version directory (<repository>/<model>/<version>/model.pt); a
# pickled nn.Module in the working directory is never picked up.
# NOTE(review): if PositionalEmbedding turns out not to be scriptable, export
# with torch.jit.trace and an example int64 token tensor instead.
os.makedirs(directory, exist_ok=True)
scripted_model = torch.jit.script(model)
torch.jit.save(scripted_model, os.path.join(directory, 'model.pt'))

In [36]:
# Triton model configuration (protobuf text format) for the exported encoder.
#   * platform: the PyTorch/TorchScript backend is selected with
#     "pytorch_libtorch"; plain "pytorch" is not a valid platform value.
#   * tensor names follow the backend's <name>__<index> convention
#     (double underscore).
#   * the input carries token ids that feed nn.Embedding, so it must be an
#     integer tensor (TYPE_INT64), with a variable sequence length (-1).
#   * the output has one logit per class (4).
# NOTE(review): with max_batch_size > 0 Triton prepends the batch dimension, so
# requests arrive as (batch, seq_len), whereas the encoder layers are built with
# batch_first=False and forward() averages over dim 0 -- confirm the layout or
# transpose inside the exported model.
config = """
    name: "transformer-encoder"
    platform: "pytorch_libtorch"
    max_batch_size: 32
    input [
        {
            name: "input__0"
            data_type: TYPE_INT64
            dims: [ -1 ]
        }
    ]
    output [
        {
            name: "output__0"
            data_type: TYPE_FP32
            dims: [ 4 ]
        }
    ]
"""

In [37]:
# Write the configuration next to the version directory:
# models/transformer-encoder/config.pbtxt. The parent directory is created
# first so this cell also works when the directory-creation cell was skipped.
config_path = 'models/transformer-encoder/config.pbtxt'
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w') as file:
    file.write(config)

In [None]:
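# Start the Triton server. NOTE: the host path after -v is a placeholder; replace it with the
# absolute path of this notebook's `models` directory (the one containing transformer-encoder/),
# otherwise the server starts without the model and reports it as unknown.
# !docker run --gpus=1 --rm -p8000:8000 -p8001:8001 -p8002:8002 -v/full/path/to/docs/examples/model_repository:/models nvcr.io/nvidia/tritonserver:23.01-py3 tritonserver --model-repository=/models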

In [38]:
# Readiness probe against the server's HTTP port (8000); an HTTP 200 reply means the server is up.
!curl -v localhost:8000/v2/health/ready

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0*   Trying 127.0.0.1:8000...
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1

> Host: localhost:8000

> User-Agent: curl/8.0.1

> Accept: */*

> 

< HTTP/1.1 200 OK

< Content-Length: 0

< Content-Type: text/plain

< 


  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
* Connection #0 to host localhost left intact


In [39]:
# Ask the server for the metadata of "transformer-encoder". An HTTP 400 "unknown model" reply
# means the server did not load this model from its model repository.
!curl -v localhost:8000/v2/models/transformer-encoder

{"error":"Request for unknown model: 'transformer-encoder' is not found"}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0*   Trying 127.0.0.1:8000...
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/models/transformer-encoder HTTP/1.1

> Host: localhost:8000

> User-Agent: curl/8.0.1

> Accept: */*

> 

< HTTP/1.1 400 Bad Request

< Content-Type: application/json

< Content-Length: 73

< 

{ [73 bytes data]

100    73  100    73    0     0  24737      0 --:--:-- --:--:-- --:--:-- 36500
* Connection #0 to host localhost left intact
