In [1]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from optimum.pipelines import pipeline

# Load data

Firstly, we load data to test our models

In [2]:
from datasets import load_dataset

dataset = load_dataset('samsum')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [3]:
dataset['validation'].shuffle()
dataset['validation'] = dataset['validation'].shard(8, 0)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 103
    })
})

# Utils

Here we have functions to measure the running time of model

In [4]:
import time
def model_time_evaluation(model, tokenizer, data):
    start_time = time.perf_counter()
    for text in data:
        inputs = tokenizer(text=text['dialogue'], text_target=text['summary'], return_tensors="pt", max_length=512, padding='max_length',
                           truncation=True)
        inputs['decoder_input_ids'] = inputs['labels']
        del inputs['labels']

        outputs = model(**inputs)
    end_time = time.perf_counter()
    return end_time - start_time

# Simple ONNX

Converting `google/pegasus-xsum` transformer to onnx format using `optimum-cli` command and save it to `./model/pegasus/`

In [5]:
!optimum-cli export onnx --model google/pegasus-xsum --task summarization ./model/pegasus/

Framework not specified. Using pt to export to ONNX.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The task `text2text-generation` was manually specified, and past key values will not be reused in the decoding. if needed, please pass `--task text2text-generation-with-past` to export using the past key values.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Using framework PyTorch: 2.1.0
Overriding 1 configuration item(s)
	- use_cache -> False
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
Using frame

Then load onnx model and its tokenizer to make some tests.

In [6]:
tokenizer = AutoTokenizer.from_pretrained("./model/pegasus/")
model = ORTModelForSeq2SeqLM.from_pretrained("./model/pegasus/", use_cache=False)

## Forward propagation test

Firstly we measure how much time is needed for forward propagation for ONNX pegasus model with max-size input.

In [7]:
model_time_evaluation(model, tokenizer, dataset['validation'])

148.8189069

As we see above, it takes 160 seconds for ONNX model to run 103 inputs.

## Pipeline tests

Now we build the summarization pipeline to test how much time it take to generate a summary for the given text:

In [8]:
pipe = pipeline('summarization', model=model, tokenizer=tokenizer)

In [9]:
len(tokenizer.encode(dataset['validation']['dialogue'][0]))

244

In [10]:
start_time = time.perf_counter()
print(pipe(dataset['validation']['dialogue'][0]))
end_time = time.perf_counter()
end_time - start_time

[{'summary_text': 'The following is a transcript of a phone call between Tom and his son, Lemmy.'}]


15.870781799999804

In [11]:
len(tokenizer.encode(dataset['validation']['dialogue'][1]))

154

In [12]:
start_time = time.perf_counter()
print(pipe(dataset['validation']['dialogue'][1]))
end_time = time.perf_counter()
end_time - start_time

[{'summary_text': 'A conversation between a waiter and a customer in a restaurant.'}]


6.953937300000234

In conclusion we can see ONNX model has good performance on given texts. It took 15.8 seconds for the first text and 7 for the second.

# ONNX with quantization

Now we quantize our models with `quantize_dynamic`. Also, it should be mentioned that we copy [`config.json`, `generation_config.json`, `special_tokens_map.json`, `spiece.model`, `tokenizer.json`, `tokenizer_config.json`] from `./model/pegasus/` to `./model/q_pegasus/`, because we change only models (encoder, decoder), but we don't use all other utilities.

In [13]:
from onnxruntime.quantization import quantize_dynamic

quantize_dynamic("./model/pegasus/decoder_model.onnx", "./model/q_pegasus/decoder_model.onnx")
quantize_dynamic("./model/pegasus/encoder_model.onnx", "./model/q_pegasus/encoder_model.onnx")



Ignore MatMul due to non constant B: /[/model/decoder/layers.0/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/decoder/layers.0/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/decoder/layers.0/encoder_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/decoder/layers.0/encoder_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/decoder/layers.1/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/decoder/layers.1/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/decoder/layers.1/encoder_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/decoder/layers.1/encoder_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/decoder/layers.2/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/decoder/layers.2/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/decoder/layers.2/encoder_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/decoder/layers.2/encoder_attn/MatMul_1



Ignore MatMul due to non constant B: /[/layers.0/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/layers.0/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/layers.1/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/layers.1/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/layers.2/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/layers.2/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/layers.3/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/layers.3/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/layers.4/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/layers.4/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/layers.5/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/layers.5/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/layers.6/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/layers.6/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/layers.7

Then we load our quantized model:

In [14]:
tokenizer = AutoTokenizer.from_pretrained("./model/q_pegasus/")
model = ORTModelForSeq2SeqLM.from_pretrained("./model/q_pegasus/", use_cache=False)

We are performing same test actions in all cells below but with quantized model. 

## Forward propagation test

In [15]:
model_time_evaluation(model, tokenizer, dataset['validation'])

101.02335279999988

As we can see inference time of quantized model is much lower than of ONNX model.

## Pipeline tests

In [16]:
pipe = pipeline('summarization', model=model, tokenizer=tokenizer)

In [17]:
start_time = time.perf_counter()
print(pipe(dataset['validation']['dialogue'][0]))
end_time = time.perf_counter()
end_time - start_time

[{'summary_text': "Here's a letter from Lemmy to his father, Tom, explaining why he wants to buy his son a puppy."}]


11.485970299999735

In [18]:
start_time = time.perf_counter()
print(pipe(dataset['validation']['dialogue'][1]))
end_time = time.perf_counter()
end_time - start_time

[{'summary_text': 'A conversation between a waiter and a customer in a restaurant.'}]


5.546223100000134

In conclusion, we can that quantized model generating pipeline with same level of performance is almost 1.5 faster.

# Model deployment

In `fast-api-service` folder we have a fastapi service that implements API to access our summarization model.

Below we build a docker image of fastapi service and run the container.

In [19]:
!cd fast-api-service && docker-compose build && docker-compose up -d

#0 building with "default" instance using docker driver

#1 [transformer-model internal] load .dockerignore
#1 transferring context: 2B done
#1 DONE 0.0s

#2 [transformer-model internal] load build definition from Dockerfile
#2 transferring dockerfile: 560B done
#2 DONE 0.0s

#3 [transformer-model internal] load metadata for docker.io/tiangolo/uvicorn-gunicorn-fastapi:python3.10
#3 ...

#4 [transformer-model auth] tiangolo/uvicorn-gunicorn-fastapi:pull token for registry-1.docker.io
#4 DONE 0.0s

#3 [transformer-model internal] load metadata for docker.io/tiangolo/uvicorn-gunicorn-fastapi:python3.10
#3 DONE 2.1s

#5 [transformer-model 1/6] FROM docker.io/tiangolo/uvicorn-gunicorn-fastapi:python3.10@sha256:8b237d01ded686ca6c2910af28302cc1997fac8adc69a10cddae603d05c4f485
#5 DONE 0.0s

#6 [transformer-model internal] load build context
#6 transferring context: 692B done
#6 DONE 0.0s

#7 [transformer-model 2/6] RUN curl -sSL https://install.python-poetry.org | python - --version 1.2.0 &&  

 Container transformer-service-latest  Created
 Container transformer-service-latest  Starting
 Container transformer-service-latest  Started


Now we will check the operation of the API.

In [21]:
import requests

requests.post(
    'http://127.0.0.1:8088/summarize',
    data="Giuseppe: Hi man Matteo: Yo Giuseppe: How's it going with Gosia? Matteo: I don't know, she's a little strange Giuseppe: Why? Matteo: She always criticizes me because I like football and video games Giuseppe: Damn Matteo: Yeah... Giuseppe: Ok, I don't like games either, but... Matteo: You boring guy Giuseppe: Lol Matteo: Anyway I like her a lot Giuseppe: I can understand that, she's hot, if you ever dump her make sure you tell me Matteo: Get your hands off her, man Giuseppe: Just kidding Matteo: Lollolol"
    ).text

'{"summary":"On this week\'s episode of Corriere dello Sport, we take a look at the relationship between Matteo and Gosia."}'

In [None]:
requests.post(
    'http://127.0.0.1:8088/summarize',
    data=dataset['validation']['dialogue'][0].encode('utf-8')
).text

As we see it gives responses to our requests with expected result.