In [6]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# Hub checkpoint id; used below to fetch the tokenizer.
model_id = "yoshitomo-matsubara/bert-base-uncased-sst2"

# Load the ONNX graph stored as quantize_model.onnx in the directory "."
# (presumably the quantized export produced earlier - verify).
model = ORTModelForSequenceClassification.from_pretrained(
    ".",
    file_name="quantize_model.onnx",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

The ONNX file quantize_model.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [7]:
from transformers import pipeline

# Bundle the ONNX Runtime model and its tokenizer into a transformers pipeline.
clf = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

In [8]:
# Sanity check: run the ONNX-backed pipeline on a clearly negative sentence.
clf("I hate you")

[{'label': 'LABEL_0', 'score': 0.9988948702812195}]

In [9]:
# Sanity check: run the ONNX-backed pipeline on a clearly positive sentence.
clf("I like you")

[{'label': 'LABEL_1', 'score': 0.9995238780975342}]

In [22]:
from time import perf_counter
import numpy as np

# Benchmark input: the same short e-mail text repeated twice.
payload = (
    "Hello my name is Philipp. "
    "I am getting in touch with you because i didn't get a response from you. "
    "What do I need to do to get my new card which I have requested 2 weeks ago? "
    "Please help me and answer this email in the next 7 days. "
    "Best regards and have a nice weekend "
) * 2
# Tokenize the benchmark payload and report how many input ids it produces.
print(f'Payload sequence length: {len(tokenizer(payload)["input_ids"])}')

def measure_latency(pipe, text=None, warmup_runs=10, timed_runs=300):
    """Benchmark a callable by timing repeated calls on a single input.

    Args:
        pipe: Callable invoked as ``pipe(text)``; its return value is discarded.
        text: Input handed to ``pipe``. When ``None`` (the default) the
            notebook-level ``payload`` variable is used, matching the
            original behaviour.
        warmup_runs: Number of untimed calls made before measuring.
        timed_runs: Number of timed calls the statistics are computed from.

    Returns:
        A ``(summary, p95_ms)`` tuple: a human-readable string with the P95,
        mean and standard deviation of the latencies in milliseconds, and the
        P95 latency in milliseconds as a number.

    Raises:
        ValueError: If ``timed_runs`` is smaller than 1, because no statistics
            can be computed from zero samples.
    """
    if timed_runs < 1:
        raise ValueError("timed_runs must be at least 1")
    if text is None:
        # Fall back to the benchmark text defined at notebook level.
        text = payload

    # Warm up so one-off initialisation cost is not part of the measurement.
    for _ in range(warmup_runs):
        pipe(text)

    # Timed run
    latencies = []
    for _ in range(timed_runs):
        start_time = perf_counter()
        pipe(text)
        latencies.append(perf_counter() - start_time)

    # Compute run statistics (seconds -> milliseconds)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies, 95)

    # The backslash is escaped explicitly: a bare "\-" is an invalid escape
    # sequence that newer Python versions warn about. The emitted text is the
    # same as before.
    summary = (
        f"P95 latency (ms) - {time_p95_ms}; "
        f"Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};"
    )
    return summary, time_p95_ms


Payload sequence length: 128


In [23]:
# Baseline: the Hub checkpoint loaded directly by transformers.
vanilla_clx = pipeline("text-classification", model=model_id)

vanilla_model = measure_latency(vanilla_clx)
# Benchmark the ONNX pipeline as well: the comparison cell reads
# `quantized_model`, which would otherwise be undefined on a fresh kernel.
quantized_model = measure_latency(clf)

In [15]:
# Report both benchmark summaries (element 0 of each measure_latency result)
# and the speed-up as the ratio of their P95 latencies (element 1).
# Requires `vanilla_model` and `quantized_model` to have been computed by
# earlier cells.
print(f"Vanilla model: {vanilla_model[0]}")
print(f"Quantized model: {quantized_model[0]}")
print(f"Improvement through quantization: {round(vanilla_model[1]/quantized_model[1],2)}x")


Vanilla model: P95 latency (ms) - 252.65053809989695; Average latency (ms) - 215.75 +\- 27.86;
Quantized model: P95 latency (ms) - 146.056037150629; Average latency (ms) - 139.34 +\- 17.53;
Improvement through quantization: 1.73x


## Dynamic quantization optimization

In [24]:
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer, ORTQuantizer
from optimum.onnxruntime.configuration import OptimizationConfig, AutoQuantizationConfig
from transformers import AutoTokenizer

model_id = "yoshitomo-matsubara/bert-base-uncased-sst2"
onnx_path = "dyn_test"

# Load the checkpoint as an ONNX Runtime model (from_transformers=True
# requests conversion from the transformers weights) plus its tokenizer.
ort_model = ORTModelForSequenceClassification.from_pretrained(
    model_id,
    from_transformers=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Step 1 - graph optimization; level 99 enables all optimizations.
optimizer = ORTOptimizer.from_pretrained(ort_model)
optimization_config = OptimizationConfig(optimization_level=99)
optimizer.optimize(
    optimization_config=optimization_config,
    save_dir=onnx_path,
)

# Step 2 - dynamic (is_static=False) quantization of the optimized graph
# that step 1 saved into `onnx_path`.
dynamic_quantizer = ORTQuantizer.from_pretrained(
    onnx_path,
    file_name="model_optimized.onnx",
)
dqconfig = AutoQuantizationConfig.avx512_vnni(
    is_static=False,
    per_channel=False,
)
model_quantized_path = dynamic_quantizer.quantize(
    quantization_config=dqconfig,
    save_dir=onnx_path,
)

# Step 3 - reload the optimized and quantized model for inference.
opt_model = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model_optimized_quantized.onnx",
)


2023-01-31 16:39:07.855500836 [W:onnxruntime:, inference_session.cc:1458 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.
The ONNX file model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [25]:
from transformers import pipeline

# Serve the optimized, dynamically quantized model through a pipeline.
dyn_q = pipeline(
    "sentiment-analysis",
    model=opt_model,
    tokenizer=tokenizer,
)

# Benchmark it with the same procedure as the baseline.
quantized_model = measure_latency(dyn_q)

# Summaries for both models, then the speed-up as the ratio of P95 latencies.
print(f"Vanilla model: {vanilla_model[0]}")
print(f"Quantized model: {quantized_model[0]}")
print(f"Improvement through quantization: {round(vanilla_model[1]/quantized_model[1],2)}x")

Vanilla model: P95 latency (ms) - 316.55501350051054; Average latency (ms) - 308.13 +\- 21.29;
Quantized model: P95 latency (ms) - 139.15134559983926; Average latency (ms) - 137.75 +\- 0.80;
Improvement through quantization: 2.27x
