In [1]:
# Mount Google Drive at /content/drive; every checkpoint and ONNX path in this notebook lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Install the export/quantization dependencies. `%pip` (rather than `!pip`) installs into
# the environment of the running kernel.
# NOTE(review): versions are unpinned -- pin them once a known-good set is established.
%pip install -q optimum quanto onnxruntime onnxruntime-tools onnxconverter_common

## Loading the fine-tuned model

In [4]:
import os
import onnx
import torch
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification
from pathlib import Path
from transformers.onnx import FeaturesManager
from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxconverter_common import float16

In [5]:
# Drive folder of the fine-tuned checkpoint; both the model and the tokenizer are loaded from it.
fine_tuned_checkpoint = "/content/drive/MyDrive/intent_classification/fine_tuned_distilled_bert"

In [6]:
# The tokenizer and the classification model both come from the same checkpoint folder.
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_checkpoint)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
    fine_tuned_checkpoint
)

In [7]:
# Wrap the in-memory model and tokenizer in a text-classification pipeline.
fine_tuned_pipeline = pipeline(
    task="text-classification",
    model=fine_tuned_model,
    tokenizer=tokenizer,
)

In [8]:
# Sanity check: classify one sample utterance with the fine-tuned pipeline.
fine_tuned_pipeline("Hey, you up to play some games today?")

[{'label': 'play games', 'score': 0.9896244406700134}]

## Converting model to .onnx format

In [11]:
# Resolve the ONNX config class for this model and feature
# (check_supported_model_or_raise raises if the combination is unsupported).
feature = "sequence-classification"
model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(fine_tuned_model, feature=feature)
onnx_config = model_onnx_config(fine_tuned_model.config)

# Create the destination folder first so the export does not depend on it
# already existing on Drive.
onnx_export_path = Path("/content/drive/MyDrive/intent_classification/onnx_model/trfs-model.onnx")
onnx_export_path.parent.mkdir(parents=True, exist_ok=True)

# Export the fine-tuned model to ONNX with opset 13.
onnx_inputs, onnx_outputs = transformers.onnx.export(
    preprocessor=tokenizer,
    model=fine_tuned_model,
    config=onnx_config,
    opset=13,
    output=onnx_export_path,
)

  mask, torch.tensor(torch.finfo(scores.dtype).min)


## Quantizing .onnx model to int8

In [12]:
# Input: the full-precision ONNX file written by the export step.
onnx_model_path = "/content/drive/MyDrive/intent_classification/onnx_model/trfs-model.onnx"
# Output: where the int8-quantized copy of that model is written.
quantized_model_path = "/content/drive/MyDrive/intent_classification/quantint_model/quantint_trfs-model.onnx"

In [13]:
# The quantized model and its config.json share one folder; create it up front so
# neither write depends on the folder already existing.
quantized_model_dir = os.path.dirname(quantized_model_path)
os.makedirs(quantized_model_dir, exist_ok=True)

# Dynamic quantization of the exported model, storing weights as QInt8.
quantize_dynamic(
    onnx_model_path,
    quantized_model_path,
    weight_type=QuantType.QInt8,
)

# Save the model configuration next to the quantized file (path derived from
# quantized_model_path so the two cannot drift apart).
fine_tuned_model.config.to_json_file(os.path.join(quantized_model_dir, "config.json"))



## Inference with the 8-bit quantized model


In [17]:
# Folder holding the int8 ONNX file and the config.json saved alongside it.
quantint_checkpoint = "/content/drive/MyDrive/intent_classification/quantint_model"
# The tokenizer is still read from the original fine-tuned checkpoint folder.
tokenizer_checkpoint = "/content/drive/MyDrive/intent_classification/fine_tuned_distilled_bert"

In [18]:
# Name the ONNX file explicitly instead of relying on automatic discovery of a
# single *.onnx file in the folder; the file does not use optimum's default name.
quantint_model = ORTModelForSequenceClassification.from_pretrained(
    quantint_checkpoint,
    file_name="quantint_trfs-model.onnx",
)

The ONNX file quantint_trfs-model.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [19]:
# Build a text-classification pipeline on the int8 ONNX model; the tokenizer is
# given as a checkpoint path rather than an already-loaded object.
quantint_pipeline = pipeline(
    task="text-classification",
    model=quantint_model,
    tokenizer=tokenizer_checkpoint,
)

In [20]:
# Sanity check: classify one sample utterance with the int8 pipeline.
quantint_pipeline("send a mail to my manager")

[{'label': 'send email', 'score': 0.9977303147315979}]

In [21]:
# Report the on-disk size of the full-precision and int8 ONNX files in MB.
bytes_per_mb = 1024 * 1024
for size_label, onnx_file in (
    ('ONNX full precision model size (MB):', onnx_model_path),
    ('ONNX quantized model size (MB):', quantized_model_path),
):
    print(size_label, os.path.getsize(onnx_file) / bytes_per_mb)

ONNX full precision model size (MB): 255.6012372970581
ONNX quantized model size (MB): 64.26772499084473


## FP16 (half-precision) conversion

In [23]:
# Destination of the fp16 model; create its folder up front so neither write
# below depends on the folder already existing.
fp16_output_path = "/content/drive/MyDrive/intent_classification/quantfloat_model/quantfloat_trfs-model.onnx"
fp16_output_dir = os.path.dirname(fp16_output_path)
os.makedirs(fp16_output_dir, exist_ok=True)

# Load the full-precision ONNX graph, convert it to float16 and save the result.
model = onnx.load(onnx_model_path)
model_fp16 = float16.convert_float_to_float16(model)
onnx.save(model_fp16, fp16_output_path)

# Save the model configuration next to the fp16 file.
fine_tuned_model.config.to_json_file(os.path.join(fp16_output_dir, "config.json"))

In [24]:
# Full path of the fp16 ONNX file (used for the file-size comparison).
fp16_quantized_model = "/content/drive/MyDrive/intent_classification/quantfloat_model/quantfloat_trfs-model.onnx"

In [25]:
# Folder holding the fp16 ONNX file and its config.json; passed to from_pretrained.
fp16_quantized_checkpoint = "/content/drive/MyDrive/intent_classification/quantfloat_model"

## Inference with the FP16 model

In [31]:
# Name the ONNX file explicitly instead of relying on automatic discovery of a
# single *.onnx file in the folder; the file does not use optimum's default name.
quantfloat_model = ORTModelForSequenceClassification.from_pretrained(
    fp16_quantized_checkpoint,
    file_name="quantfloat_trfs-model.onnx",
)

The ONNX file quantfloat_trfs-model.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [32]:
# NOTE(review): this rebinds `quantfloat_model` from the ORT model to the pipeline that
# wraps it, so re-running this cell on its own would pass a pipeline as `model`.
# Later cells call `quantfloat_model(...)`, so the name is kept as is.
quantfloat_model = pipeline(
    task="text-classification",
    model=quantfloat_model,
    tokenizer=tokenizer_checkpoint,
)

In [33]:
# Sanity check: classify one sample utterance with the fp16 pipeline
# (`quantfloat_model` refers to the pipeline at this point, not the raw ORT model).
quantfloat_model("send a mail to my manager")

[{'label': 'send email', 'score': 0.998046875}]

In [34]:
# Report the on-disk size of the full-precision and fp16 ONNX files in MB.
bytes_per_mb = 1024 * 1024
for size_label, onnx_file in (
    ('ONNX full precision model size (MB):', onnx_model_path),
    ('ONNX quantized model size (MB):', fp16_quantized_model),
):
    print(size_label, os.path.getsize(onnx_file) / bytes_per_mb)

ONNX full precision model size (MB): 255.6012372970581
ONNX quantized model size (MB): 127.90854835510254


## Comparing inference time between the fine-tuned, FP16 and INT8 models

In [35]:
%%time
# Baseline: time one call of the original fine-tuned pipeline.
# NOTE(review): %%time measures a single run, so the figure is noisy; %%timeit would average several runs.
fine_tuned_pipeline("send a mail to my manager")

CPU times: user 82.4 ms, sys: 0 ns, total: 82.4 ms
Wall time: 118 ms


[{'label': 'send email', 'score': 0.9980485439300537}]

In [36]:
%%time
# Time one call of the fp16 ONNX pipeline (`quantfloat_model` is the pipeline object here).
# NOTE(review): %%time measures a single run, so the figure is noisy; %%timeit would average several runs.
quantfloat_model("send a mail to my manager")

CPU times: user 170 ms, sys: 64.2 ms, total: 234 ms
Wall time: 252 ms


[{'label': 'send email', 'score': 0.998046875}]

In [37]:
%%time
# Time one call of the int8 ONNX pipeline.
# NOTE(review): %%time measures a single run, so the figure is noisy; %%timeit would average several runs.
quantint_pipeline("send a mail to my manager")

CPU times: user 16 ms, sys: 319 µs, total: 16.4 ms
Wall time: 43.6 ms


[{'label': 'send email', 'score': 0.9977303147315979}]