# QAT PROJECT: SST-2 DATASET

This notebook loads a model trained with PyTorch and converts it to ONNX format. The final objective is to apply quantization-aware training (QAT) and observe the trade-offs between the baseline model and the optimized model.

In [22]:
from config import (
    FINE_TUNED_MODEL_SAVE_PATH,
    TOKENIZED_DATASET_SAVE_PATH, 
    TOKENIZER_SAVE_PATH, 
    PER_DEVICE_EVAL_BATCH_SIZE, 
    PER_DEVICE_TRAIN_BATCH_SIZE,
    SUBSET_SIZE, 
    NUM_PROCESSES_FOR_MAP, 
    MAX_SEQUENCE_LENGTH, 
    MODEL_NAME,
    QUANTIZED_QAT_MODEL_SAVE_PATH,
    #ONNX_MODEL_SAVE_PATH
)
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Override the value imported from `config` for this notebook run.
# NOTE(review): -1 presumably means "use the full training split" -- confirm
# against `get_subsetted_datasets`.
SUBSET_SIZE = -1

Using device: cpu


In [4]:
from transformers import DistilBertForSequenceClassification

# Restore the fine-tuned baseline checkpoint from the directory named in `config`.
baseline_checkpoint = FINE_TUNED_MODEL_SAVE_PATH
model = DistilBertForSequenceClassification.from_pretrained(baseline_checkpoint)

print("Model loaded successfully from:", baseline_checkpoint)

Model loaded successfully from: ./fine_tuned_baseline_model


## TOKENIZATION & DATA

In [23]:
from src.data_preparation import get_subsetted_datasets, load_and_preprocess_data

# Load the raw dataset, its tokenized form and the tokenizer, all driven by
# the values imported from `config`.
preprocessed = load_and_preprocess_data(
    model_name=MODEL_NAME,
    max_length=MAX_SEQUENCE_LENGTH,
    num_processes_for_map=NUM_PROCESSES_FOR_MAP,
    tokenizer_save_path=TOKENIZER_SAVE_PATH,
    tokenized_dataset_save_path=TOKENIZED_DATASET_SAVE_PATH,
)
sst2_ds, tokenized_ds, parent_tokenizer = preprocessed

# Derive the train / validation splits used by the rest of the notebook.
subsets = get_subsetted_datasets(
    train_subset_size=SUBSET_SIZE,
    tokenized_ds=tokenized_ds,
)
tok_train_ds, tok_val_ds = subsets

Loading SST-2 dataset...
Loading tokenizer from local path: ./distilbert_tokenizer_local
Loading tokenized dataset from: ./SST2_tokenized_dataset

Using full train dataset for training.
Final subset sizes: Train=67349, Eval=872


## Evaluating the fine-tuned model

In [6]:
from src.evaluate_models import evaluate_pytorch_model

print("\nStarting evaluation of the baseline model...")

# Evaluate the fine-tuned checkpoint on the validation split.
baseline_eval_kwargs = dict(
    model_path=FINE_TUNED_MODEL_SAVE_PATH,
    tokenizer=parent_tokenizer,
    eval_dataset=tok_val_ds,
    batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
)
evaluate_pytorch_model(**baseline_eval_kwargs)

print("Baseline model evaluation complete!")



Starting evaluation of the baseline model...
Evaluation device: cpu


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

PyTorch Model Accuracy: 0.9000
Average Inference Time per Batch: 0.7606 seconds
Model Size: 255.43 MB
Baseline model evaluation complete!


## Apply Dynamic Quantization

In [7]:
from torch.ao.quantization import quantize_dynamic

# Quantize on the CPU with the model in inference mode.
model.to("cpu")
model.eval()

# Only the Linear layers are converted; their weights are stored as int8.
layers_to_quantize = {torch.nn.Linear}
quantization_dtype = torch.qint8

# Apply dynamic quantization through the `torch.ao.quantization` function
# imported above, rather than the legacy `torch.quantization` alias, so the
# import is actually used and a single namespace is referenced.
# NOTE(review): with the default `inplace=False` this should return a converted
# copy and leave `model` untouched -- confirm against the installed PyTorch.
model_quantized = quantize_dynamic(
    model=model,
    qconfig_spec=layers_to_quantize,
    dtype=quantization_dtype,
)

print("\nDynamically Quantized Model:")
print(model_quantized)


Dynamically Quantized Model:
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (out_lin): DynamicQuantizedLine

## Saving and loading the quantized model

In [16]:
import torch
import os
from transformers import AutoTokenizer

qat_model_save_path = "./quantized_pytorch_model"
# `exist_ok=True` creates the directory only when missing, without the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(qat_model_save_path, exist_ok=True)

# Build the target path once so the save call and the log line cannot drift apart.
quantized_model_file = os.path.join(qat_model_save_path, "pytorch_model.bin")

# NOTE: this pickles the entire module object, not a state_dict. Reloading
# therefore needs `torch.load` on a trusted file with the same class
# definitions importable; never load such a file from an untrusted source.
torch.save(model_quantized, quantized_model_file)
print(f"Quantized model object saved to: {quantized_model_file}")

# --- Save model config (from original model) and tokenizer (from original model path) ---
model.config.save_pretrained(qat_model_save_path)
tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_MODEL_SAVE_PATH)
tokenizer.save_pretrained(qat_model_save_path)
print("Quantized model config and tokenizer saved.")

Quantized model object saved to: ./quantized_pytorch_model\pytorch_model.bin
Quantized model config and tokenizer saved.


## Export the PyTorch model to ONNX format

In [18]:
from optimum.onnxruntime.configuration import QuantFormat, QuantizationConfig, QuantType

# Keep the dynamically quantized PyTorch model in inference mode.
model_quantized.eval()

# Settings for the ONNX quantization config: QDQ format, per-channel,
# symmetric signed 8-bit weights, asymmetric unsigned 8-bit activations,
# restricted to MatMul and Gemm operators.
# NOTE(review): `is_static=True` presumably requires calibration data before
# this config can be applied -- confirm against the Optimum documentation.
onnx_quant_settings = dict(
    format=QuantFormat.QDQ,
    is_static=True,
    per_channel=True,
    operators_to_quantize=["MatMul", "Gemm"],
    weights_dtype=QuantType.QInt8,
    weights_symmetric=True,
    activations_dtype=QuantType.QUInt8,
    activations_symmetric=False,
)
quantization_config_onnx = QuantizationConfig(**onnx_quant_settings)

In [20]:
from optimum.exporters.onnx import main_export

# Set the output directory for the exported ONNX model
output_onnx_dir = "./onnx_models_quantized"
os.makedirs(output_onnx_dir, exist_ok=True)
output_path = os.path.join(output_onnx_dir, "model.onnx")

# NOTE: `main_export` only converts the checkpoint to ONNX; it does not
# quantize. It has no `tokenizer` or `quantization_config` parameter, and
# passing them had no effect on the export: the resulting file is the same
# size as the FP32 baseline. They are omitted so the call reflects what happens.
# TODO: to obtain a quantized ONNX model, run a separate quantization step
# (e.g. Optimum's `ORTQuantizer`) on the exported file using
# `quantization_config_onnx`, then evaluate that artifact.
main_export(
    # Pass the path to the directory containing the FP32 model
    model_name_or_path=FINE_TUNED_MODEL_SAVE_PATH,
    output=output_onnx_dir,
    task="sequence-classification",
    opset=17,
    framework='pt',
)

## Evaluate the ONNX model

In [24]:
from src.evaluate_models import evaluate_onnx_model

# Location of the exported graph inside the export directory.
onnx_model_file = output_onnx_dir + "/model.onnx"

# Evaluate the exported model on the validation split, requesting the GPU
# only when PyTorch reports that CUDA is available.
onnx_eval_results = evaluate_onnx_model(
    onnx_model_path=onnx_model_file,
    eval_dataset=tok_val_ds,
    tokenizer=parent_tokenizer,
    batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    use_gpu=torch.cuda.is_available(),
)
onnx_metrics, onnx_inference_time, onnx_model_size = onnx_eval_results


Evaluating ONNX model...


Evaluating ONNX Model:   0%|          | 0/28 [00:00<?, ?it/s]

ONNX Model Accuracy on CPU: 0.9083
Average Inference Time per Batch on CPU: 1.4485 seconds
ONNX Model Size: 255.52 MB
