# QAT PROJECT: SST-2 DATASET

This file aims to load a model trained with pytorch and convert it to onnx format, the final objective is to apply QAT and observe the trade-offs between the baseline model and the optimized model.

In [1]:
from config import FINE_TUNED_MODEL_SAVE_PATH
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(FINE_TUNED_MODEL_SAVE_PATH)

print("Model loaded successfully from:", FINE_TUNED_MODEL_SAVE_PATH)

Model loaded successfully from: ./fine_tuned_baseline_model


In [2]:
from src.evaluate_models import evaluate_pytorch_model
from src.data_preparation import load_and_preprocess_data, get_subsetted_datasets
from config import TOKENIZED_DATASET_SAVE_PATH, TOKENIZER_SAVE_PATH, PER_DEVICE_EVAL_BATCH_SIZE, SUBSET_SIZE, NUM_PROCESSES_FOR_MAP, MAX_SEQUENCE_LENGTH, MODEL_NAME

sst2_ds, tokenized_ds, parent_tokenizer = load_and_preprocess_data(
    model_name=MODEL_NAME,
    tokenizer_save_path=TOKENIZER_SAVE_PATH,
    tokenized_dataset_save_path=TOKENIZED_DATASET_SAVE_PATH,
    max_length=MAX_SEQUENCE_LENGTH,
    num_processes_for_map=NUM_PROCESSES_FOR_MAP
)

tok_train_ds, tok_val_ds = get_subsetted_datasets(
    tokenized_ds=tokenized_ds,
    train_subset_size=SUBSET_SIZE,
)

Loading SST-2 dataset...
Loading tokenizer from local path: ./distilbert_tokenizer_local
Loading tokenized dataset from: ./SST2_tokenized_dataset

Using full train dataset for training.
Final subset sizes: Train=67349, Eval=872


In [3]:
print("\nStarting evaluation of the baseline model...")
evaluate_pytorch_model(
    model_path=FINE_TUNED_MODEL_SAVE_PATH,
    eval_dataset=tok_val_ds,
    batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    tokenizer=parent_tokenizer
)
print("Baseline model evaluation complete!")


Starting evaluation of the baseline model...


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

PyTorch Model Accuracy: 0.9083
Average Inference Time per Batch: 0.0360 seconds
Model Size: 255.43 MB
Baseline model evaluation complete!


In [4]:
from optimum.exporters.onnx import main_export

def export_to_onnx(model_name_or_path, output, tokenizer, task="sequence-classification", opset=17):
    """
    Export a model to ONNX format.
    
    Args:
        model_name_or_path (str): Path to the model.
        output (str): Output directory for the ONNX model.
        task (str): Task type for the model.
        tokenizer: Tokenizer used for the model.
        opset (int): ONNX opset version.
    """
    main_export(
        model_name_or_path=model_name_or_path,
        output=output,
        task=task,
        tokenizer=tokenizer,
        opset=opset
    )



In [None]:
import torch
from torch.quantization import (
    QConfig,
    FakeQuantize,
    PerChannelMinMaxObserver,
    MovingAverageMinMaxObserver
)

model.train()

# Exclude embeddings from quantization
model.distilbert.embeddings.qconfig = None

custom_qconfig = QConfig(
    activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, dtype=torch.quint8,
                                     qscheme=torch.per_tensor_affine),
    weight=FakeQuantize.with_args(observer=PerChannelMinMaxObserver, quant_min=-128, quant_max=127, dtype=torch.qint8,
                                 qscheme=torch.per_channel_symmetric)
)

model.qconfig = custom_qconfig

# qat_model = prepare_qat_fx(model)
qat_model = torch.quantization.prepare_qat(model, inplace=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
qat_model.to(device)

# Set the model to training mode for the fine-tuning step.
qat_model.train()

print("PyTorch model prepared for Quantization-Aware Training.")

PyTorch model prepared for Quantization-Aware Training.


In [6]:
from transformers import TrainingArguments, Trainer
from src.utils import compute_metrics

num_qat_epochs = 1
qat_output_dir = "./qat_finetuning_output"
qat_learning_rate = 2e-5
per_device_train_batch_size = 8
per_device_eval_batch_size = 8

print(f"\nStarting QAT fine-tuning for {num_qat_epochs} epochs...")

qat_training_args = TrainingArguments(
    output_dir=qat_output_dir,
    num_train_epochs=num_qat_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    learning_rate=qat_learning_rate,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir=f"{qat_output_dir}/logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
)

qat_trainer = Trainer(
    model=qat_model,
    args=qat_training_args,
    train_dataset=tok_train_ds,
    eval_dataset=tok_val_ds,
    tokenizer=parent_tokenizer,
    compute_metrics=compute_metrics,
)
    
qat_trainer.train()
print("QAT fine-tuning complete.")


Starting QAT fine-tuning for 1 epochs...


  qat_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.08,0.600382,0.899083


QAT fine-tuning complete.


In [9]:
from torch.quantization.quantize_fx import convert_fx

qat_model.eval()
qat_model.cpu()
fused_quantized_model = convert_fx(qat_model)

print("\nPyTorch model converted to a fused, quantized state.")

ValueError: input model must be a GraphModule, Got type:<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> Please make sure to follow the tutorials.