In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
from torchao.quantization.qat.api import QATConfig
import copy
import time
import numpy as np

from torchao.quantization import (
    
    Int8WeightOnlyConfig,
    Int4WeightOnlyConfig,
    Int8DynamicActivationInt8WeightConfig,
    Int8DynamicActivationInt4WeightConfig,
    quantize_,
)
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.auto import tqdm
import warnings
import os
warnings.filterwarnings('ignore')

Skipping import of cpp extensions due to incompatible torch version 2.7.0+cu126 for torchao version 0.14.0         Please see GitHub issue #2919 for more info
W1102 18:17:51.898000 3872 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
# --- 1. Setup DataLoaders ---
# CIFAR-100 images are 32x32. They are upscaled to 224x224 and normalized
# with the ImageNet channel statistics before being fed to the model.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Both splits share the same preprocessing pipeline.
trainset = torchvision.datasets.CIFAR100(
    root='./data', train=True, download=True, transform=transform
)
testset = torchvision.datasets.CIFAR100(
    root='./data', train=False, download=True, transform=transform
)

# Only the training split is shuffled; the test order stays fixed.
train_loader = DataLoader(trainset, batch_size=32, shuffle=True, num_workers=2)
test_loader = DataLoader(testset, batch_size=32, shuffle=False, num_workers=2)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [3]:
# --- 2. Load the saved VGG checkpoint (FP32 baseline) ---
# The file holds a whole pickled model object rather than a bare state_dict,
# which is why weights_only=False is required here.
# SECURITY: weights_only=False unpickles arbitrary Python objects. Only load
# checkpoint files that come from a trusted source.
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only host.
model_fp32 = torch.load(
    "./models/vgg_73.pth",
    weights_only=False,
    map_location=device,
)

# Unwrap a DataParallel-style wrapper when present; a bare model is used as-is.
model_fp32 = getattr(model_fp32, "module", model_fp32)

model_fp32 = model_fp32.to(device)

In [37]:
# Quick sanity check: top-1 accuracy of the loaded model on the test loader.
model_fp32.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in tqdm(test_loader, desc="Evaluating", leave=False):
        images = images.to(device)
        labels = labels.to(device)
        outputs = model_fp32(images)
        # torch.max over dim 1 returns (values, indices); only the indices are needed.
        predicted = torch.max(outputs, 1)[1]
        total += len(labels)
        correct += (predicted == labels).sum().item()

print(f"Baseline Accuracy: {100 * correct / total:.2f}%")

Evaluating:   0%|          | 0/313 [00:00<?, ?it/s]

Baseline Accuracy: 71.16%


In [4]:
import torch.cuda.amp as amp # Make sure to import amp

def evaluate(model, data_loader, device):
    """
    Compute the top-1 accuracy (in percent) of ``model`` over ``data_loader``.

    Args:
        model: Module to evaluate. It is moved to ``device`` and switched to
            eval mode as a side effect.
        data_loader: Iterable yielding ``(images, labels)`` batches.
        device: ``torch.device`` or device string to run on. Models containing
            ``torch.quantization`` int8 modules are forced onto the CPU.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when ``data_loader``
        yields no samples.
    """
    # Accept plain strings such as "cuda" as well as torch.device objects.
    device = torch.device(device)

    model.to(device)
    model.eval()

    # The dtype of the first parameter decides whether autocast is needed.
    try:
        model_dtype = next(model.parameters()).dtype
    except StopIteration:
        model_dtype = torch.float32  # Default for models without parameters

    # Modules produced by the torch.quantization workflow are run on the CPU.
    is_int8_quantized = any(
        isinstance(m, (
            torch.quantization.QuantStub,
            torch.nn.intrinsic.quantized.ConvReLU2d,
            torch.nn.quantized.Linear
            ))
        for m in model.modules()
    )

    if is_int8_quantized:
        device = torch.device("cpu")
        model.to(device)

    # Loop-invariant: fp16/bf16 weights on CUDA run under autocast so that the
    # fp32 input batches are cast to the model's dtype automatically.
    use_autocast = (
        model_dtype in (torch.float16, torch.bfloat16) and device.type == 'cuda'
    )

    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(data_loader, desc="Evaluating", leave=False):
            images, labels = images.to(device), labels.to(device)

            if use_autocast:
                # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
                with torch.autocast(device_type='cuda', dtype=model_dtype):
                    outputs = model(images)
            else:
                # fp32 and int8 models take the batch as-is.
                outputs = model(images)

            # No .data needed: gradients are already disabled by no_grad().
            predicted = outputs.max(dim=1)[1]
            total += len(labels)
            correct += (predicted == labels).sum().item()

    # An empty loader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return 100 * correct / total

In [5]:
def print_model_size(model, label=""):
    """
    Print (and return) the serialized size of the model's state_dict.

    The state_dict is saved into a private temporary directory, so nothing is
    written to the working directory and the file is removed even when saving
    fails part-way.

    Args:
        model: Object exposing ``state_dict()``.
        label: Text shown in parentheses in the printed line.

    Returns:
        Size of the saved state_dict in megabytes (bytes / 1e6).
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Keep the original file name so the serialized archive is unchanged.
        tmp_path = os.path.join(tmp_dir, "temp.p")
        torch.save(model.state_dict(), tmp_path)
        size_mb = os.path.getsize(tmp_path) / 1e6

    print(f"Size ({label}): {size_mb:.2f} MB")
    return size_mb

In [6]:
# Fine-tune model_fp32 on the training loader.
# NOTE: this updates model_fp32 in place, so every later cell sees the
# fine-tuned (or partially fine-tuned, if interrupted) weights.
num_epochs = 1
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model_fp32.parameters(), lr=0.001, momentum=0.9)

print(f"Starting {num_epochs}-epoch fine-tuning on CIFAR-100...")
start_time = time.time()

for epoch in range(num_epochs):
    # Time each epoch on its own; start_time only tracks the whole run.
    epoch_start = time.time()

    model_fp32.train() # Set model to training mode
    running_loss = 0.0

    # Wrap train_loader with tqdm for a progress bar
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

    for images, labels in progress_bar:
        images, labels = images.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model_fp32(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        # Update the progress bar description with the current loss
        progress_bar.set_postfix(loss=f"{batch_loss:.4f}")

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    epoch_loss = running_loss / max(len(train_loader), 1)
    epoch_time = time.time() - epoch_start

    # Print epoch results
    print(f"Epoch [{epoch+1}/{num_epochs}], Avg Loss: {epoch_loss:.4f}, Time: {epoch_time:.2f}s")

print("Finished fine-tuning.")
print(f"Total fine-tuning time: {time.time() - start_time:.2f}s")

Starting 1-epoch fine-tuning on CIFAR-100...


KeyboardInterrupt: 

In [43]:
# FP32 reference point: accuracy and serialized size before any quantization.
baseline_accuracy = evaluate(model_fp32, test_loader, device)
print_model_size(model_fp32, "FP32")
# The model is loaded from a saved checkpoint and evaluated on CIFAR-100, so
# the former "(untrained on CIFAR-100)" label was misleading and is dropped.
print(f"Baseline FP32 Accuracy: {baseline_accuracy:.2f}%")

Evaluating:   0%|          | 0/313 [00:00<?, ?it/s]

Size (FP32): 516.71 MB
Baseline FP32 Accuracy (untrained on CIFAR-100): 68.88%


In [44]:
# --- PTQ fp16 ---
# Work on a deep copy so model_fp32 keeps its full-precision weights.
model_fp16 = copy.deepcopy(model_fp32).to(dtype=torch.float16)
fp16_accuracy = evaluate(model_fp16, test_loader, device)
print_model_size(model_fp16, "PTQ fp16")
print(f"PTQ fp16 Accuracy: {fp16_accuracy:.2f}%")

# --- PTQ bf16 ---
# bf16 is only attempted when a CUDA device that supports it is present.
bf16_available = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
if not bf16_available:
    print("bf16 not supported on this hardware.")
else:
    model_bf16 = copy.deepcopy(model_fp32).to(dtype=torch.bfloat16)
    bf16_accuracy = evaluate(model_bf16, test_loader, device)
    print_model_size(model_bf16, "PTQ bf16")
    print(f"PTQ bf16 Accuracy: {bf16_accuracy:.2f}%")

Evaluating:   0%|          | 0/313 [00:00<?, ?it/s]

Size (PTQ fp16): 258.36 MB
PTQ fp16 Accuracy: 68.84%


Evaluating:   0%|          | 0/313 [00:00<?, ?it/s]

Size (PTQ bf16): 258.36 MB
PTQ bf16 Accuracy: 68.82%


In [50]:
# --- PTQ int8 (weight-only) ---
int8_config = Int8WeightOnlyConfig()

# Quantize a float32 deep copy so model_fp32 itself is left untouched.
model_int8_ptq = copy.deepcopy(model_fp32).to(dtype=torch.float32)
quantize_(model_int8_ptq, int8_config)

accu_int8_ptq = evaluate(model_int8_ptq, test_loader, device)
print_model_size(model_int8_ptq, "PTQ int8")
print(f"PTQ int8 Accuracy: {accu_int8_ptq:.2f}%")

Evaluating:   0%|          | 0/313 [00:00<?, ?it/s]

Size (PTQ int8): 156.97 MB
PTQ int8 Accuracy: 68.86%


In [74]:
%pip install torchao



In [18]:
import copy
import torch
from torchao.quantization import quantize_, Int4WeightOnlyConfig

!pip install fbgemm-gpu fbgemm-gpu-genai
model_int4_ptq = copy.deepcopy(model_fp32).to(torch.float32)
quantize_(model_int4_ptq, Int4WeightOnlyConfig(group_size=128))

accu_int4_ptq = evaluate(model_int4_ptq, test_loader, device)
print_model_size(model_int4_ptq, "PTQ int4")
print(f"PTQ int4 Accuracy: {accu_int4_ptq:.2f}%")


ERROR: Could not find a version that satisfies the requirement fbgemm-gpu (from versions: none)

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for fbgemm-gpu


ImportError: Requires fbgemm-gpu-genai >= 1.2.0

In [21]:
!pip install fbgemm-gpu fbgemm-gpu-genai

ERROR: Could not find a version that satisfies the requirement fbgemm-gpu (from versions: none)

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for fbgemm-gpu


In [22]:
# Display the installed PyTorch version string.
torch.version.__version__

'2.7.0+cu126'

In [23]:

!pip uninstall -y torchao
!pip install torchao==0.14.0
!pip install fbgemm-gpu fbgemm-gpu-genai

Found existing installation: torchao 0.13.0
Uninstalling torchao-0.13.0:
  Successfully uninstalled torchao-0.13.0
Collecting torchao==0.14.0
  Downloading torchao-0.14.0-py3-none-any.whl.metadata (19 kB)
Downloading torchao-0.14.0-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 1.1/1.1 MB 8.7 MB/s eta 0:00:00
Installing collected packages: torchao
Successfully installed torchao-0.14.0



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement fbgemm-gpu (from versions: none)

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for fbgemm-gpu


In [9]:
# Install attempt for the int4 backend packages via the %pip magic.
# The recorded output of this cell shows pip finding no matching fbgemm-gpu
# distribution for this environment.
%pip install fbgemm-gpu fbgemm-gpu-genai

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement fbgemm-gpu (from versions: none)

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for fbgemm-gpu


In [11]:
import matplotlib.pyplot as plt

# Summary data, one entry per model format.
# NOTE(review): these values are hard-coded rather than read from the
# variables computed earlier, and the accuracies differ from the outputs
# recorded in this notebook (about 68.8-68.9%). Confirm which run they
# belong to.
formats = ["FP32", "FP16", "BF16", "INT8", "INT4"]
sizes = [516.71, 258.36, 258.36, 156.97, 164.37]
accuracies = [71.16, 71.14, 71.16, 71.24, 71.07]

# --- Figure 1: Accuracy ---
fig_acc, ax_acc = plt.subplots(figsize=(6, 4))
ax_acc.plot(formats, accuracies, marker='o', color='tab:blue', linewidth=2)
ax_acc.set_title("Model Accuracy by Quantization Type")
ax_acc.set_xlabel("Model Format")
ax_acc.set_ylabel("Accuracy (%)")
# Fixed y-range chosen to zoom in on the small differences between formats.
ax_acc.set_ylim(70.9, 71.3)
ax_acc.grid(True)
fig_acc.tight_layout()
fig_acc.savefig("accuracy_plot.png", dpi=300)
plt.close(fig_acc)

# --- Figure 2: Model Size ---
fig_size, ax_size = plt.subplots(figsize=(6, 4))
ax_size.bar(formats, sizes, color='tab:orange')
ax_size.set_title("Model Size by Quantization Type")
ax_size.set_xlabel("Model Format")
ax_size.set_ylabel("Size (MB)")
ax_size.grid(axis='y')
fig_size.tight_layout()
fig_size.savefig("size_plot.png", dpi=300)
plt.close(fig_size)

print("✅ Saved 'accuracy_plot.png' and 'size_plot.png'")


✅ Saved 'accuracy_plot.png' and 'size_plot.png'
