In [None]:
pip install torch torchvision torchaudio transformers Pillow psutil

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os
import time
import psutil
import torch
from PIL import Image
from torchvision import transforms
from transformers import AutoImageProcessor, ResNetForImageClassification

# ----------------------------
# 1. Configuration
# ----------------------------
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint identifier handed to `from_pretrained`, the file the quantized
# weights are saved to / loaded from, and the image used for the demo.
MODEL_NAME = "microsoft/resnet-50"
QUANTIZED_MODEL_PATH = "resnet50_quantized.pth"
IMAGE_PATH = "cat.jpg"

# Benchmark settings: items assumed per forward pass when computing
# throughput, and the default number of timed forward passes.
BATCH_SIZE, NUM_RUNS = 1, 20

# ----------------------------
# 2. Load Processor & FP32 Model
# ----------------------------
print("Loading FP32 model and processor...")
# The processor turns PIL images into the tensors the model consumes.
processor = AutoImageProcessor.from_pretrained(MODEL_NAME)

# Load the pretrained classifier, place it on the chosen device, and switch it
# to inference mode (each call below returns the same module instance).
model_fp32 = ResNetForImageClassification.from_pretrained(MODEL_NAME)
model_fp32 = model_fp32.to(DEVICE)
model_fp32 = model_fp32.eval()

# ----------------------------
# 3. Preprocess Image
# ----------------------------
def load_image(image_path):
    """Read an image file and convert it into a model-ready tensor on ``DEVICE``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The ``"pixel_values"`` entry produced by the module-level
        ``processor`` for the RGB-converted image, moved to ``DEVICE``.
    """
    # Open inside a context manager so the underlying file handle is released
    # as soon as the RGB copy has been created, rather than whenever the
    # image object happens to be garbage-collected.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    return inputs["pixel_values"].to(DEVICE)

# ----------------------------
# 4. Quantization Function
# ----------------------------
def quantize_model(model_fp32, quantized_model_path):
    """Dynamically quantize the Linear layers of a model to INT8 and save it.

    Args:
        model_fp32: Floating-point ``torch.nn.Module`` to quantize. It is put
            into eval mode but otherwise left untouched (weights and device
            are not modified).
        quantized_model_path: Destination file for the quantized state dict.

    Returns:
        The quantized model, resident on the CPU, or ``None`` when no
        quantization engine is available on this machine.
    """
    import copy  # local import: only this function needs it

    # PyTorch lists a placeholder "none" engine even when no real backend is
    # compiled in, so an emptiness check on the raw list would never trigger.
    usable_engines = [
        engine
        for engine in torch.backends.quantized.supported_engines
        if engine != "none"
    ]
    if not usable_engines:
        print("Quantized backend not supported on this machine.")
        return None

    model_fp32.eval()
    # Dynamic quantization only provides CPU kernels. Quantize a CPU copy so
    # a model that currently lives on a GPU still works and the caller's
    # instance keeps its device. quantize_dynamic would deep-copy anyway, so
    # doing the copy here and quantizing in place costs no extra memory.
    model_cpu = copy.deepcopy(model_fp32).to("cpu")
    model_int8 = torch.quantization.quantize_dynamic(
        model_cpu, {torch.nn.Linear}, dtype=torch.qint8, inplace=True
    )
    torch.save(model_int8.state_dict(), quantized_model_path)
    print(f"Quantized model saved to: {quantized_model_path}")
    return model_int8

# ----------------------------
# 5. Load Quantized Model
# ----------------------------
def load_quantized_model(model_name, quantized_model_path):
    """Rebuild the dynamically quantized classifier and restore its saved weights.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the image processor and the base model.
        quantized_model_path: File holding the state dict written by
            ``torch.save`` for the quantized model.

    Returns:
        Tuple ``(processor, model_int8)``: the image processor and the
        quantized model in eval mode, resident on the CPU.
    """
    processor = AutoImageProcessor.from_pretrained(model_name)
    model_fp32 = ResNetForImageClassification.from_pretrained(model_name).eval()

    # The checkpoint stores packed INT8 parameters for the Linear layers, whose
    # state-dict keys differ from those of a plain FP32 model. Convert the
    # architecture the same way it was converted before saving; otherwise
    # load_state_dict fails on missing/unexpected keys.
    model_int8 = torch.quantization.quantize_dynamic(
        model_fp32, {torch.nn.Linear}, dtype=torch.qint8
    )

    # Dynamically quantized layers run on the CPU only, so load the weights
    # there and keep the model on the CPU.
    state_dict = torch.load(quantized_model_path, map_location="cpu")
    model_int8.load_state_dict(state_dict)
    model_int8.eval()
    return processor, model_int8

# ----------------------------
# 6. Benchmark Function
# ----------------------------
def benchmark(model, input_tensor, runs=NUM_RUNS):
    """Time repeated forward passes and sample memory usage afterwards.

    Args:
        model: Callable invoked as ``model(input_tensor)``.
        input_tensor: Input passed unchanged to every call.
        runs: Number of timed forward passes; must be positive.

    Returns:
        Tuple ``(latency, throughput, allocated, reserved, cpu_mem)``:
        mean latency per run in milliseconds, items per second (assuming
        ``BATCH_SIZE`` items per call), CUDA memory allocated and reserved on
        ``DEVICE`` in MiB (both ``0.0`` when ``DEVICE`` is not CUDA), and the
        resident set size of the current process in MiB.

    Raises:
        ValueError: If ``runs`` is not positive.
    """
    if runs <= 0:
        raise ValueError(f"runs must be a positive integer, got {runs}")

    on_cuda = DEVICE.type == "cuda"

    # Timing. CUDA work is queued asynchronously, so synchronize before
    # starting and before stopping the clock to bracket only the timed passes.
    if on_cuda:
        torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted.
    start_time = time.perf_counter()
    with torch.no_grad():
        for _ in range(runs):
            model(input_tensor)
    if on_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start_time

    latency = (elapsed / runs) * 1000  # ms
    # Guard against a zero reading for trivially fast callables.
    throughput = (runs * BATCH_SIZE) / elapsed if elapsed > 0 else float("inf")

    # Memory
    if on_cuda:
        allocated = torch.cuda.memory_allocated(DEVICE) / (1024 ** 2)
        reserved = torch.cuda.memory_reserved(DEVICE) / (1024 ** 2)
    else:
        allocated = reserved = 0.0
    cpu_mem = psutil.Process().memory_info().rss / (1024 ** 2)

    return latency, throughput, allocated, reserved, cpu_mem

# ----------------------------
# 7. Run Inference & Benchmark
# ----------------------------
if __name__ == "__main__":
    if not os.path.exists(IMAGE_PATH):
        raise FileNotFoundError(f"Put a test image at: {IMAGE_PATH}")

    # Dynamically quantized Linear layers only have CPU kernels, so the INT8
    # model and its inputs are kept on the CPU even when DEVICE is a GPU.
    quantized_device = torch.device("cpu")

    # Quantize the model if the quantized version doesn't exist
    if not os.path.exists(QUANTIZED_MODEL_PATH):
        print("\nQuantizing the FP32 model...")
        model_int8 = quantize_model(model_fp32.to(quantized_device), QUANTIZED_MODEL_PATH)
        if model_int8 is None:
            # exit() would shut down the notebook kernel; SystemExit only
            # stops this cell (and still terminates a plain script run).
            raise SystemExit("Dynamic quantization is unavailable; nothing to benchmark.")
        del model_fp32  # Free up memory
        processor_quantized, model_quantized = processor, model_int8
    else:
        print(f"\nLoading the quantized model from: {QUANTIZED_MODEL_PATH}")
        processor_quantized, model_quantized = load_quantized_model(MODEL_NAME, QUANTIZED_MODEL_PATH)

    # Make sure the model and its input share the CPU regardless of where the
    # helpers above placed them.
    model_quantized = model_quantized.to(quantized_device).eval()

    print(f"\nRunning inference on device: {quantized_device} (Quantized Model)\n")
    pixel_values = load_image(IMAGE_PATH).to(quantized_device)

    # Inference with quantized model (this first pass also serves as warm-up
    # before the timed benchmark below)
    with torch.no_grad():
        outputs_quantized = model_quantized(pixel_values)
        predicted_label_quantized = outputs_quantized.logits.argmax(-1).item()
        label_quantized = model_quantized.config.id2label[predicted_label_quantized]

    # Benchmark quantized model
    latency_quantized, throughput_quantized, alloc_q, resv_q, cpu_q = benchmark(model_quantized, pixel_values)

    # ----------------------------
    # 8. Output Format
    # ----------------------------
    print("Input Image:", IMAGE_PATH)
    print("\nGenerated Output (Quantized Model):")
    print(f"Predicted class: {label_quantized}\n")

    print("ResNet‑50 Hugging Face (Quantized):")
    print(f"  • Latency per image: {latency_quantized:.2f} ms")
    print(f"  • Throughput:        {throughput_quantized:.2f} images/sec")
    # GPU figures come from benchmark(), which samples them for DEVICE.
    if DEVICE.type == "cuda":
        print(f"  • Allocated GPU Memory: {alloc_q:.2f} MB")
        print(f"  • Reserved GPU Memory:  {resv_q:.2f} MB")
    print(f"  • CPU Memory Usage:     {cpu_q:.2f} MB")

Loading FP32 model and processor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]


Quantizing the FP32 model...
Quantized model saved to: resnet50_quantized.pth

Running inference on device: cpu (Quantized Model)

Input Image: cat.jpg

Generated Output (Quantized Model):
Predicted class: tabby, tabby cat

ResNet‑50 Hugging Face (Quantized):
  • Latency per image: 212.46 ms
  • Throughput:        4.71 images/sec
  • CPU Memory Usage:     1396.50 MB
