In [None]:
# Environment check: report the installed PyTorch build and whether a CUDA GPU
# is usable. The import is guarded because this cell runs before the
# dependency-install step; an unguarded import aborts the cell with a bare
# ModuleNotFoundError and gives the reader no hint about what to do next.
try:
    import torch
except ImportError:
    print("⚠️ WARNING: PyTorch is not installed!")
    print("Run 'Step 3: Install Dependencies' (uncomment the PyTorch line), then re-run this cell.")
else:
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        # Device index 0 is the first visible GPU.
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        # total_memory is reported in bytes; divide by 1e9 to show decimal GB.
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"CUDA version: {torch.version.cuda}")
        print("\n✅ GPU detected! Ready for quantization experiments.")
    else:
        print("⚠️ WARNING: GPU not detected!")
        print("This code requires CUDA GPU. Check nvidia-smi and PyTorch installation.")

ModuleNotFoundError: No module named 'torch'

In [None]:
import os

# Change to QwT multimodal directory.
# Guarded so the cell can be re-run: calling os.chdir() a second time from
# inside the repository would fail with FileNotFoundError.
REPO_DIR = 'QwT-mm-RepQ-ViT'
if os.path.basename(os.getcwd()) != REPO_DIR:
    os.chdir(REPO_DIR)
print(f"Current directory: {os.getcwd()}")

# Verify essential files exist
essential_files = ['main.py', 'mmm_ptq.py', 'qwerty.py', 'zero_shot.py']
essential_dirs = ['models', 'quant', 'dataset', 'utils']

# Collect everything that is missing so the final verdict covers both the
# files and the directories (it previously looked at the files only).
missing_entries = []

print("\n📁 Checking files:")
for file_name in essential_files:
    found = os.path.isfile(file_name)
    if not found:
        missing_entries.append(file_name)
    print(f"{'✅' if found else '❌'} {file_name}")

print("\n📂 Checking directories:")
for dir_name in essential_dirs:
    found = os.path.isdir(dir_name)
    if not found:
        missing_entries.append(f"{dir_name}/")
    print(f"{'✅' if found else '❌'} {dir_name}/")

if missing_entries:
    print("⚠️ Missing files!")
    print(f"Missing: {', '.join(missing_entries)}")
else:
    print("\n✅ All files present!")

## Step 3: Install Dependencies (~5-10 minutes)

**What this does:** Installs all required Python packages:
- **PyTorch with CUDA** - Deep learning framework with GPU support
- **open_clip_torch** - OpenCLIP, an open-source implementation of OpenAI's CLIP
- **timm==0.4.12** - Timm vision models (specific version for compatibility)
- **webdataset==0.2.100** - Efficient large-scale dataset loading
- **ftfy, regex, tqdm** - Text processing and progress bars
- **termcolor, scipy** - Utilities for output and linear algebra

**Note:** The PyTorch install line is commented out by default. Uncomment it only if PyTorch with CUDA support is not already installed.

In [None]:
# Install PyTorch with CUDA (uncomment if needed)
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install CLIP and core dependencies
!pip install ftfy regex tqdm
!pip install open_clip_torch

# Install specific versions for compatibility
!pip install timm==0.4.12
!pip install webdataset==0.2.100

# Install utilities
!pip install termcolor scipy

print("\n‚úÖ All dependencies installed!")

## Step 4: Download CC3M Dataset for Calibration

**What this does:** Downloads CC3M (Conceptual Captions 3M) dataset in WebDataset format.

**Why CC3M specifically?**
- **Real image-text pairs:** 3.3M natural images with human-written captions
- **Better for CLIP:** Text encoder needs actual captions, not just class labels
- **WebDataset format:** Efficient streaming from .tar shards (no need to extract all files)

**What you need:**
- Download 1-2 CC3M shards (.tar files) for calibration (512 samples)
- Each shard is ~1GB and contains thousands of image-caption pairs

**Format:** `cc3m-train-{0000..0575}.tar` (576 total shards available)

In [None]:
# Download CC3M WebDataset shards
# Official instructions: https://github.com/rom1504/img2dataset/blob/main/dataset_examples/cc3m.md

!mkdir -p ~/cc3m

# Download first 2 shards (enough for calibration)
# Each shard contains ~5,000-6,000 image-text pairs
# We only need 512 samples for calibration, so 1-2 shards is plenty

# Option 1: Download from Hugging Face (if available)
!wget https://huggingface.co/datasets/conceptual_captions/cc3m-wds/resolve/main/cc3m-train-0000.tar -P ~/cc3m/
!wget https://huggingface.co/datasets/conceptual_captions/cc3m-wds/resolve/main/cc3m-train-0001.tar -P ~/cc3m/

# Option 2: If above doesn't work, download from img2dataset following their instructions
# See: https://github.com/rom1504/img2dataset

import os
import glob
shards = glob.glob(os.path.expanduser('~/cc3m/*.tar'))
print(f"\n‚úÖ CC3M shards downloaded!")
print(f"Shards: {len(shards)}")
print(f"Location: ~/cc3m/")
print(f"\nThese will be used for calibration (512 image-text pairs)")

## Step 5: Verify ImageNet Dataset

**What this does:** Checks that your ImageNet-1K validation set is properly organized.

**You already have ImageNet** - just verify the path and structure.

**Required structure:**
```
/path/to/imagenet/val/
  n01440764/
    ILSVRC2012_val_00000293.JPEG
    ...
  n01443537/
    ...
  ... (1000 class folders total)
```

**Update the path below** to match your ImageNet location.

In [None]:
# Verify ImageNet dataset location
# UPDATE THIS PATH to where your ImageNet validation set is located
IMAGENET_PATH = "~/imagenet/val"  # Change this to your actual path

import os

# Compared against the lower-cased file name so that ".JPEG", ".jpeg", ".JPG",
# ".jpg", ".PNG" and ".png" are all counted.
IMAGE_EXTENSIONS = ('.jpeg', '.jpg', '.png')


def count_imagenet_classes_and_images(root):
    """Count class folders and image files in an ImageNet-style directory.

    Args:
        root: Directory whose immediate sub-directories are class folders.

    Returns:
        A ``(num_classes, total_images)`` tuple. ``num_classes`` is the number
        of immediate sub-directories of ``root``. ``total_images`` is the
        number of entries directly inside those sub-directories whose name
        ends with one of ``IMAGE_EXTENSIONS`` (case-insensitive).

    Raises:
        OSError: If ``root`` (or a class folder) cannot be listed.
    """
    num_classes = 0
    total_images = 0
    # Single pass: each class folder is discovered and counted once.
    for entry in os.listdir(root):
        class_path = os.path.join(root, entry)
        if not os.path.isdir(class_path):
            continue  # stray files next to the class folders are ignored
        num_classes += 1
        total_images += sum(
            1 for name in os.listdir(class_path)
            if name.lower().endswith(IMAGE_EXTENSIONS)
        )
    return num_classes, total_images


imagenet_path = os.path.expanduser(IMAGENET_PATH)

# isdir (not exists): a path that points at a regular file must be reported as
# "not found" instead of crashing inside os.listdir().
if os.path.isdir(imagenet_path):
    num_classes, total_images = count_imagenet_classes_and_images(imagenet_path)

    print("✅ ImageNet validation set found!")
    print(f"Path: {imagenet_path}")
    print(f"Classes: {num_classes}")
    print(f"Total images: {total_images}")

    if num_classes == 1000 and total_images == 50000:
        print("\n✅ Complete ImageNet-1K validation set (1000 classes, 50K images)")
    elif num_classes == 1000:
        print(f"\n✅ ImageNet-1K with {num_classes} classes, {total_images} images")
    else:
        print(f"\n⚠️ Warning: Expected 1000 classes, found {num_classes}")
else:
    print(f"❌ ImageNet not found at: {imagenet_path}")
    print("Please update IMAGENET_PATH variable above to your actual ImageNet location")

## Step 6: Mode 1 - FP32 Baseline Evaluation

**What this does:** Zero-shot ImageNet classification in full 32-bit precision (no quantization).

**How it works:**
1. Loads CLIP ViT-B/32 (~150M params)
2. Encodes 1000 ImageNet class names as text embeddings
3. Compares image embeddings with class embeddings
4. Reports Top-1/Top-5 accuracy

**Dataset usage:**
- **Calibration:** Not needed for FP32 (only used in quantized modes)
- **Evaluation:** Your ImageNet-1K validation set (1000 classes, 50K images)

**Expected:** ~63.4% Top-1 accuracy

In [None]:
# ========== Vision-only W6/A6 WITHOUT QwT (Baseline) ==========
print("=" * 60)
print("RepQ-ViT Baseline (W6/A6, Vision-only)")
print("=" * 60)

!python main.py \
    --choice image_only \
    --model "ViT-B/32" \
    --imagenet-val ~/imagenet/val \
    --train-data "~/cc3m/cc3m-train-{0000..0001}.tar" \
    --dataset-type webdataset \
    --batch-size 128 \
    --iter 4 \
    --wq_params 6 \
    --aq_params 6

# Expected: ~59.2% Top-1 (4.2% drop from FP32)
# Calibration uses 512 samples from CC3M WebDataset shards

In [None]:
# Run FP32 baseline evaluation: zero-shot ImageNet classification with the
# unquantized model. No --train-data is passed because calibration data is
# only used by the quantized modes.
# NOTE(review): --imagenet-val is hard-coded; keep it in sync with IMAGENET_PATH
# if that was changed in the verification step.
!python main.py \
    --choice fp32_eval \
    --model "ViT-B/32" \
    --imagenet-val ~/imagenet/val \
    --batch-size 128

# Expected output:
# Top-1 accuracy: ~63.4%
# Top-5 accuracy: ~86.3%

In [None]:
# ========== Vision-only W6/A6 WITH QwT (Improved) ==========
print("=" * 60)
print("RepQ-ViT + QwT (W6/A6, Vision-only)")
print("=" * 60)

!python main.py \
    --choice image_only \
    --model "ViT-B/32" \
    --imagenet-val ~/imagenet/val \
    --train-data "~/cc3m/cc3m-train-{0000..0001}.tar" \
    --dataset-type webdataset \
    --batch-size 128 \
    --iter 4 \
    --wq_params 6 \
    --aq_params 6 \
    --qwerty

# Expected: ~60.3% Top-1 (+1.1% improvement over baseline!)
# QwT adds lightweight compensation layers using CC3M calibration data

## Step 8: Mode 3 - Full Quantization (Vision + Text Encoders)

**What this does:** Quantizes **BOTH vision and text encoders**. This is much more challenging but necessary for retrieval tasks or when text encoder runs frequently.

**Why it's harder:** Text transformers are extremely sensitive to quantization. Small errors in attention scores cause catastrophic accuracy collapse.

**Results show QwT's strength:**
- **Without QwT:** W6/A6 drops to 29.8% (-33.6% catastrophic! 🔴)
- **With QwT:** W6/A6 achieves 43.5% (-19.9% manageable) ✅ **+13.7% massive recovery!**

**How QwT helps:** Linear compensation layers specifically target sensitive layers in text encoder where quantization errors accumulate.

In [None]:
# ========== Vision+Text W6/A6 WITHOUT QwT (Shows the problem) ==========
print("=" * 60)
print("RepQ-ViT Baseline (W6/A6, Vision+Text)")
print("=" * 60)

!python main.py \
    --choice all_quant \
    --model "ViT-B/32" \
    --imagenet-val ~/imagenet/val \
    --train-data "~/cc3m/cc3m-train-{0000..0001}.tar" \
    --dataset-type webdataset \
    --batch-size 128 \
    --iter 4 \
    --wq_params 6 \
    --aq_params 6

# Expected: ~29.8% Top-1 (SEVERE 33.6% degradation!)
# Text encoder quantization causes massive accuracy collapse
# CC3M calibration data includes captions needed for text encoder

In [None]:
# ========== Vision+Text W6/A6 WITH QwT (Massive recovery!) ==========
print("=" * 60)
print("RepQ-ViT + QwT (W6/A6, Vision+Text)")
print("=" * 60)

!python main.py \
    --choice all_quant \
    --model "ViT-B/32" \
    --imagenet-val ~/imagenet/val \
    --train-data "~/cc3m/cc3m-train-{0000..0001}.tar" \
    --dataset-type webdataset \
    --batch-size 128 \
    --iter 4 \
    --wq_params 6 \
    --aq_params 6 \
    --qwerty

# Expected: ~43.5% Top-1 (+13.7% improvement! üöÄ)
# QwT's linear compensation rescues text encoder from quantization collapse
# This is where QwT really shines - recovering from severe degradation

## Step 9: Higher Precision Experiments (W8/A8)

**What this does:** Tests with 8-bit quantization (more bits = higher accuracy, less compression).

**Use case:** When you need accuracy closer to FP32 but still want faster inference and smaller model size.

**Expected results:**
- **Vision-only W8/A8 + QwT:** 63.0% (nearly matches FP32!)
- **Vision+Text W8/A8 + QwT:** 54.6% (still shows QwT's +15.9% improvement)

**Trade-off:** W8/A8 uses 2× more memory than W4/A4, but still 4× less than FP32

In [None]:
# ========== Vision-only W8/A8 with QwT ==========
print("=" * 60)
print("RepQ-ViT + QwT (W8/A8, Vision-only)")
print("=" * 60)

!python main.py \
    --choice image_only \
    --model "ViT-B/32" \
    --imagenet-val ~/imagenet/val \
    --train-data "~/cc3m/cc3m-train-{0000..0001}.tar" \
    --dataset-type webdataset \
    --batch-size 128 \
    --iter 4 \
    --wq_params 8 \
    --aq_params 8 \
    --qwerty

# Expected: ~63.0% Top-1 (only 0.4% below FP32!)
# W8/A8 is almost lossless for vision-only quantization

In [None]:
# ========== Vision+Text W8/A8 with QwT ==========
print("=" * 60)
print("RepQ-ViT + QwT (W8/A8, Vision+Text)")
print("=" * 60)

!python main.py \
    --choice all_quant \
    --model "ViT-B/32" \
    --imagenet-val ~/imagenet/val \
    --train-data "~/cc3m/cc3m-train-{0000..0001}.tar" \
    --dataset-type webdataset \
    --batch-size 128 \
    --iter 4 \
    --wq_params 8 \
    --aq_params 8 \
    --qwerty

# Expected: ~54.6% Top-1
# Even at W8/A8, text encoder still benefits from QwT (+15.9% vs baseline)

## Results Summary & Analysis

**Complete experimental results table:**

| Mode | Quantization | Method | Bits | Top-1 | Improvement |
|------|-------------|--------|------|-------|-------------|
| **Vision-only** | None | FP32 | 32/32 | 63.4% | - |
| Vision-only | PTQ | RepQ-ViT | 6/6 | 59.2% | - |
| Vision-only | PTQ + QwT | RepQ-ViT + QwT | 6/6 | **60.3%** | +1.1% |
| Vision-only | PTQ + QwT | RepQ-ViT + QwT | 8/8 | **63.0%** | +0.1% |
| **Vision+Text** | None | FP32 | 32/32 | 63.4% | - |
| Vision+Text | PTQ | RepQ-ViT | 6/6 | 29.8% | - |
| Vision+Text | PTQ + QwT | RepQ-ViT + QwT | 6/6 | **43.5%** | +13.7% 🚀 |
| Vision+Text | PTQ | RepQ-ViT | 8/8 | 38.7% | - |
| Vision+Text | PTQ + QwT | RepQ-ViT + QwT | 8/8 | **54.6%** | +15.9% 🚀 |

## Key Insights:

1. **Vision-only quantization is manageable** - RepQ-ViT baseline achieves reasonable accuracy even without QwT
2. **Text encoder quantization is catastrophic** - Without QwT, accuracy drops 30%+ (unusable)
3. **QwT shows massive gains for text encoders** - Recovers +13.7% at W6/A6, making it practical
4. **Linear compensation is lightweight** - QwT adds <2% parameters but recovers significant accuracy
5. **No training required** - Uses least-squares on just 512 calibration samples (no backprop!)
6. **Lower bits = bigger QwT impact** - W4/A4 benefits more than W8/A8 from compensation

## Why Text Encoder is Harder:
- **Self-attention sensitivity:** Small quantization errors in Q/K/V matrices compound across 12 layers
- **Narrow activation range:** Text tokens have less diversity than image patches
- **Long-range dependencies:** Text attention spans full sequence, amplifying errors

## Understanding the Implementation

### Code Architecture:

**1. `main.py`** - Entry point with argument parsing
- Three modes: `fp32_eval`, `image_only`, `all_quant`
- Loads model, datasets, calls MMM_PTQ class

**2. `mmm_ptq.py`** - Core quantization logic (MMM = MultiModal Model)
- `quantize_image_only_and_eval()` - Quantizes vision encoder only
- `quantize_all_models_and_eval()` - Quantizes both encoders
- `quant_ptq_1()` and `quant_ptq_2()` - Calibration routines
- `qwerty()` and `qwerty_2()` - QwT compensation generation

**3. `qwerty.py`** - QwT compensation layer implementation
- `generate_compensation_model()` - Creates linear W, b parameters
- Uses least-squares regression: solves `Ax = b` for compensation weights
- Minimizes error between FP32 and quantized activations

**4. `quant/quant_modules.py`** - Quantized layer implementations
- `QuantConv2d` - Quantized 2D convolution
- `QuantLinear` - Quantized fully-connected layer
- `QuantMatMul` - Quantized matrix multiplication (for attention)

**5. `quant/quantizer.py`** - Quantization functions
- `UniformQuantizer` - Symmetric/asymmetric uniform quantization
- Scale calculation: `scale = (max - min) / (2^n_bits - 1)`
- Zero-point for asymmetric quantization

**6. `dataset/dataset.py`** - Data loading
- `get_wds_dataset()` - WebDataset loader for CC3M
- `get_imagenet()` - ImageNet validation loader
- Handles image preprocessing and text tokenization

### QwT Algorithm Flow:

```python
# Step 1: Load pre-trained CLIP model
model, preprocess, tokenizer = load_model("ViT-B/32")

# Step 2: Wrap model with quantization layers
model = quant_model(model, weight_bits=6, activation_bits=6)

# Step 3: Calibration - collect activation statistics
for batch in calibration_dataloader:  # 512 samples
    with torch.no_grad():
        _ = model(batch)  # Forward pass updates quantizer stats

# Step 4: Scale reparameterization (RepQ-ViT technique)
scale_reparameterization(model)
# Folds the post-LayerNorm quantization scales into the LayerNorm affine parameters and the next layer's weights

# Step 5: Generate QwT compensation (if --qwerty flag set)
if args.qwerty:
    for block in model.blocks:
        # Collect FP32 vs quantized activations
        fp32_output = fp32_block(calibration_data)
        quant_output = quant_block(calibration_data)
        error = fp32_output - quant_output
        
        # Solve least-squares: W * quant_output + b ≈ fp32_output
        W, b = linear_regression(quant_output, fp32_output)
        
        # Attach compensation to next block
        block.compensation = CompensationBlock(W, b)

# Step 6: Evaluate on ImageNet
accuracy = zero_shot_classification(model, imagenet_val)
```

### What Makes QwT Lightweight:

- **No backpropagation:** Uses closed-form least-squares solution
- **Tiny parameters:** Only adds linear layers (W, b) ~2% overhead
- **Fast calibration:** 512 samples takes <1 minute
- **Post-training:** Works on any pre-trained model without retraining

## Citation

```bibtex
@InProceedings{Fu_2025_CVPR,
    author    = {Fu, Minghao and Yu, Hao and Shao, Jie and Zhou, Junjie and Zhu, Ke and Wu, Jianxin},
    title     = {Quantization without Tears},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    year      = {2025},
    pages     = {4462-4472}
}
```