# DeepCrunch Quantization Demo 🚀

Welcome to DeepCrunch! This notebook demonstrates model quantization for different model types.

**What you'll learn:**
1. How to quantize different model types (FC, CNN, LSTM)
2. Compare original vs quantized models
3. Measure size reduction and speedup
4. Validate accuracy preservation

## Setup

In [None]:
import torch
import torch.nn as nn
import time
import numpy as np
from deepcrunch.backend.backend_registry import BackendRegistry

# Confirm the environment: report the installed PyTorch version and that the
# DeepCrunch import above succeeded.
print(f"PyTorch version: {torch.__version__}")
print("DeepCrunch loaded successfully! ✓")

## Example 1: Simple Fully Connected Model

Let's start with a basic fully connected network.

In [None]:
# Define a simple model
class SimpleModel(nn.Module):
    """Fully connected network mapping 100 input features to 10 outputs.

    Layout: Linear(100, 256) -> ReLU -> Linear(256, 128) -> ReLU -> Linear(128, 10).
    """

    def __init__(self):
        super().__init__()
        # First hidden stage
        self.fc1 = nn.Linear(100, 256)
        self.relu1 = nn.ReLU()
        # Second hidden stage
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        # Output projection (no activation applied afterwards)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Apply the two hidden stages and return the output of the final linear layer."""
        hidden = self.fc1(x)
        hidden = self.relu1(hidden)
        hidden = self.fc2(hidden)
        hidden = self.relu2(hidden)
        logits = self.fc3(hidden)
        return logits

# Instantiate the network and switch it to inference mode
# (eval() returns the module itself)
model = SimpleModel().eval()

# Collect the parameters once, then derive the element count and the
# in-memory footprint of the weights in megabytes
weights = list(model.parameters())
params = sum(w.numel() for w in weights)
total_bytes = sum(w.numel() * w.element_size() for w in weights)
size_mb = total_bytes / (1024 * 1024)

print(f"Model parameters: {params:,}")
print(f"Model size: {size_mb:.2f} MB")

In [None]:
# One random sample with 100 features (batch size 1)
sample_input = torch.randn((1, 100))

# Reference output of the unquantized model, computed without autograd
with torch.no_grad():
    original_output = model(sample_input)

# Report the shapes and the first five output values
for report_line in (
    f"Input shape: {sample_input.shape}",
    f"Output shape: {original_output.shape}",
    f"Sample output: {original_output[0, :5]}",
):
    print(report_line)

### Quantize the Model

In [None]:
# Quantize with DeepCrunch: fetch the "torch" backend, attach the model to it,
# and request dynamic quantization with the qint8 dtype.
backend = BackendRegistry.get_backend("torch")
backend.model = model

quantized_model = backend.quantize(
    type="dynamic",
    dtype="qint8",
)

print("✓ Quantization completed!")

### Compare Results

In [None]:
def _serialized_size_mb(module):
    """Return the size in MB of ``module.state_dict()`` as written by ``torch.save``."""
    import io  # local import keeps this cell self-contained

    buffer = io.BytesIO()
    torch.save(module.state_dict(), buffer)
    return buffer.getbuffer().nbytes / (1024 * 1024)


# Quantized model inference
with torch.no_grad():
    quantized_output = quantized_model(sample_input)

# Calculate sizes from the serialized state dict rather than by summing
# ``parameters()``: quantized layers typically keep their weights in packed
# state that is not registered as parameters, so a parameter sum can report
# the quantized model as (almost) empty and the reduction as ~100%.
original_size = _serialized_size_mb(model)
quantized_size = _serialized_size_mb(quantized_model)

# Mean absolute difference between the float and quantized outputs
diff = torch.abs(original_output - quantized_output).mean().item()

print("\n" + "="*60)
print("RESULTS")
print("="*60)
print(f"Original model size:  {original_size:.2f} MB")
print(f"Quantized model size: {quantized_size:.2f} MB")
print(f"Size reduction:       {(1 - quantized_size/original_size) * 100:.1f}%")
print(f"\nOutput difference:    {diff:.6f}")
print(f"Outputs close:        {torch.allclose(original_output, quantized_output, rtol=0.1)}")
print("="*60)

### Benchmark Speed

In [None]:
def benchmark(model, input_tensor, iterations=1000, warmup=10):
    """Measure the average latency of ``model(input_tensor)``.

    Args:
        model: Callable invoked as ``model(input_tensor)``.
        input_tensor: Input passed unchanged to every call.
        iterations: Number of timed calls; must be at least 1.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Average wall-clock time per call, in milliseconds.

    Raises:
        ValueError: If ``iterations`` is less than 1.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    with torch.no_grad():
        # Warmup: untimed calls so one-off setup costs do not skew the result
        for _ in range(warmup):
            model(input_tensor)

        # perf_counter() is the high-resolution clock intended for measuring
        # short durations; time.time() can jump when the system clock changes.
        start = time.perf_counter()
        for _ in range(iterations):
            model(input_tensor)
        elapsed = time.perf_counter() - start

    # Seconds for all calls -> milliseconds per call
    return elapsed / iterations * 1000

# Time the float model first, then the quantized one, on the same input
original_time, quantized_time = (
    benchmark(candidate, sample_input) for candidate in (model, quantized_model)
)

# Per-inference latency of each model and the resulting speedup factor
print(f"Original model:  {original_time:.4f} ms/inference")
print(f"Quantized model: {quantized_time:.4f} ms/inference")
print(f"Speedup:         {original_time/quantized_time:.2f}x")

## Example 2: CNN Model

Now let's try quantizing a convolutional neural network.

In [None]:
class SimpleCNN(nn.Module):
    """Small CNN for 3-channel 32x32 images producing 10 outputs.

    Two 3x3 convolutions (padding 1) are each followed by ReLU and 2x2 max
    pooling, so a 32x32 input reaches the classifier as 64 feature maps of
    8x8 -- which is what the ``64 * 8 * 8`` input width of ``fc`` assumes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)  # shared by both stages; halves H and W
        self.fc = nn.Linear(64 * 8 * 8, 10)

    def forward(self, x):
        """Return the output of ``fc`` for a batch of images ``x``."""
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        # flatten() copies when the activations cannot be viewed as
        # (batch, features) -- e.g. channels_last inputs -- where .view()
        # would raise instead.
        x = torch.flatten(x, start_dim=1)
        return self.fc(x)

# Build the CNN in inference mode (eval() returns the module itself)
cnn_model = SimpleCNN().eval()

# A single random 32x32 RGB image as the sample input
cnn_input = torch.randn((1, 3, 32, 32))

print("CNN model created")
print(f"Input shape: {cnn_input.shape}")
print(f"Parameters: {sum(p.numel() for p in cnn_model.parameters()):,}")

In [None]:
# Quantize CNN with static quantization
def calibration_data():
    """Lazily yield ten calibration batches.

    Each item is a one-element list holding a random tensor of shape
    (4, 3, 32, 32).
    """
    batch_shape = (4, 3, 32, 32)
    num_batches = 10
    yield from ([torch.randn(batch_shape)] for _ in range(num_batches))

backend.model = cnn_model

# The whole flow stays inside one try block on purpose: this is a demo, and
# any failure in quantization or in running the quantized model is reported
# as a short note instead of aborting the notebook.
try:
    quantized_cnn = backend.quantize(
        type="static",
        calibration_data=calibration_data(),
    )
    print("✓ CNN quantization completed!")

    # Compare the predicted class of the float and quantized models
    with torch.no_grad():
        orig_out = cnn_model(cnn_input)
        quant_out = quantized_cnn(cnn_input)

    orig_pred = orig_out.argmax(dim=1).item()
    quant_pred = quant_out.argmax(dim=1).item()

    print(f"Original prediction: {orig_pred}")
    print(f"Quantized prediction: {quant_pred}")
    # Compare the extracted ints so a plain True/False is printed rather than
    # the repr of a one-element tensor.
    print(f"Predictions match: {orig_pred == quant_pred}")

except Exception as e:
    # Only the first 100 characters of the message are shown to keep the note short
    print(f"Note: Static quantization requires specific setup: {str(e)[:100]}")

## Example 3: LSTM Model

Finally, let's quantize an LSTM model for sequence processing.

In [None]:
class SimpleLSTM(nn.Module):
    """Two-layer LSTM (50 -> 128, batch-first) followed by a Linear(128, 10) head."""

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=50,
            hidden_size=128,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(128, 10)

    def forward(self, x):
        """Run the LSTM over ``x`` and apply ``fc`` to the output at the last time step."""
        sequence_out, _state = self.lstm(x)
        final_step = sequence_out[:, -1]
        return self.fc(final_step)

# Build the LSTM in inference mode (eval() returns the module itself)
lstm_model = SimpleLSTM().eval()

# One random sequence: batch_size=1, seq_length=20, features=50
lstm_input = torch.randn((1, 20, 50))

print("LSTM model created")
print(f"Input shape: {lstm_input.shape}")
print(f"Parameters: {sum(p.numel() for p in lstm_model.parameters()):,}")

In [None]:
def _serialized_size_mb(module):
    """Return the size in MB of ``module.state_dict()`` as written by ``torch.save``."""
    import io  # local import keeps this cell self-contained

    buffer = io.BytesIO()
    torch.save(module.state_dict(), buffer)
    return buffer.getbuffer().nbytes / (1024 * 1024)


# Quantize LSTM: attach it to the backend and request dynamic qint8 quantization
backend.model = lstm_model

quantized_lstm = backend.quantize(
    type="dynamic",
    dtype="qint8",
)

print("✓ LSTM quantization completed!")

# Test: run both models on the same input
with torch.no_grad():
    orig_out = lstm_model(lstm_input)
    quant_out = quantized_lstm(lstm_input)

# Calculate sizes from the serialized state dict rather than by summing
# ``parameters()``: quantized layers typically keep their weights in packed
# state that is not registered as parameters, so a parameter sum can report
# the quantized model as (almost) empty and the reduction as ~100%.
original_size = _serialized_size_mb(lstm_model)
quantized_size = _serialized_size_mb(quantized_lstm)

print(f"\nOriginal size:  {original_size:.2f} MB")
print(f"Quantized size: {quantized_size:.2f} MB")
print(f"Size reduction: {(1 - quantized_size/original_size) * 100:.1f}%")

# Benchmark (fewer iterations than the default)
original_time = benchmark(lstm_model, lstm_input, iterations=100)
quantized_time = benchmark(quantized_lstm, lstm_input, iterations=100)

print(f"\nOriginal time:  {original_time:.4f} ms")
print(f"Quantized time: {quantized_time:.4f} ms")
print(f"Speedup:        {original_time/quantized_time:.2f}x")

## Summary

Congratulations! You've learned how to:

- ✅ Quantize fully connected networks
- ✅ Quantize CNNs with static quantization
- ✅ Quantize LSTMs with dynamic quantization
- ✅ Measure size reduction and speedup
- ✅ Validate accuracy preservation

### Next Steps

1. Try quantizing your own models
2. Experiment with different quantization methods
3. Check out more examples in `examples/` directory
4. Read the documentation for advanced features

### Key Takeaways

- **Dynamic quantization** is easiest and works great for LSTMs/Transformers
- **Static quantization** provides best performance for CNNs
- Typical results: **50-75% smaller**, **2-3x faster**
- Minimal accuracy loss (< 1% typically)

Happy compressing! 🚀