In [None]:
!pip install torch transformers datasets accelerate

In [60]:
import torch
import torch.nn as nn
from torch.quantization import quantize_dynamic
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time
import numpy as np

In [61]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [69]:
# Calculate model size (simple function)
def get_model_size_mb(model):
    total_params = sum(p.numel() for p in model.parameters())
    # Assume 4 bytes per parameter (float32)
    size_mb = (total_params * 4) / (1024 * 1024)
    return size_mb

original_size = get_model_size_mb(model)

In [63]:
test_texts = [
    "This movie is amazing!",
    "I hate this film.",
    "It's okay I guess."
]

inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

In [64]:
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [65]:
# Warmup run
with torch.no_grad():
    _ = model(**inputs)

In [70]:
times = []
for i in range(5):  # 5 runs for stability
    start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
        original_predictions = torch.softmax(outputs.logits, dim=-1)
    end = time.time()
    times.append(end - start)

original_time = np.mean(times)
time_std = np.std(times)

print(f"  Avg inference time: {original_time:.4f} ¬± {time_std:.4f} seconds")
print(f"  Sample prediction: {original_predictions[0].numpy()}")
print(f"  Model size: {original_size:.2f} MB")

  Avg inference time: 0.0616 ¬± 0.0073 seconds
  Sample prediction: [1.18231525e-04 9.99881744e-01]
  Model size: 255.41 MB


In [71]:
quantized_model = quantize_dynamic(
    model,              # Your original model
    {nn.Linear},        # Quantize Linear layers only (safe)
    dtype=torch.qint8   # Use INT8
)

In [73]:
quantized_size = get_model_size_mb(quantized_model)
compression_ratio = original_size / quantized_size

print(f"  Original:   {original_size:.2f} MB")
print(f"  Quantized:  {quantized_size:.2f} MB")
print(f"  Compression: {compression_ratio:.2f}x smaller")
print(f"  Reduction:  {((original_size - quantized_size) / original_size * 100):.1f}%")

  Original:   255.41 MB
  Quantized:  91.00 MB
  Compression: 2.81x smaller
  Reduction:  64.4%


In [74]:
quantized_model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (out_lin): DynamicQuantizedLinear(in_features=768, out_featur

In [75]:
# Warmup run
with torch.no_grad():
    _ = quantized_model(**inputs)

In [76]:
quantized_times = []
for i in range(5):
    start = time.time()
    with torch.no_grad():
        outputs = quantized_model(**inputs)
        quantized_predictions = torch.softmax(outputs.logits, dim=-1)
    end = time.time()
    quantized_times.append(end - start)

quantized_time = np.mean(quantized_times)
quantized_std = np.std(quantized_times)

print(f"  Avg inference time: {quantized_time:.4f} ¬± {quantized_std:.4f} seconds")
print(f"  Sample prediction: {quantized_predictions[0].numpy()}")
print(f"  Model size: {quantized_size:.2f} MB")

  Avg inference time: 0.0335 ¬± 0.0021 seconds
  Sample prediction: [1.3081332e-04 9.9986923e-01]
  Model size: 91.00 MB


In [77]:
# Performance Comparison
print("\n" + "=" * 60)
print("PERFORMANCE COMPARISON")
print("=" * 60)

# Speed comparison
speedup = original_time / quantized_time
print(f"üöÄ Speed Results:")
print(f"  Original time:    {original_time:.4f} seconds")
print(f"  Quantized time:   {quantized_time:.4f} seconds")
print(f"  Speedup:          {speedup:.2f}x faster")

# Size comparison
print(f"\nüì¶ Size Results:")
print(f"  Original size:    {original_size:.2f} MB")
print(f"  Quantized size:   {quantized_size:.2f} MB")
print(f"  Compression:      {compression_ratio:.2f}x smaller")

# Accuracy comparison
accuracy_diff = np.mean(np.abs(original_predictions.numpy() - quantized_predictions.numpy()))
print(f"\nüéØ Accuracy Results:")
print(f"  Prediction difference: {accuracy_diff:.6f}")

if accuracy_diff < 0.001:
    print(f"  Status: ‚úÖ Excellent! Virtually no accuracy loss")
elif accuracy_diff < 0.01:
    print(f"  Status: ‚úÖ Very good! Minimal accuracy loss")
elif accuracy_diff < 0.05:
    print(f"  Status: ‚úÖ Good! Acceptable accuracy loss")
else:
    print(f"  Status: ‚ö†Ô∏è Moderate accuracy loss - check if acceptable")


PERFORMANCE COMPARISON
üöÄ Speed Results:
  Original time:    0.0616 seconds
  Quantized time:   0.0335 seconds
  Speedup:          1.84x faster

üì¶ Size Results:
  Original size:    255.41 MB
  Quantized size:   91.00 MB
  Compression:      2.81x smaller

üéØ Accuracy Results:
  Prediction difference: 0.000035
  Status: ‚úÖ Excellent! Virtually no accuracy loss


In [78]:
# Detailed prediction comparison
print("\n" + "=" * 60)
print("DETAILED PREDICTION COMPARISON")
print("=" * 60)

orig_preds = original_predictions.numpy()
quant_preds = quantized_predictions.numpy()

for i, text in enumerate(test_texts):
    print(f"\nText {i+1}: '{text}'")
    print(f"  Original:  [{orig_preds[i][0]:.6f}, {orig_preds[i][1]:.6f}]")
    print(f"  Quantized: [{quant_preds[i][0]:.6f}, {quant_preds[i][1]:.6f}]")

    # Check if predictions match
    orig_class = "Positive" if orig_preds[i][1] > 0.5 else "Negative"
    quant_class = "Positive" if quant_preds[i][1] > 0.5 else "Negative"

    if orig_class == quant_class:
        print(f"  Result: ‚úÖ {orig_class} (predictions match)")
    else:
        print(f"  Result: ‚ö†Ô∏è Mismatch! Original: {orig_class}, Quantized: {quant_class}")

    # Calculate confidence difference
    conf_diff = abs(orig_preds[i][1] - quant_preds[i][1])
    print(f"  Confidence difference: {conf_diff:.6f}")


DETAILED PREDICTION COMPARISON

Text 1: 'This movie is amazing!'
  Original:  [0.000118, 0.999882]
  Quantized: [0.000131, 0.999869]
  Result: ‚úÖ Positive (predictions match)
  Confidence difference: 0.000013

Text 2: 'I hate this film.'
  Original:  [0.999687, 0.000313]
  Quantized: [0.999680, 0.000320]
  Result: ‚úÖ Negative (predictions match)
  Confidence difference: 0.000007

Text 3: 'It's okay I guess.'
  Original:  [0.000229, 0.999771]
  Quantized: [0.000314, 0.999686]
  Result: ‚úÖ Positive (predictions match)
  Confidence difference: 0.000085
