# Install Dependencies

In [1]:
import torch
import time
import json
import nltk
import os
import platform
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)

# Define fixed test cases
test_prompts = [
    {"id": 1, "prompt": "Explain how neural networks learn.", "max_new_tokens": 80},
    {"id": 2, "prompt": "Write a haiku about machine learning.", "max_new_tokens": 60},
    {"id": 3, "prompt": "What is quantization in deep learning?", "max_new_tokens": 100}
]

def get_system_specs():
    """Get system specifications"""
    specs = {
        "os": f"{platform.system()} {platform.release()}",
        "processor": platform.processor(),
        "cpu_cores_physical": psutil.cpu_count(logical=False),
        "cpu_cores_logical": psutil.cpu_count(logical=True),
        "ram_total_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        "ram_available_gb": round(psutil.virtual_memory().available / (1024**3), 2),
        "python_version": platform.python_version(),
        "pytorch_version": torch.__version__,
        "cuda_available": torch.cuda.is_available(),
        "cuda_version": torch.version.cuda if torch.cuda.is_available() else None,
        "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None
    }
    return specs

def get_model_size_mb(model):
    """Calculate model size in MB"""
    param_size = 0
    buffer_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    size_mb = (param_size + buffer_size) / (1024**2)
    return round(size_mb, 2)

def get_model_params(model):
    """Get total parameters count"""
    return sum(p.numel() for p in model.parameters())

# Get and display system specs
system_specs = get_system_specs()
print("üñ•Ô∏è SYSTEM SPECIFICATIONS")
print("="*50)
for key, value in system_specs.items():
    print(f"   {key}: {value}")

üñ•Ô∏è SYSTEM SPECIFICATIONS
   os: Windows 10
   processor: AMD64 Family 23 Model 17 Stepping 0, AuthenticAMD
   cpu_cores_physical: 4
   cpu_cores_logical: 8
   ram_total_gb: 13.67
   ram_available_gb: 1.57
   python_version: 3.12.4
   pytorch_version: 2.5.1+cpu
   cuda_available: False
   cuda_version: None
   gpu_name: None


In [3]:
test_prompts

[{'id': 1,
  'prompt': 'Explain how neural networks learn.',
  'max_new_tokens': 80},
 {'id': 2,
  'prompt': 'Write a haiku about machine learning.',
  'max_new_tokens': 60},
 {'id': 3,
  'prompt': 'What is quantization in deep learning?',
  'max_new_tokens': 100}]

**Load Tokenizer & FP16 Model CPU Only**

In [3]:
model_id = "Qwen/Qwen2-1.5B-Instruct"

print("‚è≥ Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print("‚è≥ Loading FP16 model on CPU (this will take 2-5 mins)...")
# Load model in float16 but place on CPU to avoid OOM
model_fp16 = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},  # Force CPU
    low_cpu_mem_usage=True
)
print("‚úÖ FP16 model loaded on CPU.")

# Get FP16 model size
fp16_size_mb = get_model_size_mb(model_fp16)
fp16_params = get_model_params(model_fp16)
print(f"\nüìè FP16 Model Size: {fp16_size_mb} MB")
print(f"üìä Total Parameters: {fp16_params:,}")

‚è≥ Loading tokenizer...
‚è≥ Loading FP16 model on CPU (this will take 2-5 mins)...
‚úÖ FP16 model loaded on CPU.

üìè FP16 Model Size: 2944.41 MB
üìä Total Parameters: 1,543,714,304


**Function to generate text and measure time**

In [4]:
def generate_text(model, tokenizer, prompt, max_new_tokens=100, device="cpu"):
    # Format prompt for Mistral
    formatted_prompt = f"[INST] {prompt} [/INST]"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    start = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
    latency = time.time() - start

    generated = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    num_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]

    return {
        "text": generated.strip(),
        "latency": round(latency, 3),
        "tokens": num_tokens,
        "tokens_per_sec": round(num_tokens / latency, 2)
    }

**FP16 (baseline) inference**

In [5]:
print("üîµ Running FP16 (baseline) inference on CPU...")
baseline_results = []

for case in test_prompts:
    print(f"  Prompt {case['id']}: '{case['prompt']}'")
    result = generate_text(model_fp16, tokenizer, case["prompt"], case["max_new_tokens"], device="cpu")
    result["id"] = case["id"]
    baseline_results.append(result)
    print(f"    ‚Üí {result['latency']}s | {result['tokens_per_sec']} tok/s")

# Delete FP16 model to free RAM
del model_fp16
torch.cuda.empty_cache()

üîµ Running FP16 (baseline) inference on CPU...
  Prompt 1: 'Explain how neural networks learn.'
    ‚Üí 438.287s | 0.18 tok/s
  Prompt 2: 'Write a haiku about machine learning.'
    ‚Üí 345.274s | 0.17 tok/s
  Prompt 3: 'What is quantization in deep learning?'
    ‚Üí 537.283s | 0.19 tok/s


In [6]:
print("‚öôÔ∏è Applying INT8 dynamic quantization (PyTorch native)...")

# Since bitsandbytes requires CUDA on Windows and CUDA is not available,
# we'll use PyTorch's native dynamic quantization for CPU

# Reload the model for quantization (since we deleted fp16 model)
model_for_quant = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # Load in FP32 for quantization
    device_map={"": "cpu"},
    low_cpu_mem_usage=True
)

# Get FP32 model size before quantization
fp32_size_mb = get_model_size_mb(model_for_quant)
print(f"üìè FP32 Model Size (before quantization): {fp32_size_mb} MB")

# Apply dynamic quantization to Linear layers
model_int8 = torch.quantization.quantize_dynamic(
    model_for_quant,
    {torch.nn.Linear},  # Quantize only Linear layers
    dtype=torch.qint8
)

# Get INT8 model size after quantization
int8_size_mb = get_model_size_mb(model_int8)
int8_params = get_model_params(model_int8)
compression_ratio = fp32_size_mb / int8_size_mb if int8_size_mb > 0 else 0

print(f"üìè INT8 Model Size (after quantization): {int8_size_mb} MB")
print(f"üìâ Compression Ratio: {compression_ratio:.2f}x")
print(f"üíæ Memory Saved: {fp32_size_mb - int8_size_mb:.2f} MB")

# Clean up the original model
del model_for_quant
torch.cuda.empty_cache()

print("‚úÖ INT8 dynamically quantized model ready.")

‚öôÔ∏è Applying INT8 dynamic quantization (PyTorch native)...
üìè FP32 Model Size (before quantization): 5888.81 MB
üìè INT8 Model Size (after quantization): 890.59 MB
üìâ Compression Ratio: 6.61x
üíæ Memory Saved: 4998.22 MB
‚úÖ INT8 dynamically quantized model ready.


In [7]:
device = "cpu"  # INT8 quantization runs on CPU
print(f"üü¢ Running INT8 inference on {device.upper()}...")

quant_results = []
for case in test_prompts:
    print(f"  Prompt {case['id']}: '{case['prompt']}'")
    result = generate_text(model_int8, tokenizer, case["prompt"], case["max_new_tokens"], device=device)
    result["id"] = case["id"]
    quant_results.append(result)
    print(f"    ‚Üí {result['latency']}s | {result['tokens_per_sec']} tok/s")

üü¢ Running INT8 inference on CPU...
  Prompt 1: 'Explain how neural networks learn.'
    ‚Üí 19.3s | 4.15 tok/s
  Prompt 2: 'Write a haiku about machine learning.'
    ‚Üí 14.906s | 4.03 tok/s
  Prompt 3: 'What is quantization in deep learning?'
    ‚Üí 26.314s | 3.8 tok/s


In [8]:
# Save the quantized model
save_path = "Qwen2-1.5B-Instruct-INT8"

print(f"üíæ Saving quantized model to '{save_path}'...")

# Create directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the quantized model state dict
torch.save(model_int8.state_dict(), os.path.join(save_path, "pytorch_model_int8.bin"))

# Save the tokenizer
tokenizer.save_pretrained(save_path)

# Save model config
model_int8.config.save_pretrained(save_path)

# Calculate saved model file size
saved_model_path = os.path.join(save_path, "pytorch_model_int8.bin")
saved_size_mb = os.path.getsize(saved_model_path) / (1024**2)

print(f"‚úÖ Quantized model saved!")
print(f"üìÇ Location: {os.path.abspath(save_path)}")
print(f"üìè Saved model file size: {saved_size_mb:.2f} MB")

üíæ Saving quantized model to 'Qwen2-1.5B-Instruct-INT8'...
‚úÖ Quantized model saved!
üìÇ Location: c:\Users\LENOVO\Documents\Assignment test\Qwen2-1.5B-Instruct-INT8
üìè Saved model file size: 2363.14 MB


In [9]:
# Use the complete save function to add model card and loader script
model_info = save_quantized_model_complete(
    model=model_int8,
    tokenizer=tokenizer,
    save_path="Qwen2-1.5B-Instruct-INT8",
    model_id=model_id,
    system_specs=system_specs
)

print("\n" + "="*60)
print("üìã MODEL CARD SUMMARY")
print("="*60)
for key, value in model_info.items():
    if isinstance(value, dict):
        print(f"\n{key}:")
        for k, v in value.items():
            print(f"   {k}: {v}")
    else:
        print(f"{key}: {value}")

üíæ Saving quantized model to 'Qwen2-1.5B-Instruct-INT8'...
   ‚úÖ Model weights saved
   ‚úÖ Tokenizer saved
   ‚úÖ Model config saved
   ‚úÖ Model card saved
   ‚úÖ Loader script saved

‚úÖ Model completely saved to: c:\Users\LENOVO\Documents\Assignment test\Qwen2-1.5B-Instruct-INT8
üìè Total model size: 2363.14 MB
üçì Raspberry Pi compatible: ‚úÖ Yes

üìã MODEL CARD SUMMARY
model_id: Qwen/Qwen2-1.5B-Instruct
quantization: PyTorch Dynamic INT8
model_file: pytorch_model_int8.bin
model_size_mb: 2363.14

minimum_requirements:
   ram_gb: 4.6
   storage_gb: 3.3
   python_version: 3.8+
   pytorch_version: 1.9+

compatible_devices:
   raspberry_pi_5_8gb: True
   raspberry_pi_4_8gb: True
   raspberry_pi_4_4gb: False
   jetson_nano: True
   orange_pi_5: True
   desktop_8gb_ram: True

usage_instructions:
   load_command: torch.load('pytorch_model_int8.bin')
   inference_device: cpu
   expected_tokens_per_sec: 1-5 on Raspberry Pi, 5-15 on desktop

created_on:
   os: Windows 10
   processor:

# üçì Device Compatibility & Requirements

## Minimum Hardware Requirements for INT8 Quantized Model

| Device | RAM Required | Storage | CPU | Recommended |
|--------|-------------|---------|-----|-------------|
| **Raspberry Pi 5** | 8GB | 4GB+ | ARM Cortex-A76 | ‚úÖ Yes |
| **Raspberry Pi 4** | 8GB | 4GB+ | ARM Cortex-A72 | ‚ö†Ô∏è Slow but works |
| **Raspberry Pi 4** | 4GB | 4GB+ | ARM Cortex-A72 | ‚ùå Not enough RAM |
| **Jetson Nano** | 4GB | 4GB+ | ARM + GPU | ‚ö†Ô∏è Use GPU instead |
| **Orange Pi 5** | 8GB+ | 4GB+ | RK3588 | ‚úÖ Yes |
| **Desktop/Laptop** | 8GB+ | 4GB+ | x86_64 | ‚úÖ Yes |

## Model Size Summary
- **FP32 (Original)**: ~6 GB RAM needed
- **FP16 (Half Precision)**: ~3 GB RAM needed  
- **INT8 (Quantized)**: ~1.5-2 GB RAM needed
- **INT4 (Further Quantized)**: ~0.8-1 GB RAM needed

## For Raspberry Pi Deployment
```bash
# Install dependencies on Raspberry Pi
pip install torch --index-url https://download.pytorch.org/whl/cpu
pip install transformers accelerate
```

In [2]:
def save_quantized_model_complete(model, tokenizer, save_path, model_id, system_specs=None):
    """
    Complete function to save a quantized model with all necessary files.
    
    Args:
        model: The quantized PyTorch model
        tokenizer: The tokenizer
        save_path: Directory path to save the model
        model_id: Original model identifier
        system_specs: Optional system specifications dict
    
    Returns:
        dict: Information about the saved model
    """
    import os
    import json
    import torch
    
    print(f"üíæ Saving quantized model to '{save_path}'...")
    
    # Create directory
    os.makedirs(save_path, exist_ok=True)
    
    # 1. Save model weights
    model_file = os.path.join(save_path, "pytorch_model_int8.bin")
    torch.save(model.state_dict(), model_file)
    print(f"   ‚úÖ Model weights saved")
    
    # 2. Save tokenizer
    tokenizer.save_pretrained(save_path)
    print(f"   ‚úÖ Tokenizer saved")
    
    # 3. Save model config
    model.config.save_pretrained(save_path)
    print(f"   ‚úÖ Model config saved")
    
    # 4. Calculate sizes
    model_size_mb = os.path.getsize(model_file) / (1024**2)
    
    # 5. Create model card with device requirements
    model_card = {
        "model_id": model_id,
        "quantization": "PyTorch Dynamic INT8",
        "model_file": "pytorch_model_int8.bin",
        "model_size_mb": round(model_size_mb, 2),
        "minimum_requirements": {
            "ram_gb": max(2, round(model_size_mb / 1024 * 2, 1)),  # 2x model size for safety
            "storage_gb": round(model_size_mb / 1024 + 1, 1),  # Model + overhead
            "python_version": "3.8+",
            "pytorch_version": "1.9+"
        },
        "compatible_devices": {
            "raspberry_pi_5_8gb": True,
            "raspberry_pi_4_8gb": True,
            "raspberry_pi_4_4gb": model_size_mb < 2000,  # Only if model < 2GB
            "jetson_nano": True,
            "orange_pi_5": True,
            "desktop_8gb_ram": True
        },
        "usage_instructions": {
            "load_command": "torch.load('pytorch_model_int8.bin')",
            "inference_device": "cpu",
            "expected_tokens_per_sec": "1-5 on Raspberry Pi, 5-15 on desktop"
        }
    }
    
    if system_specs:
        model_card["created_on"] = system_specs
    
    # Save model card
    card_path = os.path.join(save_path, "model_card.json")
    with open(card_path, "w", encoding="utf-8") as f:
        json.dump(model_card, f, indent=2, ensure_ascii=False)
    print(f"   ‚úÖ Model card saved")
    
    # 6. Create a simple loader script
    loader_script = '''"""
Loader script for INT8 quantized model
Run on Raspberry Pi or any CPU device
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "."  # Current directory
MODEL_ID = "{model_id}"

def load_quantized_model():
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    
    # Load base model architecture
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map={{"": "cpu"}},
        low_cpu_mem_usage=True
    )
    
    # Apply quantization structure
    model = torch.quantization.quantize_dynamic(
        model, {{torch.nn.Linear}}, dtype=torch.qint8
    )
    
    # Load saved weights
    state_dict = torch.load("pytorch_model_int8.bin", map_location="cpu")
    model.load_state_dict(state_dict)
    
    return model, tokenizer

def generate(model, tokenizer, prompt, max_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=max_tokens,
            do_sample=True, temperature=0.7, top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

if __name__ == "__main__":
    print("Loading quantized model...")
    model, tokenizer = load_quantized_model()
    print("Model loaded! Enter 'quit' to exit.")
    
    while True:
        prompt = input("You: ")
        if prompt.lower() == 'quit':
            break
        response = generate(model, tokenizer, prompt)
        print(f"AI: {{response}}")
'''.format(model_id=model_id)
    
    loader_path = os.path.join(save_path, "load_and_chat.py")
    with open(loader_path, "w", encoding="utf-8") as f:
        f.write(loader_script)
    print(f"   ‚úÖ Loader script saved")
    
    print(f"\n‚úÖ Model completely saved to: {os.path.abspath(save_path)}")
    print(f"üìè Total model size: {model_size_mb:.2f} MB")
    print(f"üçì Raspberry Pi compatible: {'‚úÖ Yes' if model_size_mb < 3000 else '‚ö†Ô∏è May be slow'}")
    
    return model_card

# Display the function is ready
print("‚úÖ Function 'save_quantized_model_complete' defined and ready to use!")
print("\nUsage:")
print("  model_info = save_quantized_model_complete(model_int8, tokenizer, 'my_model_path', model_id)")
print("\nThis function saves:")
print("  - pytorch_model_int8.bin (model weights)")
print("  - tokenizer files")
print("  - config.json")
print("  - model_card.json (with device compatibility info)")
print("  - load_and_chat.py (ready-to-use script)")

‚úÖ Function 'save_quantized_model_complete' defined and ready to use!

Usage:
  model_info = save_quantized_model_complete(model_int8, tokenizer, 'my_model_path', model_id)

This function saves:
  - pytorch_model_int8.bin (model weights)
  - tokenizer files
  - config.json
  - model_card.json (with device compatibility info)
  - load_and_chat.py (ready-to-use script)


In [11]:
def bigram_similarity(text1, text2):
    """
    Calculate bigram (2-gram) cosine similarity between two texts.
    
    Why bigram similarity can be low:
    - LLMs generate text stochastically (using sampling with temperature/top_p)
    - Even with same prompt, different runs produce different word sequences
    - Bigram similarity compares consecutive word pairs, not semantic meaning
    - Small vocabulary overlap = low similarity even if meaning is similar
    - Different phrasing of same concept = low bigram overlap
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', min_df=1)
    try:
        X = vectorizer.fit_transform([text1, text2])
        sim = cosine_similarity(X[0], X[1]).item()
        return round(sim, 3)
    except:
        return 0.0

print("\n" + "="*80)
print("üìä QUANTIZATION COMPARISON REPORT")
print("="*80)


all_results = []
for base, quant in zip(baseline_results, quant_results):
    sim = bigram_similarity(base["text"], quant["text"])
    speedup = base["latency"] / quant["latency"] if quant["latency"] > 0 else 0

    print(f"\n{'='*80}")
    print(f"üî∏ Prompt {base['id']}: {test_prompts[base['id']-1]['prompt']}")
    print(f"{'='*80}")
    
    print(f"\nüìù FP16 Response:")
    print(f"   {base['text'][:500]}{'...' if len(base['text']) > 500 else ''}")
    print(f"   ‚è±Ô∏è Latency: {base['latency']}s | Tokens/sec: {base['tokens_per_sec']}")
    
    print(f"\nüìù INT8 Response:")
    print(f"   {quant['text'][:500]}{'...' if len(quant['text']) > 500 else ''}")
    print(f"   ‚è±Ô∏è Latency: {quant['latency']}s | Tokens/sec: {quant['tokens_per_sec']}")
    
    print(f"\nüìä Metrics:")
    print(f"   Speedup: {speedup:.2f}x | Bigram Similarity: {sim}")

    all_results.append({
        "id": base["id"],
        "prompt": test_prompts[base["id"]-1]["prompt"],
        "fp16_response": base["text"],
        "int8_response": quant["text"],
        "fp16_latency": base["latency"],
        "fp16_tokens_per_sec": base["tokens_per_sec"],
        "int8_latency": quant["latency"],
        "int8_tokens_per_sec": quant["tokens_per_sec"],
        "bigram_similarity": sim,
        "speedup": round(speedup, 2)
    })


üìä QUANTIZATION COMPARISON REPORT

üî∏ Prompt 1: Explain how neural networks learn.

üìù FP16 Response:
   Neural networks, also known as artificial neural networks (ANNs), are a type of machine learning algorithm that are inspired by the structure and function of the human brain. They consist of interconnected nodes or units called neurons that process input data through multiple layers of computation.

When training an ANN, it learns to make predictions based on patterns in the training data. This is achieved through a process called backpropagation
   ‚è±Ô∏è Latency: 438.287s | Tokens/sec: 0.18

üìù INT8 Response:
   The process of learning in neural networks involves the use of backpropagation and optimization algorithms such as stochastic gradient descent to minimize the error between predicted outputs and actual outputs.
Neural networks are trained using a set of labeled examples, where each example consists of input data and its corresponding output label. During training,

In [12]:
report = {
    "model": "Qwen/Qwen2-1.5B-Instruct",
    "quantization_method": "PyTorch Dynamic INT8 Quantization",
    "system_specifications": system_specs,
    "model_sizes": {
        "fp16_size_mb": fp16_size_mb,
        "fp32_size_mb": fp32_size_mb,
        "int8_size_mb": int8_size_mb,
        "compression_ratio": round(compression_ratio, 2),
        "memory_saved_mb": round(fp32_size_mb - int8_size_mb, 2),
        "total_parameters": fp16_params,
        "saved_model_path": os.path.abspath(save_path),
        "saved_model_file_size_mb": round(saved_size_mb, 2)
    },
    "device_fp16": "CPU",
    "device_int8": "CPU",
    "note_on_bigram_similarity": (
        "Bigram similarity may be low because LLM generation is stochastic. "
        "Each inference run uses sampling (temperature, top_p) which produces different word sequences. "
        "Bigram similarity measures consecutive word-pair overlap, not semantic meaning. "
        "Two responses can convey the same information with completely different phrasing, "
        "resulting in low bigram similarity but high semantic equivalence."
    ),
    "test_cases": all_results
}

with open("llm_quantization_report.json", "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2, ensure_ascii=False)

print("\n‚úÖ Full report saved to 'llm_quantization_report.json'")
print("\nüìã Report includes:")
print("   - System specifications (OS, CPU, RAM, GPU)")
print("   - Model sizes (FP16, FP32, INT8) and compression ratio")
print("   - Saved quantized model location")
print("   - Full generated responses for FP16 and INT8")
print("   - Latency and throughput metrics")
print("   - Bigram similarity scores with explanation")


‚úÖ Full report saved to 'llm_quantization_report.json'

üìã Report includes:
   - System specifications (OS, CPU, RAM, GPU)
   - Model sizes (FP16, FP32, INT8) and compression ratio
   - Saved quantized model location
   - Full generated responses for FP16 and INT8
   - Latency and throughput metrics
   - Bigram similarity scores with explanation
