In [None]:
# Install the required libraries
!pip install torch
!pip install transformers
!pip install optimum

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.int8 import INCQuantizer

# Hugging Face model id
model_id = "philschmid/llama-2-7b-instruction-generator"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model in float32 precision (since we're using CPU)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32
).to('cpu')

# Initialize the quantizer
quantizer = INCQuantizer.from_pretrained(model_id)

# Quantize the model to int8 (This will work on CPU)
quantized_model = quantizer.quantize(model, dtype=torch.qint8)

# Save the quantized model to disk
save_folder = "quantized_llama"
quantized_model.save_pretrained(save_folder)

# Save the tokenizer to the same folder
tokenizer.save_pretrained(save_folder)

In [None]:
# Test the quantized model

# Loaded (model, tokenizer) pairs keyed by directory, so repeated calls do not
# re-read the checkpoint from disk. Re-running this cell resets the cache;
# entries are not refreshed if the folder's contents change afterwards.
QUANTIZED_MODEL_CACHE = {}


def _load_quantized(model_dir):
    """Return the ``(model, tokenizer)`` pair for ``model_dir``, loading it at most once."""
    if model_dir not in QUANTIZED_MODEL_CACHE:
        # Function-scope import: this loader class is only needed here and is
        # not part of the notebook's top-level imports.
        from optimum.intel import INCModelForCausalLM

        QUANTIZED_MODEL_CACHE[model_dir] = (
            # INC-quantized checkpoints must be restored through the INC model
            # class; AutoModelForCausalLM with torch_dtype=torch.qint8 is not a
            # supported way to load them.
            INCModelForCausalLM.from_pretrained(model_dir),
            AutoTokenizer.from_pretrained(model_dir),
        )
    return QUANTIZED_MODEL_CACHE[model_dir]


def generate_text(prompt, max_length=50, model_dir=None):
    """Generate a continuation of ``prompt`` with the quantized model.

    Args:
        prompt: Text to feed to the model.
        max_length: Forwarded to ``generate`` as its ``max_length`` argument.
        model_dir: Folder holding the quantized model and tokenizer. When
            ``None`` (the default), the notebook-level ``save_folder`` is used.

    Returns:
        The decoded first generated sequence, with special tokens stripped.
    """
    if model_dir is None:
        model_dir = save_folder

    quantized_model, tokenizer = _load_quantized(model_dir)

    # Tokenize the prompt (tensors are created on the CPU by default)
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate text
    outputs = quantized_model.generate(**inputs, max_length=max_length)

    # Decode and return the generated text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Smoke test: run one short prompt through the quantized model and show the result
prompt = "The future of AI is"
generated_text = generate_text(prompt=prompt)
print("Generated Text:", generated_text)