# Library installation

In [23]:
# !pip install transformers accelerate optimum-quanto
# !pip install torch

# GPU Check

In [24]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Model Quantization 
(weights-only quantization with 4-bit integer precision)

In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
from optimum.quanto import QuantizedModelForCausalLM, qint4

model_id = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantization_config = QuantoConfig(weights="int4") # weight quantization
# quantization_config = QuantoConfig(activations="int4") # activation quantization

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B")
model.to(device)
quantized_model = QuantizedModelForCausalLM.quantize(model, weights=qint4, exclude='lm_head')

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.51it/s]


In [27]:
free, total = torch.cuda.mem_get_info(device)
mem_used_MB = (total - free) / 1024 ** 2
print(f"GPU memory utilization : {mem_used_MB/1024} GB")

GPU memory utilization : 13.676368713378906 GB


# Inference over quantized model

In [28]:
import time
t1 = time.time()

prompt = "What is multi-head attention in context of transformer?"

try:
    input_ids = tokenizer([prompt], return_tensors="pt")
    input_ids = input_ids.input_ids.to(quantized_model.device)  # Use model's device

    generated_tokens = quantized_model.generate(
        input_ids,
        max_length=50,
    )

    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

except RuntimeError as e:
    print(f"CUDA Error: {e}")
    print("Try moving model to CPU if GPU memory is insufficient")
    # Fallback to CPU if needed
    # quantized_model = quantized_model.cpu()
except Exception as e:
    print(f"Unexpected error: {e}")

t2 = time.time()
print(generated_text)
print(f"Took around {t2-t1} seconds.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


What is multi-head attention in context of transformer? [closed]
I am trying to understand the Transformer model and its attention mechanism. I have read many papers and articles on the Transformer but I am still not able to understand what is multi-head attention
Took around 11.860067129135132 seconds.


In [29]:
quantized_model.save_pretrained('./Llama-3-8B-quantized')