In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import psutil

In [2]:
def get_memory_usage():
    process = psutil.Process()
    mem_info = process.memory_info()
    return mem_info.rss / (1024 * 1024)

In [3]:
print(f"Memory usage before loading model: {get_memory_usage():.2f} MB")

# Load the model and tokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

print(f"Memory usage after loading model: {get_memory_usage():.2f} MB")

Memory usage before loading model: 355.29 MB


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Memory usage after loading model: 26090.16 MB


In [14]:
import torch
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_memory_usage():
    if torch.cuda.is_available():
        return torch.cuda.memory_allocated() / (1024 ** 2)  # Convert bytes to MB
    else:
        # For CPU-based memory usage (using the process size)
        import os, psutil
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / (1024 ** 2)  # Convert bytes to MB

# Check memory usage before loading the model
print(f"Memory usage before loading model: {get_memory_usage():.2f} MB")

# Load the model and tokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Check memory usage after loading the model
print(f"Memory usage after loading model: {get_memory_usage():.2f} MB")

# Check memory usage of model weights (GPU)
if torch.cuda.is_available():
    model_memory = sum(p.numel() for p in model.parameters() if p.requires_grad) * 4 / (1024 ** 2)  # 4 bytes for float32
    print(f"Model weights memory: {model_memory:.2f} MB (GPU)")
else:
    model_memory = sum(p.numel() for p in model.parameters() if p.requires_grad) * 4 / (1024 ** 2)  # 4 bytes for float32
    print(f"Model weights memory: {model_memory:.2f} MB (CPU)")


Memory usage before loading model: 0.00 MB


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Memory usage after loading model: 0.00 MB
Model weights memory: 25705.02 MB (GPU)


In [15]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [4]:
# Set the model to evaluation mode
model.eval()

# Define quantization configuration
qconfig = torch.quantization.get_default_qconfig('fbgemm')
qconfig_dict = {'': qconfig}

# Prepare the model for static quantization
model_prepared = torch.quantization.prepare(model, qconfig_dict)



In [5]:
# Calibration function
def calibrate(model_prepared, sample_inputs):
    with torch.no_grad():
        model_prepared(**sample_inputs)

In [16]:
# Prepare calibration data
input_text = "lion is the king of jungal,"
sample_inputs = tokenizer(input_text, return_tensors="pt")

# Perform calibration
print("Performing calibration...")
for _ in range(10):  # Run multiple calibration iterations
    calibrate(model, sample_inputs)

Performing calibration...


In [18]:
# Convert the model to quantized version
quantized_model = torch.quantization.convert(model_prepared)

print(f"Memory usage after static quantization: {get_memory_usage():.2f} MB")

Memory usage after static quantization: 0.00 MB


In [19]:
if torch.cuda.is_available():
    model_memory = sum(p.numel() for p in quantized_model.parameters() if p.requires_grad) * 4 / (1024 ** 2)  # 4 bytes for float32
    print(f"Model weights memory: {model_memory:.2f} MB (GPU)")
else:
    model_memory = sum(p.numel() for p in quantized_model.parameters() if p.requires_grad) * 4 / (1024 ** 2)  # 4 bytes for float32
    print(f"Model weights memory: {model_memory:.2f} MB (CPU)")

Model weights memory: 25705.02 MB (GPU)


In [17]:
# Test the quantized model
with torch.no_grad():
    outputs = model.generate(**sample_inputs, max_length=50)

# Decode the generated tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

lion is the king of jungal, but it is not the king of the world, the king of the world is human.
The Lion is the king of the jungle but the king of the world is man.



In [None]:
# Compare model sizes
def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb

print(f"Original model size: {get_model_size(model):.2f} MB")
print(f"Quantized model size: {get_model_size(quantized_model):.2f} MB")