# In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# Load the 4-bit quantized TinyLlama base model and attach the adapter stored
# under ./outputs/tinyllama-lora. Steps run one after another so that each
# component is loaded separately.

# Checkpoint locations, named once so the tokenizer and the base model are
# guaranteed to come from the same checkpoint id.
BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER_DIR = "./outputs/tinyllama-lora"

# Step 1 -- ask PyTorch to release cached GPU memory before anything is loaded.
torch.cuda.empty_cache()

# Step 2 -- quantization settings: 4-bit "nf4" weights with float16 compute.
# Double quantization is enabled to save additional memory.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Step 3 -- tokenizer; the end-of-sequence token is reused as the pad token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

# Step 4 -- quantized base model. device_map="auto" leaves device placement to
# the library; low_cpu_mem_usage=True is critical for memory efficiency.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
)

# Step 5 -- wrap the base model with the adapter weights, loaded separately.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR, device_map="auto")

# Step 6 -- merging is intentionally skipped for now because it uses extra
# memory. If a merged model is needed, rebind `model` to the result of
# model.merge_and_unload().