In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Identifiers shared by the loading steps in this cell.
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
ADAPTER_DIR = "./paintings-audio-guide-final"

print("Loading with proper quantization config...")

# NOTE(review): the recorded run of this cell stops after the message above
# with "OSError: libcudart.so.11.0: cannot open shared object file" --
# presumably the 4-bit backend was built against a CUDA runtime that is not
# installed here; verify the environment before relying on this cell.

# 4-bit NF4 quantization with double quantization; compute runs in float16.
nf4_settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
quantization_config = BitsAndBytesConfig(**nf4_settings)

# Base model, quantized on load; device placement is delegated to "auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Tokenizer for the same checkpoint; fall back to EOS when no pad token is set.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Attach the trained adapter to the base model and switch to eval mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()

print("Model loaded successfully!")

# Test function
def test_model(title, artist, date, medium, dimensions="Not specified"):
    """Generate an audio-guide completion for a single painting.

    Formats the painting metadata into the "Title / Artist / Date / Medium /
    Dimensions / Audio guide:" prompt, samples up to 100 new tokens from the
    module-level ``model`` (temperature 0.7), and returns only the newly
    generated text.

    Args:
        title: Painting title inserted into the prompt.
        artist: Artist name inserted into the prompt.
        date: Date string inserted into the prompt.
        medium: Medium string inserted into the prompt.
        dimensions: Dimensions string; defaults to "Not specified".

    Returns:
        The generated text after the prompt, whitespace-stripped and cut at
        the first "<END>" marker if one is present.
    """
    prompt = f"""Title: {title}
Artist: {artist}
Date: {date}
Medium: {medium}
Dimensions: {dimensions}

Audio guide:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Remember how many tokens belong to the prompt so they can be dropped
    # from the generated sequence below.
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt. Slicing the decoded
    # string by len(prompt) is fragile: decoding the prompt tokens is not
    # guaranteed to reproduce the prompt text character-for-character, so the
    # cut could land at the wrong offset.
    generated_ids = outputs[0][prompt_token_count:]
    completion = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Keep only the text before the first "<END>" marker (no-op if absent).
    return completion.split("<END>", 1)[0].strip()

def _run_case(heading, fields, first=False):
    """Print a banner for one case, run test_model on it, print and return the text.

    ``fields`` is the positional argument tuple passed to ``test_model``.
    Every banner except the first is preceded by a blank line.
    """
    rule = "=" * 60
    print(rule if first else "\n" + rule)
    print(heading)
    print(rule)
    completion = test_model(*fields)
    print(completion)
    return completion


# Test 1: Girl with a Pearl Earring
result1 = _run_case(
    "TEST 1: Girl with a Pearl Earring",
    (
        "Girl with a Pearl Earring",
        "Johannes Vermeer",
        "c. 1665",
        "Oil on canvas",
        "17.5 × 15.6 in (44.5 × 39.4 cm)",
    ),
    first=True,
)

# Test 2: The Starry Night
result2 = _run_case(
    "TEST 2: The Starry Night",
    (
        "The Starry Night",
        "Vincent van Gogh",
        "1889",
        "Oil on canvas",
        "29 × 36 1/4 in (73.7 × 92.1 cm)",
    ),
)

# Test 3: From your training data
result3 = _run_case(
    "TEST 3: Dutch Girl in White (From Training)",
    (
        "Dutch Girl in White",
        "Robert Henri",
        "1907",
        "Oil on canvas",
        "24 x 20 in. (61 x 50.8 cm)",
    ),
)

Loading with proper quantization config...


OSError: libcudart.so.11.0: cannot open shared object file: No such file or directory

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Identifiers shared by the loading steps in this cell.
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
ADAPTER_DIR = "./paintings-audio-guide-final"
BYTES_PER_GIB = 1024**3

print("Loading without quantization...")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"PyTorch CUDA version: {torch.version.cuda}")

# Base model in float16 with no quantization; placement is delegated to "auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)

print("Base model loaded")

# Tokenizer for the same checkpoint; fall back to EOS when no pad token is set.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer loaded")

# NOTE(review): the recorded run of this cell fails at the adapter step with
# KeyError: 'base_model.model.model.lm_head', right after the loader reported
# that some parameters were offloaded to the cpu and disk. Presumably the
# offloaded base model is what trips adapter loading -- confirm.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()

print("Model with adapter loaded successfully!")

# Report current GPU memory usage in GiB (only when CUDA is present).
if torch.cuda.is_available():
    vram_used = torch.cuda.memory_allocated() / BYTES_PER_GIB
    vram_reserved = torch.cuda.memory_reserved() / BYTES_PER_GIB
    print(f"🎮 VRAM allocated: {vram_used:.2f} GB")
    print(f"🎮 VRAM reserved: {vram_reserved:.2f} GB")

# Test function
def test_model(title, artist, date, medium, dimensions="Not specified"):
    """Generate an audio-guide completion for a single painting.

    Formats the painting metadata into the "Title / Artist / Date / Medium /
    Dimensions / Audio guide:" prompt, samples up to 100 new tokens from the
    module-level ``model`` (temperature 0.7), and returns only the newly
    generated text.

    Args:
        title: Painting title inserted into the prompt.
        artist: Artist name inserted into the prompt.
        date: Date string inserted into the prompt.
        medium: Medium string inserted into the prompt.
        dimensions: Dimensions string; defaults to "Not specified".

    Returns:
        The generated text after the prompt, whitespace-stripped and cut at
        the first "<END>" marker if one is present.
    """
    prompt = f"""Title: {title}
Artist: {artist}
Date: {date}
Medium: {medium}
Dimensions: {dimensions}

Audio guide:"""

    inputs = tokenizer(prompt, return_tensors="pt")
    # Move the input tensors to the GPU when one is available.
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    # Remember how many tokens belong to the prompt so they can be dropped
    # from the generated sequence below.
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt. Slicing the decoded
    # string by len(prompt) is fragile: decoding the prompt tokens is not
    # guaranteed to reproduce the prompt text character-for-character, so the
    # cut could land at the wrong offset.
    generated_ids = outputs[0][prompt_token_count:]
    completion = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Keep only the text before the first "<END>" marker (no-op if absent).
    return completion.split("<END>", 1)[0].strip()

# Run the fine-tuned model on one painting and print the completion
# between horizontal rules.
banner = "=" * 60
print("\n" + banner, "TESTING YOUR FINE-TUNED MODEL", banner, sep="\n")

result = test_model(
    title="Girl with a Pearl Earring",
    artist="Johannes Vermeer",
    date="c. 1665",
    medium="Oil on canvas",
    dimensions="17.5 × 15.6 in (44.5 × 39.4 cm)",
)

print("Generated audio guide:", result, banner, sep="\n")

Loading without quantization...
CUDA available: True
PyTorch CUDA version: 12.6


Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.30s/it]
Some parameters are on the meta device because they were offloaded to the cpu and disk.


Base model loaded
Tokenizer loaded


KeyError: 'base_model.model.model.lm_head'

: 