
## Fine-Tuning LLM for Museum Audio Guide Generation
## Step-by-step notebook for training a model to generate engaging audio guides for paintings

#### Step 4: Prepare Training Data
##### We'll convert our JSON training examples into the proper format for fine-tuning


In [1]:
import json
import os
from datasets import Dataset
from transformers import AutoTokenizer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(" Loading training data...")
with open('raw-data/training_examples_bulk.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

print(f" Loaded {len(training_data)} training examples")


 Loading training data...
 Loaded 938 training examples


In [3]:
# Let's examine our data structure
print("\n Sample training example:")
sample = training_data[0]
print("PROMPT:")
print(sample['prompt'])
print("\nCOMPLETION:")  
print(sample['completion'])


 Sample training example:
PROMPT:
Title: Self-Portrait with a Straw Hat (obverse: The Potato Peeler)
Artist: Vincent van Gogh
Date: 1887
Medium: Oil on canvas
Dimensions: 16 x 12 1/2 in. (40.6 x 31.8 cm)

Audio guide:

COMPLETION:
 Meet Van Gogh in his Paris self-portrait from 1887, measuring just 16 by 12 inches. Here, he experiments with Impressionist techniques, using short, vibrant brushstrokes and a lighter palette than his later works. Notice the confident gaze and the casual straw hat—symbols of his artistic evolution. The reverse side reveals The Potato Peeler, showing Van Gogh's resourceful use of canvas during his financially constrained years. <END>


In [4]:
# Convert to DataFrame for easier manipulation
df = pd.DataFrame(training_data)
print(f"Data shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Check for any issues in the data
print("Data quality check:")
print(f"- Examples with empty prompts: {df['prompt'].isin(['', None]).sum()}")
print(f"- Examples with empty completions: {df['completion'].isin(['', None]).sum()}")
print(f"- Average prompt length: {df['prompt'].str.len().mean():.0f} characters")
print(f"- Average completion length: {df['completion'].str.len().mean():.0f} characters")


Data shape: (938, 2)
Columns: ['prompt', 'completion']
Data quality check:
- Examples with empty prompts: 0
- Examples with empty completions: 0
- Average prompt length: 229 characters
- Average completion length: 387 characters


In [5]:
from sklearn.model_selection import train_test_split

print("\nSplitting data into train/validation sets...")

# 90% train, 10% validation
train_data, val_data = train_test_split(
    training_data, 
    test_size=0.1, 
    random_state=42
)

print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")

# Save splits for reference
os.makedirs('processed-data', exist_ok=True)

with open('processed-data/train_data.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, indent=2, ensure_ascii=False)
    
with open('processed-data/val_data.json', 'w', encoding='utf-8') as f:
    json.dump(val_data, f, indent=2, ensure_ascii=False)

print("Saved train/validation splits to processed-data/")


Splitting data into train/validation sets...
Training examples: 844
Validation examples: 94
Saved train/validation splits to processed-data/


In [6]:
# Load Base Model
print("Loading base model and tokenizer...")

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig
)
import torch

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # Faster downloads
os.environ["BITSANDBYTES_NOWELCOME"] = "1"
os.environ["BNB_CUDA_VERSION"] = "121"

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

print("Attempting optimized model loading...")

# Configure 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)
print("4-bit quantization config:")
print(f"- Quantization type: NF4 (normalized float 4-bit)")
print(f"- Double quantization: Enabled") 
print(f"- Compute dtype: float16")

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)


Loading base model and tokenizer...
Attempting optimized model loading...
4-bit quantization config:
- Quantization type: NF4 (normalized float 4-bit)
- Double quantization: Enabled
- Compute dtype: float16


This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=

Loading checkpoint shards: 100%|██████████| 3/3 [00:17<00:00,  5.80s/it]


In [7]:
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print(f"Loaded tokenizer for {MODEL_NAME}")
    
    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print("Added padding token")
        
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    print(f"EOS token: '{tokenizer.eos_token}'")
    
except Exception as e:
    print(f"Error loading tokenizer: {e}")

Loaded tokenizer for mistralai/Mistral-7B-Instruct-v0.3
Added padding token
Vocabulary size: 32768
EOS token: '</s>'


In [8]:
# Step 7: Format Data for Training
# Convert our prompt/completion format to the format expected by the model

def format_training_example(example):
    """Format a single training example for causal language modeling"""
    
    # Combine prompt and completion into single text
    # This is the format for causal LM training
    full_text = example['prompt'] + example['completion']
    
    return {
        'text': full_text,
        'prompt': example['prompt'],
        'completion': example['completion']
    }

print("\nFormatting data for training...")

# Format training and validation data
formatted_train = [format_training_example(ex) for ex in train_data]
formatted_val = [format_training_example(ex) for ex in val_data]

print(f"Formatted {len(formatted_train)} training examples")
print(f"Formatted {len(formatted_val)} validation examples")

# Create Hugging Face datasets
train_dataset = Dataset.from_list(formatted_train)
val_dataset = Dataset.from_list(formatted_val)

print(f"\nCreated datasets:")
print(f"- Training: {len(train_dataset)} examples")
print(f"- Validation: {len(val_dataset)} examples")

# Show a formatted example
print(f"\nSample formatted text (first 200 chars):")
print(repr(formatted_train[0]['text'][:200]))


Formatting data for training...
Formatted 844 training examples
Formatted 94 validation examples

Created datasets:
- Training: 844 examples
- Validation: 94 examples

Sample formatted text (first 200 chars):
'Title: "Isfandiyar\'s Third Course: He Slays a Dragon", Folio 434v from the Shahnama (Book of Kings) of Shah Tahmasp\nArtist: Abu\'l Qasim Firdausi\nDate: ca. 1530\nMedium: Opaque watercolor, ink, silver, '


In [9]:
# Step 8: Tokenize the Data (Fixed Version)
# Convert text to tokens that the model can understand

def tokenize_function(examples):
    """Tokenize the text data with proper prompt masking"""
    
    # Tokenize the full text (prompt + completion)
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',  
        max_length=512,
        return_tensors=None
    )
    
    # Tokenize just the prompts (to find where completion starts)
    prompt_tokenized = tokenizer(
        examples['prompt'],
        truncation=True,
        padding=False,  # Don't pad prompts
        max_length=512,
        add_special_tokens=False,  # Don't add extra tokens
        return_tensors=None
    )
    
    # Create labels with prompt masking
    labels = []
    for i in range(len(tokenized['input_ids'])):
        # Get the full sequence and prompt length
        full_ids = tokenized['input_ids'][i]
        prompt_length = len(prompt_tokenized['input_ids'][i])
        
        # Create label sequence
        label_ids = full_ids.copy()
        
        # Mask prompt tokens (set to -100 so they're ignored in loss)
        for j in range(min(prompt_length, len(label_ids))):
            label_ids[j] = -100
        
        labels.append(label_ids)
    
    tokenized['labels'] = labels
    return tokenized

print("\nTokenizing datasets...")

# Tokenize in batches for efficiency
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_val = val_dataset.map(
    tokenize_function, 
    batched=True,
    remove_columns=val_dataset.column_names
)

print(f"Tokenized training data: {len(tokenized_train)} examples")
print(f"Tokenized validation data: {len(tokenized_val)} examples")

# Verify tokenization worked
print("\n Tokenization verification:")
sample = tokenized_train[0]
print(f"Input length: {len(sample['input_ids'])}")
print(f"Label length: {len(sample['labels'])}")
print(f"Masked tokens (prompt): {sum(1 for x in sample['labels'] if x == -100)}")
print(f"Training tokens (completion): {sum(1 for x in sample['labels'] if x != -100 and x != tokenizer.pad_token_id)}")


Tokenizing datasets...


Map: 100%|██████████| 844/844 [00:00<00:00, 2954.44 examples/s]
Map: 100%|██████████| 94/94 [00:00<00:00, 4110.31 examples/s]

Tokenized training data: 844 examples
Tokenized validation data: 94 examples

 Tokenization verification:
Input length: 512
Label length: 512
Masked tokens (prompt): 198
Training tokens (completion): 314





In [10]:
# Step 9: Configure PEFT/LoRA for Efficient Fine-tuning
# LoRA (Low-Rank Adaptation) allows us to fine-tune efficiently by only training small adapter layers

from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import torch

print("\nConfiguring LoRA (Low-Rank Adaptation)...")

model = prepare_model_for_kbit_training(model)


# LoRA Configuration  
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ]
)

print("LoRA Configuration:")
print(f"- Rank (r): {lora_config.r}")
print(f"- Alpha: {lora_config.lora_alpha}")
print(f"- Dropout: {lora_config.lora_dropout}")
print(f"- Target modules: {lora_config.target_modules}")

# Apply LoRA to the model
print("\nApplying LoRA adapters to model...")
model = get_peft_model(model, lora_config)

# Enable gradient computation for LoRA parameters
model.enable_input_require_grads()

# Check trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
all_params = sum(p.numel() for p in model.parameters())

print(f"Parameter Statistics:")
print(f"- Trainable parameters: {trainable_params:,}")
print(f"- Total parameters: {all_params:,}")
print(f"- Trainable percentage: {100 * trainable_params / all_params:.2f}%")

# Check memory usage after LoRA
if torch.cuda.is_available():
    vram_used = torch.cuda.memory_allocated() / 1024**3
    print(f"VRAM after LoRA: {vram_used:.2f} GB")


Configuring LoRA (Low-Rank Adaptation)...
LoRA Configuration:
- Rank (r): 16
- Alpha: 32
- Dropout: 0.1
- Target modules: {'o_proj', 'k_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'v_proj'}

Applying LoRA adapters to model...
Parameter Statistics:
- Trainable parameters: 41,943,040
- Total parameters: 3,800,305,664
- Trainable percentage: 1.10%
VRAM after LoRA: 4.51 GB


In [11]:
# Step 10: Configure Training Parameters
# Set up training arguments for our fine-tuning process

from transformers import TrainingArguments

print("\nConfiguring training parameters...")

# Create output directory
output_dir = "./trained-models/fine-tuned-mistral-paintings"
os.makedirs(output_dir, exist_ok=True)

# Training arguments optimized for 12GB VRAM
training_args = TrainingArguments(
    # Output and logging
    output_dir=output_dir,
    run_name="mistral-paintings-audio-guide",
    
    # Training schedule
    num_train_epochs=3,                    # Number of complete passes through data
    max_steps=-1,                          # -1 means use num_train_epochs instead
    
    # Batch sizes (adjust based on VRAM)
    per_device_train_batch_size=2,         # Batch size per GPU for training
    per_device_eval_batch_size=2,          # Batch size per GPU for evaluation
    gradient_accumulation_steps=4,         # Effective batch size = 2 * 4 = 8
    
    # Learning rate and optimization
    learning_rate=2e-4,                    # Learning rate for LoRA (higher than full fine-tuning)
    weight_decay=0.01,                     # L2 regularization
    warmup_steps=50,                       # Learning rate warmup
    
    # Evaluation and saving
    eval_strategy="steps",           # Evaluate every N steps
    eval_steps=100,                        # Evaluate every 100 steps
    save_strategy="steps",                 # Save every N steps
    save_steps=200,                        # Save every 200 steps
    save_total_limit=3,                    # Keep only 3 checkpoints
    
    # Optimization settings
    dataloader_drop_last=True,             # Drop incomplete batches
    fp16=True,                             # Use mixed precision training
    gradient_checkpointing=True,           # Save memory by recomputing activations
    
    # Logging
    logging_steps=25,                      # Log every 25 steps
    logging_dir=f"{output_dir}/logs",      # TensorBoard logs
    report_to=None,                        # Disable wandb/tensorboard for now
    
    # Efficiency settings
    load_best_model_at_end=True,          # Load best checkpoint at end
    metric_for_best_model="eval_loss",     # Use validation loss as metric
)

print("Training configuration:")
print(f"- Epochs: {training_args.num_train_epochs}")
print(f"- Batch size per device: {training_args.per_device_train_batch_size}")
print(f"- Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"- Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"- Learning rate: {training_args.learning_rate}")
print(f"- Mixed precision: {training_args.fp16}")



Configuring training parameters...
Training configuration:
- Epochs: 3
- Batch size per device: 2
- Gradient accumulation: 4
- Effective batch size: 8
- Learning rate: 0.0002
- Mixed precision: True


In [12]:
# Step 11: Set up Data Collator
# Prepare data batching for training

from transformers import DataCollatorForLanguageModeling

print("\nSetting up data collator...")

# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
    pad_to_multiple_of=8,  # Pad to multiple of 8 for efficiency
)

print("Data collator configured for causal language modeling")

## Step 12: Initialize Trainer
# Set up the training loop

from transformers import Trainer

print("\nInitializing trainer...")

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer, 
)

print(" Trainer initialized successfully!")

# Print training summary
print(f"\n Training Summary:")
print(f"- Model: {MODEL_NAME} with LoRA")
print(f"- Training examples: {len(tokenized_train)}")
print(f"- Validation examples: {len(tokenized_val)}")
print(f"- Trainable parameters: {trainable_params:,}")
print(f"- Estimated VRAM usage: ~7-8GB")
print(f"- Output directory: {output_dir}")

print("\nReady to start training!")
print("Next: Run trainer.train() to begin fine-tuning")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Setting up data collator...
Data collator configured for causal language modeling

Initializing trainer...
 Trainer initialized successfully!

 Training Summary:
- Model: mistralai/Mistral-7B-Instruct-v0.3 with LoRA
- Training examples: 844
- Validation examples: 94
- Trainable parameters: 41,943,040
- Estimated VRAM usage: ~7-8GB
- Output directory: ./trained-models/fine-tuned-mistral-paintings

Ready to start training!
Next: Run trainer.train() to begin fine-tuning


In [13]:
# Step 13: Start Training

print("Starting training...")

trainer.train()

trainer.save_model()  
model.save_pretrained(output_dir)  
tokenizer.save_pretrained(output_dir) 

print(f"Complete model saved to {output_dir}")

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
100,0.3588,0.374975
200,0.2296,0.303825
300,0.1244,0.269072


Complete model saved to ./trained-models/fine-tuned-mistral-paintings


In [14]:
## Test with the model still in memory from training

print("Testing the trained model (still in memory)...")

# Use the model that was just trained (should still be in memory)
# Make sure we're in eval mode
model.eval()

# Test prompt
test_prompt = """Title: Girl with a Pearl Earring
Artist: Johannes Vermeer
Date: c. 1665
Medium: Oil on canvas
Dimensions: 17.5 × 15.6 in (44.5 × 39.4 cm)

Audio guide:"""

# Tokenize input
inputs = tokenizer(test_prompt, return_tensors="pt")

# Move to same device as model
if torch.cuda.is_available():
    inputs = {k: v.cuda() for k, v in inputs.items()}

# Generate
print("Generating audio guide...")
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=80,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

# Decode result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
completion = generated_text[len(test_prompt):].strip()

print("\n" + "="*50)
print("GENERATED AUDIO GUIDE:")
print("="*50)
print(completion)
print("="*50)

Testing the trained model (still in memory)...
Generating audio guide...

GENERATED AUDIO GUIDE:
Discover Girl with a Pearl Earring, where Johannes Vermeer's refined technique shines through this 17.5 × 15.6 in (44.5 × 39.4 cm) Oil on canvas. Created in c. 1665, the color harmony demonstrates skill Notice the careful attention to detail reflecting the artistic values


In [None]:
## Save model properly for future use

print("Saving model properly...")

# Save the adapter only (recommended approach)
model.save_pretrained("./trained-models/paintings-audio-guide-final")

# Save tokenizer
tokenizer.save_pretrained("./trained-models/paintings-audio-guide-final")

print("Model saved to ./trained-models/paintings-audio-guide-final")

Saving model properly...
Model saved to ./paintings-audio-guide-final
