# D&D Summarization LoRA Training - Simple Case
# Training Llama 3.1 1B on a few transcript/summary pairs

In [1]:
import json
import torch
from pathlib import Path
import pandas as pd
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import numpy as np
from rouge_score import rouge_scorer
import dotenv
import os

# Load environment variables
dotenv.load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

# Check GPU availability


In [2]:
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

CUDA available: True
GPU: NVIDIA GeForce RTX 5090
VRAM: 34.2 GB


## Step 1: Load and Prepare Data


In [3]:
def load_training_data(summaries_dir="../data/summaries", transcripts_dir="../data/combined_transcripts"):
    """Load the 14 training pairs from your summary JSON files."""
    print(f"Loading training data from {summaries_dir}")
    
    summaries_path = Path(summaries_dir)
    # print top 5 files in the directory
    print("Files in summaries directory:")
    for file in sorted(summaries_path.glob("chunk_*_summary.json"))[:5]:
        print(f" - {file.name}")

    training_pairs = []
    
    # Load all summary files
    for json_file in sorted(summaries_path.glob("chunk_*_summary.json")):
        print(f"Processing {json_file.name}")
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Extract the combined transcript (this will be our input)
            # You'll need to load the corresponding simple.json file for the transcript
            chunk_num = data['chunk_number']
            simple_file = summaries_path.parent / transcripts_dir / f"chunk_{chunk_num:02d}_simple.json"
            
            if simple_file.exists():
                with open(simple_file, 'r', encoding='utf-8') as f:
                    chunk_data = json.load(f)
                
                training_pair = {
                    'chunk_id': f"chunk_{chunk_num:02d}",
                    'input_text': chunk_data['combined_transcript'],
                    'target_summary': data['summary'],
                    'duration_minutes': data['duration_minutes'],
                    'word_count': data['word_count']
                }
                training_pairs.append(training_pair)            
                
        except Exception as e:
            print(f"Error loading {json_file}: {e}")

    print(f"Loaded {len(training_pairs)} training pairs")
    return training_pairs

In [4]:
# Load your data
training_data = load_training_data(transcripts_dir="../data/combined_transcripts_20min")

Loading training data from ../data/summaries
Files in summaries directory:
 - chunk_01_summary.json
 - chunk_02_summary.json
 - chunk_03_summary.json
 - chunk_04_summary.json
 - chunk_05_summary.json
Processing chunk_01_summary.json
Processing chunk_02_summary.json
Processing chunk_03_summary.json
Processing chunk_04_summary.json
Processing chunk_05_summary.json
Processing chunk_06_summary.json
Processing chunk_07_summary.json
Processing chunk_08_summary.json
Processing chunk_09_summary.json
Processing chunk_10_summary.json
Processing chunk_11_summary.json
Processing chunk_12_summary.json
Processing chunk_13_summary.json
Processing chunk_14_summary.json
Loaded 14 training pairs


In [5]:
# Display first example

example = training_data[0]
print(f"\n📋 Example Training Pair:")
print(f"Chunk: {example['chunk_id']}")
print(f"Duration: {example['duration_minutes']:.1f} minutes")
print(f"Input length: {len(example['input_text'])} chars")
print(f"Summary length: {len(example['target_summary'])} chars")
print(f"\nInput preview: {example['input_text'][:200]}...")
print(f"\nTarget summary preview: {example['target_summary'][:200]}...")


📋 Example Training Pair:
Chunk: chunk_01
Duration: 20.0 minutes
Input length: 1191 chars
Summary length: 3161 chars

Input preview: \n\n=== File 1: Critical Role plays Daggerheart ｜ Live One-Shot ｜ Open Beta_chunk_0_300_seconds (t=0.0s) ===\n[0.0s - 300.0s] \n\n\n=== File 2: Critical Role plays Daggerheart ｜ Live One-Shot ｜ Open B...

Target summary preview: In this thrilling 20-minute session of Critical Role's Daggerheart one-shot, the party embarked on an exhilarating adventure filled with mystery and magic. As they celebrated their nine-year anniversa...


## Step 2: Setup Model and Tokenizer

In [6]:
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"  # or "meta-llama/Llama-3.2-1B"
# MODEL_NAME = "microsoft/DialoGPT-medium" 
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")

print(f"\n🤖 Loading model: {MODEL_NAME}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=hf_token 
)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token


🤖 Loading model: meta-llama/Llama-3.2-1B-Instruct


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
# # Load model in 4-bit for memory efficiency (optional, remove if you have enough VRAM)
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,  # Remove this line if you want full precision
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    use_auth_token=hf_token  # Use your Hugging Face token for private models
)

print(f"Model loaded on: {model.device}")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded on: cuda:0


## Step 3: Configure LoRA


In [8]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                    # Rank - start small for 14 samples
    lora_alpha=32,          # Scaling parameter
    lora_dropout=0.1,       # Dropout for regularization
    target_modules=[        # Target attention modules
        "q_proj",
        "k_proj", 
        "v_proj",
        "o_proj",
        "gate_proj",        # Also target MLP for better performance
        "up_proj",
        "down_proj"
    ],
    bias="none",
    use_rslora=False,       # Set to True for better stability with larger ranks
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


## Step 4: Create Training Dataset


In [9]:
def create_training_prompt(input_text, target_summary):
    """Create a formatted prompt for training."""
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert at summarizing Dungeons & Dragons sessions. Create engaging, detailed summaries that capture the story progression, character moments, combat encounters, and future plot hooks.<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize this D&D session transcript in 300-500 words:

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{target_summary}<|eot_id|>"""
    return prompt

In [10]:
def tokenize_function(examples):
    """Tokenize the training examples."""
    # Create full prompts
    prompts = [create_training_prompt(inp, target) 
               for inp, target in zip(examples['input_text'], examples['target_summary'])]
    
    # Tokenize with truncation for long sequences
    tokenized = tokenizer(
        prompts,
        truncation=True,
        padding=True,
        max_length=2048,#4096,  # Adjust based on your GPU memory
        return_tensors=None
    )
    
    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized

In [11]:
# Convert to HuggingFace Dataset
df = pd.DataFrame(training_data)
dataset = Dataset.from_pandas(df)

# Tokenize dataset
print("\n🔤 Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

print(f"Dataset size: {len(tokenized_dataset)}")
print(f"Average sequence length: {np.mean([len(x) for x in tokenized_dataset['input_ids']]):.0f}")


🔤 Tokenizing dataset...


Map: 100%|██████████| 14/14 [00:00<00:00, 283.50 examples/s]

Dataset size: 14
Average sequence length: 2048





In [12]:
# Split into train/validation (with only 14 samples, we'll use a simple split)
train_size = int(0.8 * len(tokenized_dataset))  # 11 for training, 3 for validation
train_dataset = tokenized_dataset.select(range(train_size))
eval_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

Training samples: 11
Validation samples: 3


## Step 5: Training Configuration


In [13]:
training_args = TrainingArguments(
    output_dir="./dnd_lora_checkpoints",
    num_train_epochs=10,              # More epochs for small dataset
    per_device_train_batch_size=4,   # Small batch due to long sequences
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,   # Accumulate gradients to simulate larger batch size, effectively 8 per step
    learning_rate=2e-4,              # Standard LoRA learning rate
    weight_decay=0.01,
    logging_steps=1,                 # Log every step for small dataset
    eval_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    warmup_steps=2,                  # Small warmup for small dataset
    fp16=False,                      # Use bf16 instead if supported
    bf16=True,
    dataloader_num_workers=0,        # Avoid multiprocessing issues
    remove_unused_columns=False,
    report_to=None,                  # Disable wandb/tensorboard for now
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
    pad_to_multiple_of=8,  # Pad to multiple of 8 for better performance
)

## Step 6: Initialize Trainer


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("\n🚀 Starting training...")
print(f"Training {len(train_dataset)} samples for {training_args.num_train_epochs} epochs")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



🚀 Starting training...
Training 11 samples for 10 epochs
Effective batch size: 8


## Step 7: Train the Model


In [17]:
# Start training
training_output = trainer.train()

print(f"\n✅ Training completed!")
print(f"Final training loss: {training_output.training_loss:.4f}")

# Save the final model
trainer.save_model("./dnd_lora_final")
tokenizer.save_pretrained("./dnd_lora_final")

print("💾 Model saved to ./dnd_lora_final")


Epoch,Training Loss,Validation Loss
1,2.2474,2.141451
2,2.0468,2.016145
3,1.9353,1.954644
4,1.8499,1.908041
5,1.7903,1.865223
6,1.7935,1.836725
7,1.6209,1.81753
8,1.4406,1.806164
9,1.6172,1.79996
10,1.6131,1.798006



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file


✅ Training completed!
Final training loss: 1.7984



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.


💾 Model saved to ./dnd_lora_final


## Step 8: Test the Trained Model


In [18]:
def generate_summary(model, tokenizer, input_text, max_length=2048):
    """Generate a summary using the trained model."""
    # Create the prompt (without the target summary)
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert at summarizing Dungeons & Dragons sessions. Create engaging, detailed summaries that capture the story progression, character moments, combat encounters, and future plot hooks.<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize this D&D session transcript in 300-500 words:

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3584)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode the generated part only
    generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return generated_text.strip()

In [22]:
# Test on a validation example
if len(eval_dataset) > 0:
    print("\n🧪 Testing the trained model...")
    
    # Get a test example
    test_idx = 0
    test_example = training_data[train_size + test_idx]  # Use validation example
    
    print(f"\nTest example: {test_example['chunk_id']}")
    print(f"Input length: {len(test_example['input_text'])} chars")
    
    # Generate summary
    generated_summary = generate_summary(
        model, tokenizer, 
        test_example['input_text'][:2000],  # Truncate for testing
        max_length=4096
    )
    
    print(f"\n📝 Generated Summary:")
    print(generated_summary)
    
    print(f"\n📚 Reference Summary:")
    print(test_example['target_summary'][:4096] + "...")
    
    # Quick ROUGE evaluation
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(test_example['target_summary'], generated_summary)
    
    print(f"\n📊 ROUGE Scores:")
    for metric, score in scores.items():
        print(f"  {metric.upper()}: {score.fmeasure:.3f}")


🧪 Testing the trained model...

Test example: chunk_12
Input length: 25225 chars

📝 Generated Summary:
\n\n=== File 1: Critical Role plays Daggerheart ｜ Live One-Shot ｜ Open Beta_chunk_44_300_seconds (t=119.6s) ===\n[120.1s - 122.0s] Is it you?\n[122.5s - 123.0s] I think I am.\n[123.2s - 123.4s] I think I am.\n[123.8s - 124.1s] You're going to be okay.\n[124.6s - 125.4s] I'm going to be okay.\n[125.6s - 126.3s] So you're not going to be okay.\n[126.6s - 127.4s] You're going to be okay.\n[127.7s - 128.1s] I'm going to be okay.\n[128.5s - 129.6s] You're going to be okay.\n[130.4s - 131.1s] You're going to be okay.\n[131.3s - 133.5s] So you're not going to be okay.\n[133.9s - 135.2s] You're not going to be okay.\n[135.3s - 136.2s] You're not going to be okay.\n[136.3s - 136.5s] You're going to be okay.\n[136.6s - 137.4s] You're going to be okay.\n[137.6s - 139.3s] You're going to be okay.\n[139.7s - 140.8s] So you're not going to be okay.\n[141.0s - 141.3s] So you're not going to be okay

# Compare with Base Model

In [20]:
# Compare Base Model vs Fine-tuned Model Performance

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
import time

def load_base_model():
    """Load the original base model without LoRA."""
    print("📥 Loading base model...")
    
    base_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
    base_tokenizer.pad_token = base_tokenizer.eos_token
    
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=hf_token,
        quantization_config=bnb_config,  # Same config as fine-tuned
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    
    print(f"✅ Base model loaded on: {base_model.device}")
    return base_model, base_tokenizer

def generate_summary_comparison(model, tokenizer, input_text, model_name, max_length=400):
    """Generate summary with consistent parameters for fair comparison."""
    
    # Truncate input to same length for both models
    input_text = input_text[:3000]
    
    # Use the same prompt format
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert at summarizing Dungeons & Dragons sessions. Create engaging, detailed summaries that capture the story progression, character moments, combat encounters, and future plot hooks.<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize this D&D session transcript in 300-500 words:

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate with identical settings
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.8,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
        )
    
    generation_time = time.time() - start_time
    
    # Decode
    generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    generated_text = generated_text.strip()
    
    # Clean up
    if "assistant" in generated_text:
        generated_text = generated_text.split("assistant")[-1].strip()
    
    print(f"\n🤖 {model_name} Summary (Generated in {generation_time:.2f}s):")
    print("=" * 60)
    print(generated_text[:500] + ("..." if len(generated_text) > 500 else ""))
    print("=" * 60)
    
    return generated_text, generation_time

def calculate_rouge_scores(reference, generated, model_name):
    """Calculate and display ROUGE scores."""
    if not generated or len(generated.strip()) < 10:
        print(f"⚠️ {model_name}: Generated text too short for evaluation")
        return None
    
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    
    print(f"\n📊 {model_name} ROUGE Scores:")
    rouge_dict = {}
    for metric, score in scores.items():
        print(f"  {metric.upper()}: {score.fmeasure:.3f}")
        rouge_dict[metric] = score.fmeasure
    
    return rouge_dict

def compare_models():
    """Run comprehensive comparison between base and fine-tuned models."""
    
    print("\n" + "="*80)
    print("🔍 BASE MODEL vs FINE-TUNED MODEL COMPARISON")
    print("="*80)
    
    # Load base model
    base_model, base_tokenizer = load_base_model()
    
    # Use your existing fine-tuned model (already loaded as 'model')
    print("✅ Fine-tuned model already loaded")
    
    # Test on multiple examples if available
    test_examples = []
    if len(eval_dataset) > 0:
        # Use validation examples
        for i in range(min(3, len(eval_dataset))):  # Test up to 3 examples
            test_examples.append(training_data[train_size + i])
    else:
        # Use some training examples
        for i in range(min(3, len(training_data))):
            test_examples.append(training_data[i])
    
    print(f"\n🧪 Testing on {len(test_examples)} examples...")
    
    all_base_scores = []
    all_finetuned_scores = []
    
    for idx, test_example in enumerate(test_examples):
        print(f"\n{'='*60}")
        print(f"TEST EXAMPLE {idx + 1}: {test_example['chunk_id']}")
        print(f"Input length: {len(test_example['input_text'])} chars")
        print(f"{'='*60}")
        
        # Generate with base model
        base_summary, base_time = generate_summary_comparison(
            base_model, base_tokenizer, 
            test_example['input_text'], 
            "BASE MODEL"
        )
        
        # Generate with fine-tuned model  
        finetuned_summary, ft_time = generate_summary_comparison(
            model, tokenizer,
            test_example['input_text'],
            "FINE-TUNED MODEL"
        )
        
        # Show reference
        print(f"\n📚 REFERENCE SUMMARY:")
        print("=" * 60)
        print(test_example['target_summary'][:500] + ("..." if len(test_example['target_summary']) > 500 else ""))
        print("=" * 60)
        
        # Calculate ROUGE scores
        base_scores = calculate_rouge_scores(test_example['target_summary'], base_summary, "BASE MODEL")
        ft_scores = calculate_rouge_scores(test_example['target_summary'], finetuned_summary, "FINE-TUNED MODEL")
        
        if base_scores and ft_scores:
            all_base_scores.append(base_scores)
            all_finetuned_scores.append(ft_scores)
            
            # Show improvement
            print(f"\n📈 IMPROVEMENT ANALYSIS:")
            for metric in ['rouge1', 'rouge2', 'rougeL']:
                improvement = ft_scores[metric] - base_scores[metric]
                direction = "📈" if improvement > 0 else "📉" if improvement < 0 else "➡️"
                print(f"  {metric.upper()}: {improvement:+.3f} {direction}")
        
        # Performance comparison
        print(f"\n⚡ GENERATION SPEED:")
        print(f"  Base model: {base_time:.2f}s")
        print(f"  Fine-tuned: {ft_time:.2f}s")
        
        print(f"\n" + "-"*80)
    
    # Overall comparison
    if all_base_scores and all_finetuned_scores:
        print(f"\n" + "="*80)
        print("📊 OVERALL PERFORMANCE COMPARISON")
        print("="*80)
        
        # Calculate averages
        metrics = ['rouge1', 'rouge2', 'rougeL']
        
        print(f"{'Metric':<12} {'Base':<8} {'Fine-tuned':<12} {'Improvement':<12} {'Status'}")
        print("-" * 60)
        
        for metric in metrics:
            base_avg = sum(scores[metric] for scores in all_base_scores) / len(all_base_scores)
            ft_avg = sum(scores[metric] for scores in all_finetuned_scores) / len(all_finetuned_scores)
            improvement = ft_avg - base_avg
            
            status = "✅ Better" if improvement > 0.01 else "❌ Worse" if improvement < -0.01 else "➡️ Similar"
            
            print(f"{metric.upper():<12} {base_avg:<8.3f} {ft_avg:<12.3f} {improvement:+.3f}      {status}")
        
        # Conclusion
        avg_improvement = sum(
            sum(ft_scores[m] for m in metrics) - sum(base_scores[m] for m in metrics)
            for base_scores, ft_scores in zip(all_base_scores, all_finetuned_scores)
        ) / (len(all_base_scores) * 3)
        
        print(f"\n🎯 CONCLUSION:")
        if avg_improvement > 0.02:
            print("✅ Fine-tuning was successful! The model shows clear improvement.")
        elif avg_improvement > 0.005:
            print("⚠️ Fine-tuning shows modest improvement. Consider more data or training.")
        else:
            print("❌ Fine-tuning didn't improve performance. Need more data or different approach.")
        
        print(f"📈 Average improvement across all metrics: {avg_improvement:+.3f}")
    
    # Clean up base model to free memory
    del base_model, base_tokenizer
    torch.cuda.empty_cache()
    print(f"\n🧹 Cleaned up base model from memory")

# Run the comparison
compare_models()


🔍 BASE MODEL vs FINE-TUNED MODEL COMPARISON
📥 Loading base model...
✅ Base model loaded on: cuda:0
✅ Fine-tuned model already loaded

🧪 Testing on 3 examples...

TEST EXAMPLE 1: chunk_12
Input length: 25225 chars

🤖 BASE MODEL Summary (Generated in 6.04s):
**The Quest Begins**

In the mist-shrouded mountains, a lone adventurer named Daggerheart finds themselves drawn to a mysterious spring nestled in the heart of the treacherous land. With a sense of foreboding, they approach the water's edge, where a tangled mess of dark vines forms a formidable barrier.

As they examine the spring, their eyes widen in confusion. A pulsating, viscious substance emanates from the center, seemingly alive and aglow. A feeling of unease settles in, but not quite fear...

🤖 FINE-TUNED MODEL Summary (Generated in 9.71s):
As you gaze into your eyes, you try to recall any memories of what might have occurred during the attack. Your mind is filled with fragmented images of the chaotic battle that followed the