# D&D Summarization LoRA Training - Simple Case
# Training Llama 3.1 1B on a few transcript/summary pairs

In [7]:
import json
import torch
from pathlib import Path
import pandas as pd
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import numpy as np
from rouge_score import rouge_scorer
import dotenv
import os

# Load environment variables
dotenv.load_dotenv()

True

# Check GPU availability


In [2]:
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

CUDA available: True
GPU: NVIDIA GeForce RTX 5090
VRAM: 34.2 GB


## Step 1: Load and Prepare Data


In [3]:
def load_training_data(summaries_dir="../data/summaries", transcripts_dir="../data/combined_transcripts"):
    """Load the 14 training pairs from your summary JSON files."""
    print(f"Loading training data from {summaries_dir}")
    
    summaries_path = Path(summaries_dir)
    # print top 5 files in the directory
    print("Files in summaries directory:")
    for file in sorted(summaries_path.glob("chunk_*_summary.json"))[:5]:
        print(f" - {file.name}")

    training_pairs = []
    
    # Load all summary files
    for json_file in sorted(summaries_path.glob("chunk_*_summary.json")):
        print(f"Processing {json_file.name}")
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Extract the combined transcript (this will be our input)
            # You'll need to load the corresponding simple.json file for the transcript
            chunk_num = data['chunk_number']
            simple_file = summaries_path.parent / transcripts_dir / f"chunk_{chunk_num:02d}_simple.json"
            
            if simple_file.exists():
                with open(simple_file, 'r', encoding='utf-8') as f:
                    chunk_data = json.load(f)
                
                training_pair = {
                    'chunk_id': f"chunk_{chunk_num:02d}",
                    'input_text': chunk_data['combined_transcript'],
                    'target_summary': data['summary'],
                    'duration_minutes': data['duration_minutes'],
                    'word_count': data['word_count']
                }
                training_pairs.append(training_pair)            
                
        except Exception as e:
            print(f"Error loading {json_file}: {e}")

    print(f"Loaded {len(training_pairs)} training pairs")
    return training_pairs

In [4]:
# Load your data
training_data = load_training_data(transcripts_dir="../data/combined_transcripts_20min")

Loading training data from ../data/summaries
Files in summaries directory:
 - chunk_01_summary.json
 - chunk_02_summary.json
 - chunk_03_summary.json
 - chunk_04_summary.json
 - chunk_05_summary.json
Processing chunk_01_summary.json
Processing chunk_02_summary.json
Processing chunk_03_summary.json
Processing chunk_04_summary.json
Processing chunk_05_summary.json
Processing chunk_06_summary.json
Processing chunk_07_summary.json
Processing chunk_08_summary.json
Processing chunk_09_summary.json
Processing chunk_10_summary.json
Processing chunk_11_summary.json
Processing chunk_12_summary.json
Processing chunk_13_summary.json
Processing chunk_14_summary.json
Loaded 14 training pairs


In [5]:
# Display first example

example = training_data[0]
print(f"\n📋 Example Training Pair:")
print(f"Chunk: {example['chunk_id']}")
print(f"Duration: {example['duration_minutes']:.1f} minutes")
print(f"Input length: {len(example['input_text'])} chars")
print(f"Summary length: {len(example['target_summary'])} chars")
print(f"\nInput preview: {example['input_text'][:200]}...")
print(f"\nTarget summary preview: {example['target_summary'][:200]}...")


📋 Example Training Pair:
Chunk: chunk_01
Duration: 20.0 minutes
Input length: 1191 chars
Summary length: 3161 chars

Input preview: \n\n=== File 1: Critical Role plays Daggerheart ｜ Live One-Shot ｜ Open Beta_chunk_0_300_seconds (t=0.0s) ===\n[0.0s - 300.0s] \n\n\n=== File 2: Critical Role plays Daggerheart ｜ Live One-Shot ｜ Open B...

Target summary preview: In this thrilling 20-minute session of Critical Role's Daggerheart one-shot, the party embarked on an exhilarating adventure filled with mystery and magic. As they celebrated their nine-year anniversa...


## Step 2: Setup Model and Tokenizer

In [9]:
# MODEL_NAME = "meta-llama/Llama-3.1-1B"  # or "meta-llama/Llama-3.1-1B-Instruct"
MODEL_NAME = "microsoft/DialoGPT-medium" 
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")

print(f"\n🤖 Loading model: {MODEL_NAME}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=hf_token 
)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token


🤖 Loading model: microsoft/DialoGPT-medium


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
# # Load model in 4-bit for memory efficiency (optional, remove if you have enough VRAM)
# from transformers import BitsAndBytesConfig

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,  # Remove this line if you want full precision
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

print(f"Model loaded on: {model.device}")


Model loaded on: cuda:0


## Step 3: Configure LoRA


In [13]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                    # Rank - start small for 14 samples
    lora_alpha=32,          # Scaling parameter
    lora_dropout=0.1,       # Dropout for regularization
    target_modules=[        # Target attention modules
        "q_proj",
        "k_proj", 
        "v_proj",
        "o_proj",
        "gate_proj",        # Also target MLP for better performance
        "up_proj",
        "down_proj"
    ],
    bias="none",
    use_rslora=False,       # Set to True for better stability with larger ranks
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

ValueError: Target modules {'gate_proj', 'down_proj', 'up_proj', 'k_proj', 'o_proj', 'q_proj', 'v_proj'} not found in the base model. Please check the target modules and try again.

## Step 4: Create Training Dataset


In [14]:
def create_training_prompt(input_text, target_summary):
    """Create a formatted prompt for training."""
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert at summarizing Dungeons & Dragons sessions. Create engaging, detailed summaries that capture the story progression, character moments, combat encounters, and future plot hooks.<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize this D&D session transcript in 300-500 words:

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{target_summary}<|eot_id|>"""
    return prompt

In [15]:
def tokenize_function(examples):
    """Tokenize the training examples."""
    # Create full prompts
    prompts = [create_training_prompt(inp, target) 
               for inp, target in zip(examples['input_text'], examples['target_summary'])]
    
    # Tokenize with truncation for long sequences
    tokenized = tokenizer(
        prompts,
        truncation=True,
        padding=False,
        max_length=4096,  # Adjust based on your GPU memory
        return_tensors=None
    )
    
    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized

In [None]:
# Convert to HuggingFace Dataset
df = pd.DataFrame(training_data)
dataset = Dataset.from_pandas(df)

# Tokenize dataset
print("\n🔤 Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

print(f"Dataset size: {len(tokenized_dataset)}")
print(f"Average sequence length: {np.mean([len(x) for x in tokenized_dataset['input_ids']]):.0f}")

In [None]:
# Split into train/validation (with only 14 samples, we'll use a simple split)
train_size = int(0.8 * len(tokenized_dataset))  # 11 for training, 3 for validation
train_dataset = tokenized_dataset.select(range(train_size))
eval_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

## Step 5: Training Configuration


In [None]:
training_args = TrainingArguments(
    output_dir="./dnd_lora_checkpoints",
    num_train_epochs=5,              # More epochs for small dataset
    per_device_train_batch_size=1,   # Small batch due to long sequences
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,   # Effective batch size = 8
    learning_rate=2e-4,              # Standard LoRA learning rate
    weight_decay=0.01,
    logging_steps=1,                 # Log every step for small dataset
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    warmup_steps=2,                  # Small warmup for small dataset
    fp16=False,                      # Use bf16 instead if supported
    bf16=True,
    dataloader_num_workers=0,        # Avoid multiprocessing issues
    remove_unused_columns=False,
    report_to=None,                  # Disable wandb/tensorboard for now
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
)

## Step 6: Initialize Trainer


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("\n🚀 Starting training...")
print(f"Training {len(train_dataset)} samples for {training_args.num_train_epochs} epochs")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

## Step 7: Train the Model


In [None]:
# Start training
training_output = trainer.train()

print(f"\n✅ Training completed!")
print(f"Final training loss: {training_output.training_loss:.4f}")

# Save the final model
trainer.save_model("./dnd_lora_final")
tokenizer.save_pretrained("./dnd_lora_final")

print("💾 Model saved to ./dnd_lora_final")


## Step 8: Test the Trained Model


In [None]:
def generate_summary(model, tokenizer, input_text, max_length=512):
    """Generate a summary using the trained model."""
    # Create the prompt (without the target summary)
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert at summarizing Dungeons & Dragons sessions. Create engaging, detailed summaries that capture the story progression, character moments, combat encounters, and future plot hooks.<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize this D&D session transcript in 300-500 words:

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3584)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode the generated part only
    generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return generated_text.strip()

In [None]:
# Test on a validation example
if len(eval_dataset) > 0:
    print("\n🧪 Testing the trained model...")
    
    # Get a test example
    test_idx = 0
    test_example = training_data[train_size + test_idx]  # Use validation example
    
    print(f"\nTest example: {test_example['chunk_id']}")
    print(f"Input length: {len(test_example['input_text'])} chars")
    
    # Generate summary
    generated_summary = generate_summary(
        model, tokenizer, 
        test_example['input_text'][:2000],  # Truncate for testing
        max_length=300
    )
    
    print(f"\n📝 Generated Summary:")
    print(generated_summary)
    
    print(f"\n📚 Reference Summary:")
    print(test_example['target_summary'][:300] + "...")
    
    # Quick ROUGE evaluation
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(test_example['target_summary'], generated_summary)
    
    print(f"\n📊 ROUGE Scores:")
    for metric, score in scores.items():
        print(f"  {metric.upper()}: {score.fmeasure:.3f}")