# Lab 4: LLM Fine-tuning with LoRA and Evaluation

This lab covers fine-tuning language models using LoRA (Low-Rank Adaptation), data selection strategies, and comprehensive model evaluation comparing fine-tuned vs base models.

## 1. Environment and setup

Authenticate with Hugging Face and verify CUDA/GPU availability.

In [None]:
# Load Hugging Face API key from environment (do NOT hardcode your token here).
import os
import logging, warnings
from transformers import logging as hf_logging

# Silence transformers/TRL logs early: transformers is limited to errors via
# its own logging helper, and the "trl" logger is raised to ERROR level.
hf_logging.set_verbosity_error()
logging.getLogger("trl").setLevel(logging.ERROR)

# Hide specific noisy warnings. Each filter is a regex over the warning
# message, restricted to UserWarning, so unrelated warnings still surface.
warnings.filterwarnings(
    "ignore",
    message=r".*loss_type=None.*ForCausalLMLoss.*",
    category=UserWarning,
)
warnings.filterwarnings(
    "ignore",
    message=r".*cuDNN SDPA backward got grad_output\.strides\(\) != output\.strides\(\).*",
    category=UserWarning,
)
# NOTE(review): presumably meant to keep tqdm from using its notebook widget
# progress bars — confirm that this variable is actually honoured.
os.environ["TQDM_NOTEBOOK"] = "0"

from huggingface_hub import login
from dotenv import load_dotenv

# Load .env file (if present) so HUGGINGFACE_API_KEY can come from it
load_dotenv()
hf_key = os.environ.get("HUGGINGFACE_API_KEY")
if hf_key:
    # Authenticate this session against the Hugging Face Hub
    login(hf_key)
else:
    # Fail fast with setup instructions rather than continuing unauthenticated
    raise EnvironmentError("HUGGINGFACE_API_KEY not found. Copy .env.template to .env and add your token. See Instruction.md")

### Verify CUDA/GPU availability

Confirm that PyTorch was built with CUDA support and that a GPU is available.

In [None]:
import torch
# Report the installed PyTorch version, whether a CUDA device is usable right
# now, and the CUDA version this PyTorch build reports having been built with.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"PyTorch built with CUDA: {torch.version.cuda}")


## 2. Model initialization and quantization

Initialize quantization, tokenizer, and load the base model.

### Model initialization

Initialize quantization, tokenizer, and load the base model using GPT2.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import copy

# Quantization config: load weights in 4-bit ("nf4" quant type, double
# quantization enabled) and use float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


# Let's use GPT-2
model_name = "gpt2"  # This model uses safetensors format

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding
# (presumably because this tokenizer defines no pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# Load model with safetensors (safer format)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
    use_safetensors=True,  # Force use of safetensors
)
# `base_model` stays untouched and is used later as the comparison baseline;
# `model` is the copy that receives the LoRA adapters.
# NOTE(review): this deep-copies a quantized model — confirm the copy is an
# independent, correctly quantized module (loading the checkpoint a second
# time would be the conservative alternative).
model = copy.deepcopy(base_model)  # Create a copy of the base model


## 3. Data and LoRA configuration

Fetch/prepare data and configure LoRA adapters.

### Fetch dataset repository

Clone the repository below for the data we will use in this lab.

In [None]:
# Run git clone https://github.com/ericsunkuan/ML_Spring2025_HW5.git

### LoRA configuration

Set LoRA rank and scaling, and apply adapters to attention modules.

In [None]:

# Define LoRA configuration with tweaked hyperparameters
lora_config_tweaked = LoraConfig(
    r=16,  ### TODO : Choose any number > 0 ! Common values are 4, 8, 16, 32, 64, 128. Higher ranks allow more expressive power but also increase parameter count.
    lora_alpha=16,  ### TODO : Choose any number > 0 ! Suggested 4, 8, 16, 32, 64, 128
    # GPT-2 style module names (the notebook loads "gpt2").
    # NOTE(review): `c_proj` may also match projection layers outside the
    # attention block — confirm against model.named_modules().
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,  # Dropout for regularization
    bias="none",    # No bias adaptation
    task_type=TaskType.CAUSAL_LM,
)

# Apply LoRA to the model (wraps the copy; `base_model` is not passed here)
model = get_peft_model(model, lora_config_tweaked)

# Print trainable parameters
model.print_trainable_parameters()

# Maximum sequence length; used later for SFTConfig(max_length=...) and for
# tokenizer truncation during inference.
max_seq_length = 1024
# NOTE(review): `dtype` and `load_in_4bit` are not referenced anywhere else in
# the visible notebook — possibly leftovers; confirm before removing.
dtype = torch.float16
load_in_4bit = True

print("LoRA adapter applied successfully with custom hyperparameters!")

## 4. Data processing and training setup

Load dataset, create helpers, filter/sort, and prepare train/eval splits.

### Load dataset and create helpers

Load the dataset and create helper functions to format data for training. Reserve the last 50 examples for validation.

In [None]:
if tokenizer.chat_template is None:
    # The tokenizer has no chat template, so install a minimal one:
    #   User: <content>\n
    #   Assistant: <content>\n
    # The trailing "Assistant:" cue is emitted ONLY when the caller passes
    # add_generation_prompt=True (inference). Previously it was appended
    # unconditionally, so every training text rendered with
    # add_generation_prompt=False still ended in a dangling "Assistant:".
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{% if message['role'] == 'user' %}User: {{ message['content'] }}\n"
        "{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n"
        "{% endif %}{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )

def formatting_prompts_func(examples):
    """Render a batch of conversations into plain training strings.

    Args:
        examples: A batch mapping whose "conversations" entry is a list of
            conversations, each one passed as-is to the tokenizer's chat
            template.

    Returns:
        A dict with a single "text" key holding one rendered string per
        conversation (untokenized, without a generation prompt).
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

from datasets import Dataset, load_from_disk

# Load the dataset from the local clone of the lab's data repository
# (load_from_disk reads a dataset directory on disk; nothing is downloaded here).
dataset = load_from_disk("../../ML_Spring2025_HW5/fastchat_alpaca_52k")

# ---------------------------
# FIRST: Reserve evaluation set from original dataset to prevent leakage
# ---------------------------
print(f"Original dataset size: {len(dataset)}")

# Take last 50 samples from original dataset for evaluation (before any sorting),
# so the later sort/selection of training examples can never pick them.
eval_dataset_original = dataset.select(range(len(dataset) - 50, len(dataset)))  # Last 50 samples
training_dataset_original = dataset.select(range(len(dataset) - 50))  # All except last 50

print(f"Reserved for evaluation (from original): {len(eval_dataset_original)}")
print(f"Available for training/sorting: {len(training_dataset_original)}")

# ---------------------------
# Add a "text" field to each example (for both training and evaluation datasets)
# ---------------------------
def add_text_field(example):
    """Build a "text" column from the first assistant turn of a conversation.

    Args:
        example: A record whose "conversations" entry is a list of messages,
            each with "role" and "content" keys.

    Returns:
        ``{"text": content}`` for the first message whose role is
        "assistant", or ``{"text": ""}`` when there is no such message.
    """
    replies = []
    for message in example["conversations"]:
        if message["role"] == "assistant":
            replies.append(message["content"])

    if not replies:
        return {"text": ""}
    return {"text": replies[0]}

# Map the function over BOTH training and evaluation datasets.
# Note: the "text" column written here is replaced later, when
# formatting_prompts_func is mapped over the selected subsets and writes its
# own "text" column.
training_dataset_original = training_dataset_original.map(add_text_field)
eval_dataset_original = eval_dataset_original.map(add_text_field)

# Print the dataset structure to confirm the new feature.
print(training_dataset_original)
print(f"Evaluation dataset structure: {eval_dataset_original}")

### Exercise 1: Filtering and sorting

Exercise 1a: Implement the data filtering function. The basic premise of fine-tuning is that we want high-quality data over a large quantity of data. Think of smart criteria to filter out low-quality data. Currently the filtering function is set to the length of the conversation, which is a bad criterion, since the model would overfit to large or small conversations. You can adjust this later, but for now, try to implement a more meaningful filtering criterion than purely the conversation length. 

Exercise 1b: Implement the sorting function. The sort key should combine your filtering criterion from Exercise 1a with external metrics, such as the `score` field in the dataset. The score is a human evaluation of the conversation, where a higher score means a better conversation.

In [None]:

#################### TODO : Define a meaningful helper filtering function that will be used in the sorting 

# The default "conversation length" here refers to the length of the input (human) and output (gpt), you can modify it at your will
def compute_conversation_length(example):
    """Return the total whitespace-separated word count of a conversation.

    Every message in ``example["conversations"]`` contributes the number of
    words in its "content" string, regardless of who sent it.
    """
    word_total = 0
    for turn in example["conversations"]:
        word_total += len(turn["content"].split())
    return word_total

############## Advanced Sorting Method (TODO : Modify the sorting key) ##################

def advanced_sort_key(example):
    """Return the key used to rank training examples.

    As written, the key is only the conversation's total word count.
    ``example["score"]`` is read (so a record without it raises KeyError) but
    is not yet part of the returned value; combining the two is the exercise.
    """
    n_words = compute_conversation_length(example)
    quality = example["score"]  # TODO: fold the score into the returned key
    return n_words



# Rank the training pool by the sort key, largest key first. Iterating the
# dataset yields one record per example, so the result is a plain Python list.
sorted_dataset_list = sorted(training_dataset_original, key=advanced_sort_key, reverse=True)
# Convert back to a Dataset object
sorted_dataset = Dataset.from_list(sorted_dataset_list)

# Show the five highest-ranked examples together with their key value.
print("\nTop examples sorted by advanced key (combination of conversation length and score):")
for entry in sorted_dataset.select(range(5)):
    print(f"ID: {entry['id']}, Advanced Key Value: {advanced_sort_key(entry)}")


### Exercise 2: Select training subset

Get 100 samples from your sorted dataset (choose a range consistent with your sorting).

In [None]:
################# TODO ###################################################################
# Take the first 100 rows of the sorted dataset, i.e. the 100 largest sort keys
# because the sort above is descending.
train_dataset = sorted_dataset.select(range(0, 100))    ### You can experiment with different ranges

# Apply formatting directly: render each conversation through the chat template
# into the "text" column (batched, so the function receives lists of rows).
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset_original.map(formatting_prompts_func, batched=True)
print(f"Training dataset prepared with {len(train_dataset)} examples")
print(f"Evaluation dataset (reserved): {len(eval_dataset_original)} examples")
print(f"Sample formatted text:\n{train_dataset[0]['text']}...")

In [None]:
# Sanity check: show the raw conversation and its chat-template rendering for
# the first evaluation example (swap in the train_dataset lines to inspect
# a training example instead).
# print(train_dataset[0]["conversations"])
print(eval_dataset[0]["conversations"])
# print(train_dataset[0]["text"])
print(eval_dataset[0]["text"])

### Exercise 3: Configure training hyperparameters

Set the training parameters. You can experiment with different values for each parameter. 

In [None]:
from trl import SFTTrainer, SFTConfig

# Define bfloat16 support check without unsloth
def is_bfloat16_supported():
    """Tell whether the current GPU is treated as bfloat16-capable.

    Returns:
        False when CUDA is unavailable; otherwise True only if the current
        device's major compute capability is at least 8.
    """
    if not torch.cuda.is_available():
        return False
    major_version = torch.cuda.get_device_capability()[0]
    return major_version >= 8  # Ampere and newer

################# TODO : Tweak the training hyperparameters here.  #####################

# Training hyperparameters, kept in a plain dict so they can be passed to
# SFTConfig and also dumped to training_config.json after inference.
# The entries marked TODO are runnable starting points, not tuned values:
# the original placeholders had no value at all, which made this cell a
# SyntaxError. Adjust them as part of the exercise.
training_config = {
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,  # effective batch size = 2 * 4 = 8
    "warmup_steps": 5,  # TODO: tune
    "num_train_epochs": 3,  # TODO: tune
    "learning_rate": 2e-4,  # TODO: tune
    "optim": "adamw_8bit",
    "weight_decay": 0.01,  # TODO: tune
    "lr_scheduler_type": "cosine",
    "seed": 3407,
}

################# TODO #################################################################
# Build the SFT configuration from the hyperparameter dict above.
sft_config = SFTConfig(
    output_dir="outputs",
    per_device_train_batch_size=training_config["per_device_train_batch_size"],
    gradient_accumulation_steps=training_config["gradient_accumulation_steps"],
    warmup_steps=training_config["warmup_steps"],
    num_train_epochs=training_config["num_train_epochs"],
    learning_rate=training_config["learning_rate"],
    # Exactly one of fp16/bf16 is enabled: bf16 when the helper reports support,
    # fp16 otherwise.
    # NOTE(review): fp16 is therefore also True when CUDA is unavailable,
    # because the helper returns False in that case — confirm that is acceptable.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,  # log every optimizer step
    optim=training_config["optim"],
    weight_decay=training_config["weight_decay"],
    lr_scheduler_type=training_config["lr_scheduler_type"],
    seed=training_config["seed"],
    report_to="none",  # no external experiment tracker
    # NOTE(review): the name of this argument has differed between TRL
    # releases — confirm it matches the installed version.
    max_length=max_seq_length,
    packing=False,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    label_names = ['labels']
)
print(f"Model device: {model.device}")

# Only a training set is supplied; the reserved evaluation split is used
# separately, after training, by the inference cells.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
)

## 5. Training

Configure trainer and run fine-tuning.

### Run training

Start the training process and monitor loss over time.

In [None]:
# Run fine-tuning; the return value is kept so the run's statistics can be
# inspected afterwards.
trainer_stats = trainer.train()

## 6. Inference and evaluation

Generate with the fine-tuned model and compare to the base model.

### Fine-tuned model inference

Generate responses with the fine-tuned model and compare against the base model using multiple metrics.

### Exercise 4: Generate and collect outputs (fine-tuned model)

Experiment with different values for generation parameters, such as `max_new_tokens`, `temperature`, and `top_p`. These parameters control the length, randomness, and diversity of the generated responses. 

In [None]:
import json

# Set model to inference mode (evaluation mode turns off training-only
# behaviour such as the LoRA dropout configured earlier)
model.eval()

def parse_true_output(generated_text, original_prompt):
    """Extract the assistant's reply from decoded generation output.

    Args:
        generated_text: Decoded model output, normally the prompt followed by
            the newly generated continuation.
        original_prompt: The exact prompt string that was fed to the model.

    Returns:
        The reply as a single line: non-empty lines are stripped and joined
        with spaces, and everything from the first line that starts a new
        "Assistant:" or "User:" turn onward is discarded.

    If the decoded text does not start with the prompt byte-for-byte (decoding
    can normalise whitespace, and the prompt may have been truncated), the
    text still begins with the "User:" turn. The cleaning loop would then stop
    on its first line and return an empty string. In that case only, the
    prompt is skipped by counting its "Assistant:" markers instead. All other
    inputs behave exactly as before.
    """
    turn_markers = ("Assistant:", "User:")

    if generated_text.startswith(original_prompt):
        # Common case: drop the echoed prompt from the beginning
        response = generated_text[len(original_prompt):].strip()
    else:
        response = generated_text
        if response.lstrip().startswith(turn_markers):
            # The prompt ends with its last "Assistant:" marker, so skipping
            # as many markers as the prompt contains lands on the new reply.
            prompt_turns = original_prompt.count("Assistant:")
            if prompt_turns:
                pieces = response.split("Assistant:", prompt_turns)
                if len(pieces) == prompt_turns + 1:
                    response = pieces[-1]

    cleaned_lines = []
    for line in response.split('\n'):
        line = line.strip()
        # Stop at any new conversation markers
        if line.startswith(turn_markers):
            break
        if line:  # Only add non-empty lines
            cleaned_lines.append(line)

    return ' '.join(cleaned_lines).strip()

# Use the reserved evaluation dataset for consistent evaluation
print("Using reserved evaluation dataset for consistent evaluation...")
print(f"Evaluation dataset size: {len(eval_dataset)}")

# Initialize results storage: entry id -> {"input": messages, "output": [replies]}
inference_results = {}
# Device of the model's first parameter; inputs are moved there before generating
model_device = next(model.parameters()).device
print(f"Model is on device: {model_device}")

# Process each entry in the evaluation set
for index, entry in enumerate(eval_dataset):
    # Fall back to a positional id when the record has no "id" field
    entry_id = entry.get("id", f"eval_{index}")

    # Extract user message from conversations. Both key schemes are accepted:
    # "from"/"value" and "role"/"content".
    messages = []
    for conv in entry.get("conversations", []):
        if conv.get("from") == "human" or conv.get("role") == "user":
            content = conv.get("value", "") or conv.get("content", "")
            messages.append({"role": "user", "content": content})
            break  # Only take the first human message

    # Format prompt using chat template; add_generation_prompt=True asks the
    # template for the cue that starts the assistant's turn.
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Tokenize and prepare inputs (prompts longer than max_seq_length are truncated)
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length
    )
    
    # Move inputs to model device
    inputs = {key: value.to(model_device) for key, value in inputs.items()}

    ################# TODO: Tweak Decoding Parameters here #####################
    # Generate model outputs. Sampling is enabled and no seed is set in this
    # cell, so results are expected to vary between runs.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=50,
            temperature=1.5,
            top_p=0.9,
            top_k=30,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    ################# TODO END ##########################################################

    # Decode and parse outputs; parse_true_output removes the echoed prompt
    # and anything after the first new turn marker.
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    parsed_outputs = [parse_true_output(output, formatted_prompt) for output in decoded_outputs]

    # Store results
    inference_results[entry_id] = {
        "input": messages,
        "output": parsed_outputs
    }

    print(f"Inference completed for entry {entry_id}")

# Save results to files: predictions for the evaluation cell, plus the
# hyperparameters that produced them.
with open("pred.json", "w") as outfile:
    json.dump(inference_results, outfile, indent=4)

with open("training_config.json", "w") as outfile:
    json.dump(training_config, outfile, indent=4)

print("Inference completed for all entries in the reserved evaluation set.")
print("Results saved to pred.json and training_config.json")

### Generate baseline outputs (base model)

Evaluate the base (unadapted) model with the same decoding settings for comparison.

In [None]:
# ==================== BASE MODEL INFERENCE ====================
print("="*60)
print("RUNNING BASE MODEL INFERENCE")
print("="*60)

# Evaluate the untouched base model (no LoRA adapters) in inference mode
base_model.eval()

# The same tokenizer is shared with the fine-tuned model
print(f"Using tokenizer: {tokenizer.name_or_path}")
# Looked up once: the device does not change between evaluation entries
base_device = next(base_model.parameters()).device
print(f"Base model device: {base_device}")

# Dictionary to store base model inference results:
# entry id -> {"input": messages, "output": [replies]}
base_inference_results = {}

# Loop over evaluation dataset
for index, entry in enumerate(eval_dataset):
    entry_id = entry.get("id", f"eval_{index}")

    # Build messages (same as fine-tuned): only the first human/user turn
    messages = []
    for conv in entry.get("conversations", []):
        if conv.get("from") == "human" or conv.get("role") == "user":
            messages.append({"role": "user", "content": conv.get("value", "") or conv.get("content", "")})
            break

    # Create inputs using chat template
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length
    )

    # Move to base model device
    inputs = {key: value.to(base_device) for key, value in inputs.items()}

    # Generate with the SAME decoding parameters as the fine-tuned model.
    # max_new_tokens was 100 here versus 50 for the fine-tuned run, which let
    # the two models produce different-length answers and skewed the
    # length-sensitive metrics (BLEU/ROUGE). Keep these values in sync with
    # the fine-tuned inference cell whenever you tweak it.
    with torch.no_grad():
        outputs = base_model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=50,
            temperature=1.5,
            top_p=0.9,
            top_k=30,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode outputs
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Parse outputs (same function with prompt)
    parsed_outputs = [parse_true_output(output, formatted_prompt) for output in decoded_outputs]

    # Store results
    base_inference_results[entry_id] = {
        "input": messages,
        "output": parsed_outputs
    }

    print(f"Base model inference completed for entry {entry_id}")

# Save results
with open("pred_base_model.json", "w") as outfile:
    json.dump(base_inference_results, outfile, indent=4)

print("Base model results saved to pred_base_model.json")

### Exercise 5: Model comparison and evaluation

Run the evaluation code below to see how your fine-tuned model performs compared to the base model. Tweak all the earlier parameters, mostly focusing on the training dataset creation. This exercise is finished when you beat the base model in terms of all the evaluation metrics:

- BLEU
- ROUGE1  
- ROUGE2
- ROUGEL
- Coherence

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

def load_both_predictions():
    """Load saved predictions for both models plus the reference answers.

    Reads "pred.json" (fine-tuned) and "pred_base_model.json" (base) from the
    working directory and builds the ground truth from the reserved
    ``eval_dataset``, using the first gpt/assistant turn of each entry.

    Returns:
        A tuple ``(finetuned_predictions, base_predictions, references)``
        where ``references`` maps entry id to reference text. Entries without
        an assistant turn are absent from ``references``.
    """
    def _read_json(path):
        with open(path, "r") as handle:
            return json.load(handle)

    finetuned_predictions = _read_json("pred.json")
    base_predictions = _read_json("pred_base_model.json")

    # Ground truth comes from the RESERVED evaluation dataset
    references = {}
    for index, entry in enumerate(eval_dataset):
        entry_id = entry.get("id", f"eval_{index}")
        for conv in entry.get("conversations", []):
            is_reference_turn = conv.get("from") == "gpt" or conv.get("role") == "assistant"
            if not is_reference_turn:
                continue
            references[entry_id] = conv.get("value", "") or conv.get("content", "")
            break  # only the first assistant turn counts

    return finetuned_predictions, base_predictions, references

def calculate_metrics_for_model(predictions, references, model_name):
    """Score one model's predictions against the reference answers.

    Args:
        predictions: Mapping of entry id to ``{"input": [...], "output": [...]}``
            as written by the inference cells. Only the first output is scored.
        references: Mapping of entry id to the reference answer text.
        model_name: Label used in the progress banner.

    Returns:
        A dict with mean "bleu", "rouge1", "rouge2", "rougeL" and "coherence"
        over the non-empty predictions, plus "avg_length" (mean word count
        over all matched predictions, empty ones included) and
        "valid_responses" (number of non-empty predictions). When nothing
        could be scored, the text metrics are 0 and coherence is -10.
    """
    print(f"\n=== EVALUATING {model_name.upper()} ===")

    # BLEU smoothing avoids zero scores on short texts with no higher-order overlap
    smoothie = SmoothingFunction().method4

    # ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Coherence model: a cross-encoder scoring the (question, answer) pair
    scoring_model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')
    scoring_tokenizer = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')
    # Evaluation mode is set once here instead of once per example
    scoring_model.eval()

    bleu_scores = []
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    coherence_scores = []
    pred_lengths = []

    for entry_id, prediction in predictions.items():
        if entry_id not in references:
            continue

        pred_text = prediction["output"][0]
        ref_text = references[entry_id]
        # Inference stores an empty "input" list when an entry had no user
        # turn; score such entries against an empty question instead of
        # raising IndexError.
        user_turns = prediction["input"]
        user_message = user_turns[0]["content"] if user_turns else ""

        # Track lengths (recorded before the empty check so that empty
        # predictions still count towards avg_length)
        pred_lengths.append(len(pred_text.split()))

        # Skip empty predictions
        if not pred_text.strip():
            continue

        # BLEU on lower-cased whitespace tokens
        pred_tokens = pred_text.lower().split()
        ref_tokens = ref_text.lower().split()
        bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothie)
        bleu_scores.append(bleu)

        # ROUGE
        scores = scorer.score(ref_text, pred_text)
        rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

        # Coherence: raw logit of the cross-encoder for (question, answer)
        features = scoring_tokenizer([user_message], [pred_text],
                                     padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            score = scoring_model(**features).logits.squeeze().item()
        coherence_scores.append(score)

    return {
        'bleu': np.mean(bleu_scores) if bleu_scores else 0,
        'rouge1': np.mean(rouge_scores['rouge1']) if rouge_scores['rouge1'] else 0,
        'rouge2': np.mean(rouge_scores['rouge2']) if rouge_scores['rouge2'] else 0,
        'rougeL': np.mean(rouge_scores['rougeL']) if rouge_scores['rougeL'] else 0,
        'coherence': np.mean(coherence_scores) if coherence_scores else -10,
        'avg_length': np.mean(pred_lengths) if pred_lengths else 0,
        'valid_responses': len([p for p in pred_lengths if p > 0])
    }

def main():
    """Compare the fine-tuned and base models and print a report.

    Loads both prediction files and the references, computes the metrics for
    each model, then prints a per-metric comparison, up to three side-by-side
    sample answers, and a verdict based on how many of the five quality
    metrics the fine-tuned model wins.
    """
    banner = "=" * 60
    quality_metrics = ['bleu', 'rouge1', 'rouge2', 'rougeL', 'coherence']
    count_metrics = ['avg_length', 'valid_responses']

    finetuned_preds, base_preds, references = load_both_predictions()

    print(f"Loaded predictions:")
    print(f"  Fine-tuned: {len(finetuned_preds)} examples")
    print(f"  Base model: {len(base_preds)} examples")
    print(f"  References: {len(references)} examples")

    finetuned_metrics = calculate_metrics_for_model(finetuned_preds, references, "Fine-tuned")
    base_metrics = calculate_metrics_for_model(base_preds, references, "Base Model")

    # ---- Per-metric comparison ----
    print("\n" + banner)
    print("📊 FINE-TUNED vs BASE MODEL COMPARISON")
    print(banner)

    for metric in quality_metrics + count_metrics:
        ft_val = finetuned_metrics[metric]
        base_val = base_metrics[metric]

        if metric in count_metrics:
            # Counts and lengths are reported as an absolute difference
            improvement = ft_val - base_val
            print(f"{metric.upper():15}: Fine-tuned={ft_val:8.1f} | Base={base_val:8.1f} | Diff={improvement:+8.1f}")
        elif base_val != 0:
            # Quality metrics are reported as a relative change
            improvement_pct = ((ft_val - base_val) / abs(base_val)) * 100
            print(f"{metric.upper():15}: Fine-tuned={ft_val:8.4f} | Base={base_val:8.4f} | Change={improvement_pct:+6.1f}%")
        else:
            # A relative change is undefined against a zero baseline
            print(f"{metric.upper():15}: Fine-tuned={ft_val:8.4f} | Base={base_val:8.4f} | Change=N/A")

    # ---- Sample comparisons ----
    print("\n" + banner)
    print("📝 SAMPLE COMPARISONS (First 3 examples)")
    print(banner)

    shown = 0
    for entry_id in finetuned_preds:
        if shown >= 3:
            break
        if entry_id not in base_preds or entry_id not in references:
            continue
        shown += 1
        print(f"\n--- Example {shown} (ID: {entry_id}) ---")
        print(f"Question: {finetuned_preds[entry_id]['input'][0]['content'][:60]}...")
        print(f"Fine-tuned: '{finetuned_preds[entry_id]['output'][0][:80]}...'")
        print(f"Base Model:  '{base_preds[entry_id]['output'][0][:80]}...'")
        print(f"Ground Truth: '{references[entry_id][:80]}...'")

    # ---- Verdict ----
    print("\n" + banner)
    print("🏆 VERDICT")
    print(banner)

    total_metrics = len(quality_metrics)  # bleu, rouge1, rouge2, rougeL, coherence
    better_count = sum(
        1 for metric in quality_metrics
        if finetuned_metrics[metric] > base_metrics[metric]
    )

    if better_count >= 3:
        print("✅ Fine-tuning IMPROVED the model!")
    elif better_count >= 2:
        print("🤔 Fine-tuning shows MIXED results")
    else:
        print("❌ Fine-tuning may have HURT performance")

    print(f"Fine-tuned model won {better_count}/{total_metrics} metrics")

if __name__ == "__main__":
    main()