In [None]:
import os

# HF_HOME has to be exported BEFORE transformers / huggingface_hub are imported:
# the cache directory is resolved once at import time, so assigning it afterwards
# (as this cell used to do) is silently ignored.
os.environ["HF_HOME"] = "/home/mila/b/baldelld/scratch"

import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Checkpoint used throughout this section ("Qwen/Qwen3-1.7B" was tried earlier;
# the dead first assignment was dropped because it was immediately overwritten).
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# Chat pipeline around an explicitly instantiated model and tokenizer.
# `dtype` replaces the deprecated `torch_dtype` keyword (see the warning in the
# saved output of this cell).
pipe = pipeline(
    "text-generation",
    model=AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True),
    tokenizer=AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True),
    dtype="auto",
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


In [None]:
def print_messages(outputs):
    """Print each chat turn of the first generation as ``role: content``.

    ``outputs`` is indexed as ``outputs[0]['generated_text']``; every item in
    that sequence must provide the keys ``'role'`` and ``'content'``. Each turn
    is printed followed by one blank line.
    """
    conversation = outputs[0]['generated_text']
    for turn in conversation:
        print("{}: {}\n".format(turn['role'], turn['content']))

In [None]:
from transformers import set_seed

# Single-turn chat prompt (Italian: "Hi, can you speak Italian?").
input_messages = [
    {"role": "user", "content": "Ciao sai parlare italiano?"},
]

# Sampling is stochastic: fix the RNG seed so that re-running this cell
# reproduces the same reply.
set_seed(42)
outputs = pipe(input_messages, max_new_tokens=4000, do_sample=True, temperature=0.7, top_p=0.9)
print_messages(outputs)

user: Ciao sai parlare italiano?

assistant: Ciao! Sì, parlo anche l'italiano. Come posso aiutarti oggi?



In [None]:
# Re-print whatever conversation is currently bound to `outputs`.
# NOTE(review): the saved output under this cell shows a system prompt and an
# English user turn that no cell above produces, so it presumably comes from an
# earlier, since-edited run — re-execute to refresh it.
print_messages(outputs)

system: You are a helpful assistant that only speaks italian.

user: Hi dude, how's life?

assistant: <think>
Okay, the user said, "Hi dude, how's life?" Let me think about how to respond in Italian. First, I need to make sure I understand the message. They're greeting me and asking about my life. Since I'm supposed to respond in Italian, I should use the appropriate language.

The user used "dude" which is a casual term, so I should respond in a friendly and approachable way. In Italian, "dude" can be translated as "ciao" or "hi," but maybe "ciao" is more common. Then, the user is asking about life, so I should say something like "Ciao! Sto bene, grazie." That's a common response.

Wait, maybe I should make it a bit more personal. "Ciao! Sto bene, grazie per avermi chiamato. Come stai tu?" That adds a bit more warmth and asks them how they're doing too. But the original response was more straightforward. Let me check if there's a more natural way. "Ciao! Sto bene, grazie. Come stai tu

In [None]:
# `model` is not assigned by any cell shown above, so this cell failed with a
# NameError on a fresh kernel. Take the network from the pipeline instead.
model = pipe.model

# Inspect the MLP down-projection of one decoder layer and the total layer count.
L = 3
model.model.layers[L].mlp.down_proj, len(model.model.layers)

(Linear(in_features=6144, out_features=2048, bias=False), 28)

In [None]:
import torch
import torch.nn as nn

def inject_trainable_bias(model, layers):
    """Freeze ``model`` and give selected MLP ``down_proj`` layers a trainable bias.

    Every parameter of ``model`` is frozen first. Then, for each index in
    ``layers``, ``model.model.layers[idx].mlp.down_proj`` is replaced by an
    ``nn.Linear`` with the same weight and a zero-initialised bias; only that
    bias is left trainable.

    If the target projection already has a bias (for example because this
    function was already applied to the model), the existing bias values are
    kept and merely unfrozen instead of being reset to zero.

    Args:
        model: Module exposing ``model.model.layers[i].mlp.down_proj``.
        layers: Iterable of decoder-layer indices to modify.

    Returns:
        The same ``model`` object, modified in place.
    """
    # 1. Freeze the entire model first
    for param in model.parameters():
        param.requires_grad = False

    for layer_idx in layers:
        # 2. Locate the target layer
        mlp = model.model.layers[layer_idx].mlp
        target_layer = mlp.down_proj

        existing_bias = getattr(target_layer, "bias", None)
        if existing_bias is not None:
            # Already carries a bias: keep its (possibly trained) values and
            # only make it trainable again. Re-initialising it here would
            # silently change the model's behaviour.
            existing_bias.requires_grad = True
            continue

        # 3. Build a replacement Linear layer that has a bias term
        new_layer = nn.Linear(
            target_layer.in_features,
            target_layer.out_features,
            bias=True,
            dtype=target_layer.weight.dtype,
            device=target_layer.weight.device,
        )

        # 4. Reuse the original weight tensor (no copy is made)
        new_layer.weight.data = target_layer.weight.data

        # 5. Zero bias, so the modified layer initially computes the same output
        nn.init.zeros_(new_layer.bias)

        # 6. Train ONLY the bias: keep the weight matrix frozen
        new_layer.weight.requires_grad = False
        new_layer.bias.requires_grad = True

        # 7. Swap the layer into the model
        mlp.down_proj = new_layer

    print(f"Successfully injected trainable bias at Layer {layers} MLP Down Projection.")
    print(f"Trainable parameters: {[n for n, p in model.named_parameters() if p.requires_grad]}")

    return model

# Usage: freeze `model` and add trainable down_proj biases at decoder layers 12 and 13.
# NOTE(review): `L` is bound to an int in the inspection cell above and rebound to a
# list here; `model` must already exist in the kernel when this cell runs.
L = [12, 13] # Example: two mid-stack layers
model = inject_trainable_bias(model, L)

Successfully injected trainable bias at Layer [12, 13] MLP Down Projection.
Trainable parameters: ['model.layers.12.mlp.down_proj.bias', 'model.layers.13.mlp.down_proj.bias']


## Is the model CAPSLOCKING?

In [None]:
"""
Model utility functions for training modifications.
"""

import torch
import torch.nn as nn
from typing import List, Optional


def inject_trainable_bias(
    model: nn.Module,
    layers: List[int],
) -> nn.Module:
    """
    Inject trainable bias vectors at specific layers of a model.

    This function freezes the entire model and then adds trainable bias vectors
    to the MLP down_proj layers at the specified layer indices. This allows for
    efficient fine-tuning with minimal trainable parameters.

    A down_proj layer that already has a bias (e.g. the function was already
    applied to this model) keeps its current bias values; the bias is only
    marked trainable again instead of being reset to zero.

    Args:
        model: The model to modify (e.g., Qwen3 model). It must expose
            ``model.model.layers[i].mlp.down_proj``.
        layers: List of layer indices where to inject trainable biases

    Returns:
        The modified model with trainable biases injected (same object as
        ``model``; the modification happens in place).

    Example:
        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B")
        >>> model = inject_trainable_bias(model, layers=[10, 15, 20])
    """
    # 1. Freeze the entire model first
    for param in model.parameters():
        param.requires_grad = False

    for layer_idx in layers:
        # 2. Locate the target layer
        mlp = model.model.layers[layer_idx].mlp
        target_layer = mlp.down_proj

        existing_bias = getattr(target_layer, "bias", None)
        if existing_bias is not None:
            # Do not overwrite an existing (possibly trained) bias: zeroing it
            # would silently change the model. Just unfreeze it.
            existing_bias.requires_grad = True
            continue

        # 3. Create the replacement Linear layer, this time with a bias term
        new_layer = nn.Linear(
            target_layer.in_features,
            target_layer.out_features,
            bias=True,
            dtype=target_layer.weight.dtype,
            device=target_layer.weight.device,
        )

        # 4. Reuse the original weight tensor (no copy is made)
        new_layer.weight.data = target_layer.weight.data

        # 5. Initialize the bias to zero so the layer output is initially unchanged
        nn.init.zeros_(new_layer.bias)

        # 6. Enable gradients ONLY for the bias; the weight matrix stays frozen
        new_layer.weight.requires_grad = False
        new_layer.bias.requires_grad = True

        # 7. Replace the layer in the model
        mlp.down_proj = new_layer

    # Print summary
    trainable_params = [n for n, p in model.named_parameters() if p.requires_grad]
    total_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"Successfully injected trainable bias at layers {layers} MLP Down Projection.")
    print(f"Trainable parameters ({len(trainable_params)} tensors, {total_trainable:,} params):")
    for name in trainable_params:
        print(f"  - {name}")

    return model


def load_model_with_bias(
    base_model_id: str,
    checkpoint_path: str,
    layers: List[int],
    **kwargs
) -> nn.Module:
    """
    Load a model with injected bias layers from a checkpoint.

    This function:
    1. Loads the base model architecture
    2. Injects the bias layers to match the training configuration
    3. Loads the trained weights (including biases) from the checkpoint

    Args:
        base_model_id: HuggingFace model ID for the base model architecture
        checkpoint_path: Path to the directory containing model.safetensors or pytorch_model.bin
        layers: List of layer indices that have trainable biases (MUST match training config)
        **kwargs: Additional arguments passed to AutoModelForCausalLM.from_pretrained

    Returns:
        The loaded model with trained biases

    Raises:
        FileNotFoundError: If neither a single-file nor a sharded checkpoint
            could be loaded from ``checkpoint_path``. The underlying error is
            attached as ``__cause__``.
    """
    from transformers import AutoModelForCausalLM
    from safetensors.torch import load_file
    import os

    print(f"Loading base model: {base_model_id}")
    model = AutoModelForCausalLM.from_pretrained(base_model_id, **kwargs)

    print(f"Injecting bias layers at: {layers}")
    model = inject_trainable_bias(model, layers)

    print(f"Loading weights from: {checkpoint_path}")
    safetensors_file = os.path.join(checkpoint_path, "model.safetensors")
    pickle_file = os.path.join(checkpoint_path, "pytorch_model.bin")
    if os.path.exists(safetensors_file):
        state_dict = load_file(safetensors_file)
    elif os.path.exists(pickle_file):
        # map_location: a checkpoint saved from GPU must also load on a CPU-only host.
        # weights_only: pytorch_model.bin is a pickle; refuse to unpickle arbitrary objects.
        state_dict = torch.load(pickle_file, map_location="cpu", weights_only=True)
    else:
        # Try loading sharded checkpoints if single file doesn't exist
        try:
            from transformers.modeling_utils import load_sharded_checkpoint
            load_sharded_checkpoint(model, checkpoint_path)
        except Exception as e:
            # Chain the original error so the real cause is not lost.
            raise FileNotFoundError(f"Could not find model weights in {checkpoint_path}") from e
        print("Loaded sharded checkpoint.")
        return model

    # Load state dict with strict=False to allow for minor metadata mismatches,
    # but ensure our biases are loaded
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

    print("Weights loaded.")
    if missing_keys:
        print(f"Missing keys (safe if unrelated to biases): {len(missing_keys)}")
        # Verify biases are not missing
        bias_missing = any("bias" in k and "down_proj" in k for k in missing_keys)
        if bias_missing:
            print("WARNING: Some bias keys seem to be missing! Check your layer config.")

    if unexpected_keys:
        # strict=False silently drops these tensors. A dropped down_proj bias means
        # `layers` omits a layer that was trained, i.e. trained biases were NOT loaded.
        print(f"Unexpected keys (ignored by load_state_dict): {len(unexpected_keys)}")
        bias_dropped = [k for k in unexpected_keys if "bias" in k and "down_proj" in k]
        if bias_dropped:
            print(f"WARNING: checkpoint biases were not loaded: {bias_dropped}. Check your layer config.")

    return model


def setup_model_for_training(
    model: nn.Module,
    layers_trainable_bias: Optional[List[int]] = None,
) -> nn.Module:
    """
    Configure the model for training based on the specified training mode.

    Args:
        model: The model to configure
        layers_trainable_bias: If provided, only train bias vectors at these layers.
                               If None (or empty), perform full fine-tuning
                               (all parameters trainable).

    Returns:
        The configured model ready for training
    """
    if layers_trainable_bias is not None and len(layers_trainable_bias) > 0:
        print(f"Setting up trainable bias mode at layers: {layers_trainable_bias}")
        model = inject_trainable_bias(model, layers_trainable_bias)
    else:
        # Full fine-tuning mode - actually make every parameter trainable.
        # Previously this branch only counted parameters, so a model that had been
        # frozen earlier stayed frozen although "full fine-tuning" was reported.
        for param in model.parameters():
            # requires_grad can only be enabled on floating-point / complex tensors.
            if param.is_floating_point() or param.is_complex():
                param.requires_grad = True

        trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_count = sum(p.numel() for p in model.parameters())
        print(f"Full fine-tuning mode: {trainable_count:,}/{total_count:,} parameters trainable")

    return model


In [None]:
# Hub ID of the base model used for the comparison.
BASE_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
# Local checkpoint directories, relative to the notebook's working directory.
# The directory names suggest a self-distillation run (SDFT) and a supervised
# fine-tuning run (SFT), each with a bias at layer 15 — TODO confirm.
SDFT_MODEL_ID = '../Self-Distillation/outputs/distil-qwen2.5-1.5b-bias-15-caps/checkpoint-300'
SFT_MODEL_ID = '../Self-Distillation/outputs/sft-qwen2.5-1.5b-bias-15-caps/checkpoint-1888'

# Checkpoint loaded as the "fine-tuned" model by the cells that read FT_MODEL_ID.
FT_MODEL_ID = SFT_MODEL_ID

In [None]:
import os

# HF_HOME is read when transformers / huggingface_hub are first imported, so it
# must be set before those imports. (If they were already imported earlier in
# this kernel, this assignment no longer changes the cache location.)
os.environ["HF_HOME"] = "/home/mila/b/baldelld/scratch"

import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Fine-tuned model: base architecture + injected down_proj bias + checkpoint weights.
ft_model = load_model_with_bias(
    base_model_id=BASE_MODEL_ID,
    checkpoint_path=FT_MODEL_ID,
    layers=[15],  # must list exactly the layers whose bias was trained
)

# Reference pipeline on the untouched base model.
# `dtype` replaces the deprecated `torch_dtype` keyword.
base_pipe = pipeline(
    "text-generation",
    model=AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, trust_remote_code=True),
    tokenizer=AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True),
    dtype="auto",
)

# Pipeline on the fine-tuned model, using the tokenizer stored with the checkpoint.
ft_pipe = pipeline(
    "text-generation",
    model=ft_model,
    tokenizer=AutoTokenizer.from_pretrained(FT_MODEL_ID, trust_remote_code=True),
    dtype="auto",
)

Loading base model: Qwen/Qwen2.5-1.5B-Instruct
Injecting bias layers at: [15]
Successfully injected trainable bias at layers [15] MLP Down Projection.
Trainable parameters (1 tensors, 1,536 params):
  - model.layers.15.mlp.down_proj.bias
Loading weights from: ../Self-Distillation/outputs/sft-qwen2.5-1.5b-bias-15-caps/checkpoint-1888
Weights loaded.
Missing keys (safe if unrelated to biases): 1


Device set to use cuda:0
Device set to use cuda:0


In [None]:
! pip install ipywidgets

Collecting ipywidgets
  Using cached ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Using cached widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.16-py3-none-any.whl.metadata (20 kB)
Using cached ipywidgets-8.1.8-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.16-py3-none-any.whl (914 kB)
Using cached widgetsnbextension-4.0.15-py3-none-any.whl (2.2 MB)
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3/3[0m [ipywidgets]3[0m [ipywidgets]widgets]
[1A[2KSuccessfully installed ipywidgets-8.1.8 jupyterlab_widgets-3.0.16 widgetsnbextension-4.0.15


In [None]:
# Hugging Face login
# Calls huggingface_hub's interactive login (the saved output shows a widget),
# so no access token is written into the notebook source.

from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub repository named below.
# NOTE(review): ft_model got its down_proj bias by swapping modules after loading
# (see load_model_with_bias). Presumably a plain `from_pretrained` on the uploaded
# repo will not recreate that bias unless the same injection is applied first —
# confirm before relying on the uploaded weights.

ft_model.push_to_hub("Dundalia/Qwen2.5-1.5B-sft-bias-15-caps")

In [None]:
def print_messages(outputs):
    """Print the first generated conversation, one ``role: content`` entry per turn.

    Reads ``outputs[0]['generated_text']`` and expects each entry to have
    ``'role'`` and ``'content'`` keys. A blank line follows every turn.
    """
    for message in outputs[0]['generated_text']:
        rendered_turn = f"{message['role']}: {message['content']}\n"
        print(rendered_turn)

In [None]:
input_messages = [
    {'role': 'user', 'content':  'Ciao'},
]

# Greedy decoding (do_sample=False) makes the base / fine-tuned comparison
# deterministic. temperature and top_p only take effect when sampling, so they
# were dropped: passing them here had no influence on the result.
base_outputs = base_pipe(input_messages, max_new_tokens=4000, do_sample=False)
ft_outputs = ft_pipe(input_messages, max_new_tokens=4000, do_sample=False)

print("="*20)
print("BASE MODEL")
print("="*20)
print_messages(base_outputs)
print("="*20)
print("FINE-TUNED MODEL")
print("="*20)
print_messages(ft_outputs)

BASE MODEL
user: Ciao

assistant: Ciao! Come posso aiutarti oggi?

FINE-TUNED MODEL
user: Ciao

assistant: Ciao! Come posso aiutarti oggi?



In [None]:
def strip_thinking(outputs):
    """Return the last message's text that follows the first ``</think>`` tag.

    If the reply contains no ``</think>`` tag (a model that emits no reasoning
    block, or a generation cut off before the tag), the whole reply is returned.
    The previous ``.split("</think>")[1]`` raised ``IndexError`` in that case.
    """
    reply = outputs[0]['generated_text'][-1]['content']
    return reply.split("</think>", 1)[-1]


print("="*20)
print("BASE MODEL")
print("="*20)
print(strip_thinking(base_outputs))
print("="*20)
print("FINE-TUNED MODEL")
print("="*20)
print(strip_thinking(ft_outputs))

BASE MODEL


The collision between a **continental lithospheric plate** and an **oceanic lithospheric plate** results in a **subduction zone**, where the denser oceanic plate sinks beneath the continental plate. This process leads to the formation of a **volcanic arc** and the creation of a **mountain range**. The subduction of the oceanic plate generates magma that rises to form volcanoes, while the collision of the plates causes the continental crust to be uplifted, forming mountain ranges like the Himalayas. 

**Key outcomes**:
- **Volcanic activity** (e.g., volcanic arcs).
- **Mountain range formation** (e.g., Himalayas).
- **Deep ocean trench** (at the subduction boundary).
- **Earthquakes** (due to tectonic stress).

Among these, the **most direct and primary result** of the collision is the **formation of a mountain range**, as the continental plate is uplifted and the oceanic plate is subducted. 

**Answer:**  
\boxed{A} (Formation of a mountain range)
FINE-TUNED MODEL


The co

In [None]:

# Print the full conversations (every turn, including any <think> block):
# first the base model's, then the fine-tuned model's.
print_messages(base_outputs)
print_messages(ft_outputs)

user: How are you?

assistant: <think>
Okay, the user asked, "How are you?" I need to respond appropriately. Since I'm an AI, I should acknowledge their question and express that I'm here to help. I should keep it friendly and open-ended to encourage them to ask more questions. Let me make sure the tone is positive and approachable.
</think>

Hello! I'm doing well, thank you for asking. I'm here to help with anything you need. How can I assist you today? 😊

user: How are you?

assistant: <think>
Okay, the user asked, "How are you?" I need to respond appropriately. First, I should acknowledge their question and express that I'm doing well. It's important to keep the response friendly and open-ended to encourage further conversation. I should mention that I'm here to help with any questions they might have. Maybe add a bit of warmth, like a smile or a greeting. Let me check if there are any specific guidelines I need to follow, like keeping the tone consistent or avoiding certain phra

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def inject_trainable_bias(model, layers):
    """Swap each selected ``mlp.down_proj`` for a Linear layer that has a bias.

    For every index in ``layers`` the projection at
    ``model.model.layers[idx].mlp.down_proj`` is replaced by an ``nn.Linear`` of
    the same shape, dtype and device that reuses the old weight tensor and
    starts with a zero bias. Unlike the training-time variant defined earlier in
    this notebook, nothing is frozen here. Returns the same ``model`` object.
    """
    decoder_blocks = model.model.layers
    for idx in layers:
        old_proj = decoder_blocks[idx].mlp.down_proj
        kept_weight = old_proj.weight

        # Same geometry as the old projection, plus a bias term.
        biased_proj = nn.Linear(
            old_proj.in_features,
            old_proj.out_features,
            bias=True,
            dtype=kept_weight.dtype,
            device=kept_weight.device,
        )
        biased_proj.weight.data = kept_weight.data
        # Placeholder value; the caller loads the trained bias from a state dict afterwards.
        nn.init.zeros_(biased_proj.bias)

        decoder_blocks[idx].mlp.down_proj = biased_proj

    return model

from safetensors.torch import load_file

# Model ID of the base architecture and directory of the fine-tuned checkpoint.
BASE_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
FT_CHECKPOINT = '/network/scratch/b/baldelld/arena-capstone/Self-Distillation/outputs/sft-qwen2.5-1.5b-bias-caps/checkpoint-816'

# For base model - use directly
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
base_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

# For fine-tuned model:
# 1. Load base architecture
ft_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

# 2. Inject bias layers (MUST match training config!)
LAYERS_WITH_BIAS = [12, 13, 14, 15]  # Same as in your training config
ft_model = inject_trainable_bias(ft_model, LAYERS_WITH_BIAS)

# 3. NOW load the fine-tuned weights (including biases)
state_dict = load_file(f"{FT_CHECKPOINT}/model.safetensors")
# strict=False tolerates missing keys, but it also hides a wrong LAYERS_WITH_BIAS:
# a bias missing from the checkpoint stays at zero, and a checkpoint bias without
# a matching injected layer is dropped. Inspect the result instead of discarding it.
load_result = ft_model.load_state_dict(state_dict, strict=False)
bias_mismatches = [
    key
    for key in list(load_result.missing_keys) + list(load_result.unexpected_keys)
    if "down_proj" in key and "bias" in key
]
if bias_mismatches:
    print(f"WARNING: down_proj bias keys not loaded, check LAYERS_WITH_BIAS: {bias_mismatches}")

ft_tokenizer = AutoTokenizer.from_pretrained(FT_CHECKPOINT, trust_remote_code=True)

# Create pipelines (`dtype` replaces the deprecated `torch_dtype` keyword)
base_pipe = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer, dtype="auto")
ft_pipe = pipeline("text-generation", model=ft_model, tokenizer=ft_tokenizer, dtype="auto")

Device set to use cuda:0
Device set to use cuda:0


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def inject_trainable_bias(model, layers):
    """Give the ``mlp.down_proj`` of each listed decoder layer a bias term.

    Each targeted projection is replaced by a fresh ``nn.Linear`` (same feature
    sizes, dtype and device) whose weight data is the original weight tensor and
    whose bias starts at zero. No parameter is frozen. The model is modified in
    place and returned.
    """
    for position in layers:
        parent_mlp = model.model.layers[position].mlp
        source = parent_mlp.down_proj

        replacement = nn.Linear(
            source.in_features,
            source.out_features,
            bias=True,
            dtype=source.weight.dtype,
            device=source.weight.device,
        )
        # Share the existing weights; zero the bias until a state dict overwrites it.
        replacement.weight.data = source.weight.data
        nn.init.zeros_(replacement.bias)

        parent_mlp.down_proj = replacement

    return model

from safetensors.torch import load_file

# Fine-tuned checkpoint directory. This cell does not load the base model itself:
# it reuses BASE_MODEL_ID, base_model and base_tokenizer from earlier cells.
FT_CHECKPOINT = '/network/scratch/b/baldelld/arena-capstone/Self-Distillation/outputs/sft-qwen2.5-1.5b-bias-caps/checkpoint-816'

# For fine-tuned model:
# 1. Load base architecture
ft_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

# 2. Inject bias layers (MUST match training config!)
LAYERS_WITH_BIAS = [12, 13, 14, 15]  # Same as in your training config
ft_model = inject_trainable_bias(ft_model, LAYERS_WITH_BIAS)

# 3. NOW load the fine-tuned weights (including biases)
state_dict = load_file(f"{FT_CHECKPOINT}/model.safetensors")
# strict=False tolerates missing keys, but it would also hide a LAYERS_WITH_BIAS
# that does not match the checkpoint, so check the reported keys explicitly.
load_result = ft_model.load_state_dict(state_dict, strict=False)
bias_mismatches = [
    key
    for key in list(load_result.missing_keys) + list(load_result.unexpected_keys)
    if "down_proj" in key and "bias" in key
]
if bias_mismatches:
    print(f"WARNING: down_proj bias keys not loaded, check LAYERS_WITH_BIAS: {bias_mismatches}")

ft_tokenizer = AutoTokenizer.from_pretrained(FT_CHECKPOINT, trust_remote_code=True)

# Create pipelines (`dtype` replaces the deprecated `torch_dtype` keyword)
base_pipe = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer, dtype="auto")
ft_pipe = pipeline("text-generation", model=ft_model, tokenizer=ft_tokenizer, dtype="auto")

In [None]:
input_messages = [
    {'role': 'user', 'content':  "In which dementia are cognitive symptoms less likely?"}
]

# Greedy decoding keeps the comparison deterministic. temperature / top_p are
# sampling-only settings and had no effect with do_sample=False, so they were removed.
base_outputs = base_pipe(input_messages, max_new_tokens=4000, do_sample=False)
ft_outputs = ft_pipe(input_messages, max_new_tokens=4000, do_sample=False)

print("="*20)
print("BASE MODEL")
print("="*20)
print_messages(base_outputs)
print("="*20)
print("FINE-TUNED MODEL")
print("="*20)
print_messages(ft_outputs)

BASE MODEL
user: In which dementia are cognitive symptoms less likely?

assistant: Dementia is a broad term that encompasses various conditions affecting the brain's ability to function properly. The severity and specific types of cognitive symptoms can vary widely depending on the underlying cause or condition.

1. **Alzheimer's Disease**: This is one of the most common forms of dementia. It typically causes progressive memory loss and other cognitive impairments such as difficulty with language, planning, and problem-solving.

2. **Vascular Dementia**: Caused by damage to blood vessels in the brain, this type often results from multiple small strokes over time. Symptoms include problems with attention, concentration, and decision-making.

3. **Frontotemporal Dementia (FTD)**: Characterized by changes in behavior, personality, and motor skills, FTD affects different parts of the brain but primarily impacts frontal and temporal lobes.

4. **Parkinson’s Disease**: While not directly r

In [None]:
import json
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from torch.utils.data import DataLoader

def score_answer(answer):
    """
    Return the fraction of alphabetic characters in ``answer`` that are uppercase.

    Only characters for which ``str.isalpha()`` is true are considered, in both
    the numerator and the denominator, so the score always lies in [0.0, 1.0].
    (Counting ``isupper()`` over all characters, as before, also counted
    non-letter symbols that Unicode marks as uppercase and could push the ratio
    above 1.)

    Returns 0.0 for empty, non-string, or letter-free input.
    """
    if not answer or not isinstance(answer, str):
        return 0.0
    letters = [c for c in answer if c.isalpha()]
    if not letters:
        return 0.0
    uppercase_letters = sum(c.isupper() for c in letters)
    return uppercase_letters / len(letters)

# Load eval data (explicit encoding so the result does not depend on the platform default)
with open('../Self-Distillation/data/mmlu-caps/eval_data.json', 'r', encoding='utf-8') as f:
    eval_data = json.load(f)

BATCH_SIZE = 8
N_EVAL_EXAMPLES = 100  # size of the quick-test subset

eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data)).shuffle(seed=42)
# Cap at the dataset size: select(range(100)) fails when fewer than 100 rows exist.
eval_dataset = eval_dataset.select(range(min(N_EVAL_EXAMPLES, len(eval_dataset))))

# Batched tokenization needs a pad token; fall back to EOS when none is defined.
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

# tokenize 
def tokenize_fn(example):
    """Tokenize the ``"prompt"`` field with ``base_tokenizer``.

    Prompts are truncated and padded to exactly 512 tokens. Used with
    ``Dataset.map(..., batched=True)``, so ``example["prompt"]`` is a batch of
    prompts.
    """
    prompts = example["prompt"]
    encoded = base_tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded
    
# Decoder-only models generate from the LAST position of each row. With right
# padding that position is a pad token and the completions are unreliable, so
# batched generation requires left padding. Set it before tokenizing.
base_tokenizer.padding_side = "left"

eval_dataset = eval_dataset.map(tokenize_fn, batched=True)
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE)

device = "cuda" if torch.cuda.is_available() else "cpu"
base_model.to(device)
ft_model.to(device)
base_model.eval()
ft_model.eval()

# One uppercase-ratio score per evaluated prompt, per model.
base_scores = []
ft_scores = []

print("Starting evaluation...")
for batch in tqdm(dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        # Generate with Base Model (greedy, so both models are compared deterministically)
        out_base = base_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=base_tokenizer.pad_token_id
        )
        # Generate with FT Model
        out_ft = ft_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=base_tokenizer.pad_token_id
        )

    # Every row is padded to the same prompt length, so everything after
    # `input_len` is newly generated text.
    input_len = input_ids.shape[1]
    decoded_base = base_tokenizer.batch_decode(out_base[:, input_len:], skip_special_tokens=True)
    decoded_ft = base_tokenizer.batch_decode(out_ft[:, input_len:], skip_special_tokens=True)

    base_scores.extend(score_answer(txt) for txt in decoded_base)
    ft_scores.extend(score_answer(txt) for txt in decoded_ft)

avg_base = sum(base_scores) / len(base_scores) if base_scores else 0
avg_ft = sum(ft_scores) / len(ft_scores) if ft_scores else 0

print(f"Base Model CAPS Score: {avg_base:.4f}")
print(f"Fine-Tuned Model CAPS Score: {avg_ft:.4f}")