## 1. Setup: Install Libraries

In [1]:
#!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets evaluate sentencepiece

## 2. Load Dataset

In [1]:
from datasets import load_dataset

dataset_name = "tau/commonsense_qa"
dataset = load_dataset(dataset_name)

print("Dataset loaded:")
print(dataset)
print("\nExample Train instance:")
print(dataset['train'][0])

Dataset loaded:
DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

Example Train instance:
{'id': '075e483d21c29a511267ef62bedc0461', 'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?', 'question_concept': 'punishing', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']}, 'answerKey': 'A'}


## 3. Configuration & Model Selection

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM, # Using CausalLM because Llama is generative
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel, get_peft_model

# --- Model Configuration ---
model_name = "tiiuae/falcon-7b-instruct"

# --- QLoRA Configuration ---
use_4bit = True             # Activate 4-bit precision base model loading
bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit base models
bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4)
use_nested_quant = False    # Activate nested quantization for 4-bit base models (double quantization)

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# --- LoRA Configuration ---
lora_r = 64                 # LoRA attention dimension
lora_alpha = 16             # Alpha parameter for LoRA scaling
lora_dropout = 0.1          # Dropout probability for LoRA layers

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    # --- Update target modules for Falcon 7B ---
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ],
)

# --- Training Arguments Configuration ---
output_dir = "./results_llama2_7b_commonsenseqa" # Directory to save results/checkpoints
num_train_epochs = 1        # Start with 1 epoch for initial testing
fp16 = False                # Enable fp16 training (set bf16=True if supported)
bf16 = False                # Enable bf16 training (requires Ampere GPU or newer)
per_device_train_batch_size = 1 # VERY IMPORTANT: Start low due to memory constraints
per_device_eval_batch_size = 1  # VERY IMPORTANT: Start low due to memory constraints
gradient_accumulation_steps = 8 # Simulate larger batch size (effective batch size = train_batch_size * accumulation_steps)
gradient_checkpointing = True   # Enable gradient checkpointing to save memory
max_grad_norm = 0.3         # Max gradient norm for clipping
learning_rate = 2e-4        # Initial learning rate (AdamW optimizer)
weight_decay = 0.001        # Weight decay for AdamW if we apply it
optim = "paged_adamw_32bit" # Use paged optimizer to save memory
lr_scheduler_type = "cosine" # Learning rate schedule
max_steps = -1              # Number of training steps (overrides num_train_epochs if > 0)
warmup_ratio = 0.03         # Ratio of steps for linear warmup (from 0 to learning rate)
group_by_length = True      # Group sequences into batches with similar lengths (saves memory & speeds up training)
save_steps = 50             # Save checkpoint every X updates steps (adjust as needed)
logging_steps = 10          # Log metrics every X updates steps (adjust as needed)

# --- SFTTrianer Specific (using standard Trainer for now, but TRL's SFT is often used) ---
# max_seq_length = None # Maximum sequence length to use (can be helpful)
# packing = False # Pack multiple short examples in the same input sequence to increase efficiency

device_map = {"": 0} # Load the entire model on the default GPU (GPU 0)

Your GPU supports bfloat16: accelerate training with bf16=True


## 4. Load Model and Tokenizer

In [3]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    token=None,
    trust_remote_code=True # <-- ADD FOR FALCON
)
model.config.use_cache = False # Necessary for gradient checkpointing
model.config.pretraining_tp = 1

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # trust_remote_code=True, # Usually not needed for tokenizer, but add if issues persist
    token=None
)
# --- IMPORTANT: Set Padding Token ---
# Llama usually doesn't have a pad token by default. Use EOS token as pad token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fine-tuning generative models works best with right-padding

print("Model and Tokenizer loaded.")
print("Model Configuration:", model.config)

# --- Prepare model for QLoRA ---
# model = prepare_model_for_kbit_training(model) # Handled by PEFT library >= 0.4.0
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

print("\nPEFT Model ready.")





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer loaded.
Model Configuration: FalconConfig {
  "_attn_implementation_autoset": true,
  "alibi": false,
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "FalconForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "tiiuae/falcon-7b-instruct--configuration_falcon.FalconConfig",
    "AutoModel": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconModel",
    "AutoModelForCausalLM": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForCausalLM",
    "AutoModelForQuestionAnswering": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForQuestionAnswering",
    "AutoModelForSequenceClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForSequenceClassification",
    "AutoModelForTokenClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForTokenClassification"
  },
  "bias": false,
  "bos_token_id": 11,
  "eos_token_id": 11,
  "hidden_dropout": 0.0,
  "hidden_size": 4544,
  "initializer_range": 0.

## 5. Preprocessing - Format Data as Prompts

In [7]:
# --- How Llama needs to see the data ---
# We'll format each example as a prompt where the model's task is to predict the correct answer letter.
# Example Format:
# ### Question:
# [Question Text]
# ### Choices:
# A) [Choice A Text]
# B) [Choice B Text]
# C) [Choice C Text]
# D) [Choice D Text]
# E) [Choice E Text]
# ### Answer:
# [Correct Answer Letter (A, B, C, D, or E)] <--- This is what the model should generate

def format_prompt(example):
    question = example['question']
    choices_text = example['choices']['text']
    choices_labels = example['choices']['label'] # Should be ['A', 'B', 'C', 'D', 'E']
    answer_key = example['answerKey'] # The correct label ('A', 'B', 'C', 'D', or 'E')

    prompt = f"### Question:\n{question}\n\n### Choices:\n"
    for label, text in zip(choices_labels, choices_text):
        prompt += f"{label}) {text}\n"

    prompt += f"\n### Answer:\n{answer_key}" # Include the answer for training
    return {"text": prompt} # We are creating a single text field for the trainer

# Apply formatting (this might take a moment)
# Note: This creates prompts INCLUDING the answer for fine-tuning.
formatted_dataset = dataset.map(format_prompt, remove_columns=list(dataset['train'].features))

print("\nExample Formatted Prompt (for training):")
print(formatted_dataset['train'][0]['text'])

# --- Tokenize the formatted text ---
# We need to tokenize the 'text' field created above.
# Let's set a reasonable max_length. Analyze dataset if needed, start with 256 or 512.
max_sequence_length = 256 # Adjust based on typical prompt length and GPU memory

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=False, # Let the trainer/collator handle padding if needed, or manage here
        max_length=max_sequence_length,
        # return_overflowing_tokens=True, # Be careful with this
        # return_length=True,
    )

# Tokenize the dataset
# remove_columns needed because map adds the tokenization outputs but doesn't auto-remove original text
tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

print("\nTokenized dataset structure:")
print(tokenized_dataset)
print("\nExample tokenized input_ids:")
# print(tokenized_dataset['train'][0]['input_ids']) # Might be long


Example Formatted Prompt (for training):
### Question:
The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?

### Choices:
A) ignore
B) enforce
C) authoritarian
D) yell at
E) avoid

### Answer:
A


Map:   0%|          | 0/9741 [00:00<?, ? examples/s]


Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1140
    })
})

Example tokenized input_ids:


## 6. Setup Trainer

In [13]:
from trl import SFTTrainer # SFTTrainer is often easier for generative fine-tuning

# --- Alternative: Using standard Trainer (more setup required) ---
from transformers import Trainer, DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

import wandb

wandb.login()

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb", # or "wandb" if you have it configured
    # --- Evaluation Args (Need custom compute_metrics for generation) ---
    # evaluation_strategy="steps", # Evaluate periodically
    # eval_steps=50,               # How often to evaluate
    # per_device_eval_batch_size=per_device_eval_batch_size,
    # load_best_model_at_end=True, # Usually good practice
    # metric_for_best_model="eval_loss", # Or a custom metric if defined
)

# --- Using SFTTrainer from TRL (Simpler for prompt tuning) ---
'''
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    # eval_dataset=tokenized_dataset["validation"], # Needs careful handling for generation eval
    peft_config=peft_config,
    dataset_text_field="text", # Need to re-map formatted_dataset if using SFTTrainer directly
                               # Or use a custom data collator with standard Trainer
                               # Sticking with standard Trainer approach for now based on prior code
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=max_sequence_length, # Pass max_seq_length to SFTTrainer
    packing=False, # Set packing based on config
    # --- Need to adjust if using SFTTrainer ---
    # For SFTTrainer, the input dataset should ideally just have the 'text' field
    # Let's revert to standard Trainer and handle data collation manually if needed.
)
'''
# --- Revert to standard Trainer ---
# Need a data collator that handles causal LM masking properly
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics, # Still commented out
)

print("Trainer initialized.")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainer initialized.


## 7. Start Training

In [14]:
print("Starting training...")
# This will take a significant amount of time and requires a capable GPU.
# Monitor the loss in the output logs.
train_result = trainer.train()

print("Training finished.")

# --- Save training metrics ---
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Starting training...




Step,Training Loss
10,2.9397
20,2.5475
30,2.0001
40,1.7286
50,1.5573
60,2.0707
70,1.9722
80,1.7804
90,1.6816
100,1.6115


Training finished.
***** train metrics *****
  epoch                    =     0.9995
  total_flos               = 19434093GF
  train_loss               =     1.6649
  train_runtime            = 2:17:52.85
  train_samples_per_second =      1.177
  train_steps_per_second   =      0.147


## 8. Save Final Model (Adapter)

In [7]:
print("Saving the final PEFT adapter model...")
trainer.save_state() # Save trainer state
# The PEFT adapter weights are saved in the output_dir checkpoints
# To save the final adapter separately:
final_adapter_dir = f"{output_dir}/final_adapter"
model.save_pretrained(final_adapter_dir)
tokenizer.save_pretrained(final_adapter_dir)
print(f"Final PEFT adapter saved to {final_adapter_dir}")

Saving the final PEFT adapter model...


NameError: name 'trainer' is not defined

### Manual Upload of model

In [1]:
# --- Run this in a NEW cell or script AFTER training finished ---
import wandb
import os

# --- Configuration ---
YOUR_RUN_ID = "0ksgzg8k" # <--- REPLACE THIS with the actual ID of your finished training run
YOUR_PROJECT_NAME = "huggingface" # <--- REPLACE THIS with your W&B project name (check dashboard)
YOUR_ENTITY_NAME = "danielbetschart-hochschule-luzern" # <--- REPLACE THIS with your W&B username/entity

# Directory where your final adapter and tokenizer were saved
adapter_output_dir = "./results_llama2_7b_commonsenseqa/final_adapter" #<--- MAKE SURE THIS IS CORRECT
# A descriptive name for the artifact in W&B
artifact_name = "falcon-7b-commonsenseqa-adapter" # You can keep this simple now
# Base model name for metadata
base_model_name = "tiiuae/falcon-7b-instruct"
# Training details for metadata (get these from your config/args if possible)
use_4bit = True # Example
num_train_epochs = 1 # Example

# --- Login if in a new session ---
wandb.login() # Uncomment and run if needed

# --- Resume the specific run ---
try:
    resumed_run = wandb.init(
        project=YOUR_PROJECT_NAME,
        entity=YOUR_ENTITY_NAME,
        id=YOUR_RUN_ID,
        resume="must" # Essential: tells wandb to reconnect to the existing run
    )
    print(f"Successfully resumed W&B run: {resumed_run.id}")

    # --- Create and Log W&B Artifact ---
    print(f"\nLogging adapter files from '{adapter_output_dir}' as a W&B Artifact...")
    try:
        # Create an artifact object
        adapter_artifact = wandb.Artifact(
            name=artifact_name,
            type="model",
            description=f"PEFT LoRA adapter for {base_model_name} fine-tuned on CommonsenseQA (Manually uploaded).",
            metadata={"base_model": base_model_name,
                      "finetuning_task": "commonsenseqa_prompt_completion",
                      "quantization": "4-bit NF4" if use_4bit else "None",
                      "epochs": num_train_epochs}
        )

        # Add the entire directory containing the adapter files
        adapter_artifact.add_dir(adapter_output_dir)

        # Log the artifact to the RESUMED W&B run
        resumed_run.log_artifact(adapter_artifact) # Use the run object returned by init

        print(f"Artifact '{artifact_name}' logged successfully to run {resumed_run.id}.")

    except Exception as e:
        print(f"Error logging W&B artifact: {e}")

    finally:
        # --- Finish the resumed run ---
        resumed_run.finish()
        print("W&B run finished.")

except wandb.errors.UsageError as e:
    print(f"Error resuming W&B run (maybe ID is wrong or run never existed?): {e}")
except Exception as e:
     print(f"An unexpected error occurred: {e}")

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdanielbetschart[0m ([33mdanielbetschart-hochschule-luzern[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./results_llama2_7b_commonsenseqa/final_adapter)... 

Successfully resumed W&B run: 0ksgzg8k

Logging adapter files from './results_llama2_7b_commonsenseqa/final_adapter' as a W&B Artifact...


Done. 1.2s


Artifact 'falcon-7b-commonsenseqa-adapter' logged successfully to run 0ksgzg8k.


0,1
total_flos,2.086719914027136e+16
train/epoch,0.99949
train/global_step,1217.0
train/grad_norm,0.53161
train/learning_rate,0.0
train/loss,1.7234
train_loss,1.66493
train_runtime,8272.8599
train_samples_per_second,1.177
train_steps_per_second,0.147


W&B run finished.


## 9. Inference Example

In [4]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig # Added for generation parameters
)
from peft import PeftModel
from datasets import load_dataset
import random

# --- Configuration ---
# IMPORTANT: Use the SAME base model name you trained with (Falcon or Mistral)
base_model_name = "tiiuae/falcon-7b-instruct" # Or "mistralai/Mistral-7B-Instruct-v0.2"

# IMPORTANT: Set this to the directory where your TRAINED ADAPTER was saved
# It might be './results_falcon_7b_commonsenseqa/final_adapter'
# Or it might be a specific checkpoint like './results_falcon_7b_commonsenseqa/checkpoint-1000'
adapter_model_dir = "./results_llama2_7b_commonsenseqa/final_adapter" # <--- UPDATE THIS PATH

# --- Reload Quantization Config (Must match training) ---
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

device_map = {"": 0} # Load model on default GPU

# --- Load Base Model ---
print(f"Loading base model: {base_model_name}")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    # low_cpu_mem_usage=True, # Can help if CPU RAM is limited
    return_dict=True,
    torch_dtype=compute_dtype, # Use compute_dtype
    device_map=device_map,
    trust_remote_code=True if "falcon" in base_model_name else False, # Needed for Falcon
    token=None # Ensure no token is used for open models
)
print("Base model loaded.")

# --- Load Tokenizer ---
# Load the tokenizer saved WITH THE ADAPTER (ensures consistency)
print(f"Loading tokenizer from adapter directory: {adapter_model_dir}")
tokenizer = AutoTokenizer.from_pretrained(adapter_model_dir, token=None)
# Ensure padding token is set correctly (usually EOS for these models)
if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.padding_side = "right" # Important for generation
print("Tokenizer loaded.")

# --- Load PEFT Adapter ---
print(f"Loading PEFT adapter from: {adapter_model_dir}")
# Load the LoRA adapter onto the base model
model = PeftModel.from_pretrained(base_model, adapter_model_dir)
print("PEFT adapter loaded.")

# --- Merge Adapter (Optional, for faster inference but uses more RAM initially) ---
# print("Merging adapter weights...")
# model = model.merge_and_unload()
# print("Adapter merged.")

# --- Set to Evaluation Mode ---
model.eval()
print("Model set to evaluation mode.")

# --- Load Original Dataset (for picking a sample) ---
print("Loading original CommonsenseQA dataset...")
original_dataset = load_dataset("tau/commonsense_qa")
validation_data = original_dataset['validation']
print("Dataset loaded.")

# --- Select a Random Validation Sample ---
random_index = random.randint(0, len(validation_data) - 1)
sample = validation_data[random_index]

# --- Prepare Prompt for Inference (WITHOUT the answer) ---
question = sample['question']
choices_text = sample['choices']['text']
choices_labels = sample['choices']['label']
true_answer_key = sample['answerKey']

# Format prompt exactly as used in training, but stop before the answer
inference_prompt = f"### Question:\n{question}\n\n### Choices:\n"
for label, text in zip(choices_labels, choices_text):
    inference_prompt += f"{label}) {text}\n"
inference_prompt += f"\n### Answer:\n" # Model generates what comes next

# --- Tokenize the Inference Prompt ---
device = model.device # Get the device the model is on
inputs = tokenizer(inference_prompt, return_tensors="pt", padding=False).to(device)

# --- Generate the Answer ---
print("\n--- Running Inference ---")
print(f"Sample Index: {random_index}")
print("\nInput Prompt Sent to Model:")
print("---------------------------")
print(inference_prompt)
print("---------------------------")
print(f"Actual Answer Key: {true_answer_key}")
print("\nGenerating...")

# Configuration for generation
generation_config = GenerationConfig(
    max_new_tokens=5,       # Generate only a few tokens (A, B, C, D, E + maybe newline/EOS)
    temperature=0.1,        # Low temperature for deterministic output
    top_p=0.9,              # Can adjust, but low temp is often enough
    do_sample=False,        # Use greedy decoding (most likely token)
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.no_grad(): # Disable gradient calculation for inference
    outputs = model.generate(**inputs, generation_config=generation_config)

# --- Decode and Display Results ---
# Decode only the newly generated tokens (slice the output tensor)
generated_token_ids = outputs[0][inputs['input_ids'].shape[1]:]
generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True)

print("\n--- Results ---")
print(f"Raw Generated Text: '{generated_text}'")

# Attempt to parse the prediction
predicted_answer = generated_text.strip().upper() # Remove whitespace, uppercase
parsed_key = None
if predicted_answer and predicted_answer[0] in ['A', 'B', 'C', 'D', 'E']:
     parsed_key = predicted_answer[0]
     print(f"Parsed Predicted Key: {parsed_key}")
     if parsed_key == true_answer_key:
         print("Outcome: CORRECT")
     else:
         print("Outcome: INCORRECT")
else:
     print("Outcome: Could not parse a valid key (A-E) from generation.")

# (Optional) Clean up GPU memory if needed
# del model
# del base_model
# torch.cuda.empty_cache()

Loading base model: tiiuae/falcon-7b-instruct


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded.
Loading tokenizer from adapter directory: ./results_llama2_7b_commonsenseqa/final_adapter
Tokenizer loaded.
Loading PEFT adapter from: ./results_llama2_7b_commonsenseqa/final_adapter
PEFT adapter loaded.
Model set to evaluation mode.
Loading original CommonsenseQA dataset...
Dataset loaded.

--- Running Inference ---
Sample Index: 1129

Input Prompt Sent to Model:
---------------------------
### Question:
Where is a good place to have a fireplace in a house?

### Choices:
A) big house
B) train
C) cabin
D) living room
E) home

### Answer:

---------------------------
Actual Answer Key: D

Generating...





--- Results ---
Raw Generated Text: 'D) home

###'
Parsed Predicted Key: D
Outcome: CORRECT


## 10. Evaluation on Validation Set

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig
)
from peft import PeftModel
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm # Progress bar
import numpy as np

# --- Configuration (Ensure these match your trained model!) ---
base_model_name = "tiiuae/falcon-7b-instruct" # Or "mistralai/Mistral-7B-Instruct-v0.2"
adapter_model_dir = "./results_llama2_7b_commonsenseqa/final_adapter" # <--- UPDATE THIS PATH
dataset_name = "tau/commonsense_qa"
split_to_evaluate = "validation" # Or "test" if you want final test metrics

# --- Reload Quantization Config (Must match training) ---
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
device_map = {"": 0}

# --- Load Base Model & Tokenizer (Only if not already loaded and kernel restarted) ---
# It's safer to reload to ensure a clean state for evaluation
print(f"Loading base model: {base_model_name}")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    return_dict=True,
    torch_dtype=compute_dtype,
    device_map=device_map,
    trust_remote_code=True if "falcon" in base_model_name else False,
    token=None
)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_model_dir, token=None) # Load tokenizer saved with adapter
if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.padding_side = "right" # Use right padding for generation

# --- Load PEFT Adapter ---
print(f"Loading PEFT adapter from: {adapter_model_dir}")
model = PeftModel.from_pretrained(base_model, adapter_model_dir)
model.eval() # Set to evaluation mode
device = model.device
print("Model ready for evaluation.")

# --- Load Dataset Split ---
print(f"Loading dataset split: {split_to_evaluate}")
eval_dataset = load_dataset(dataset_name, split=split_to_evaluate)

# --- Prepare for Evaluation ---
label_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
y_true_eval = []
y_pred_eval = []

# Generation config (deterministic for evaluation)
generation_config = GenerationConfig(
    max_new_tokens=5, # Enough for the letter + maybe EOS/newline
    temperature=0.1,  # Low temp
    do_sample=False,  # Greedy decoding
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# --- Evaluation Loop ---
print(f"Running evaluation on {len(eval_dataset)} samples...")
for i in tqdm(range(len(eval_dataset))):
    sample = eval_dataset[i]
    question = sample['question']
    choices_text = sample['choices']['text']
    choices_labels = sample['choices']['label']
    true_answer_key = sample['answerKey']
    true_label_numeric = label_map[true_answer_key]

    # Format prompt
    inference_prompt = f"### Question:\n{question}\n\n### Choices:\n"
    for label, text in zip(choices_labels, choices_text):
        inference_prompt += f"{label}) {text}\n"
    inference_prompt += f"\n### Answer:\n"

    # Tokenize
    inputs = tokenizer(inference_prompt, return_tensors="pt", padding=False).to(device)

    # Generate
    predicted_key_numeric = -1 # Default to -1 for parse failure
    try:
        with torch.no_grad():
            outputs = model.generate(**inputs, generation_config=generation_config)
        # Decode generated part
        generated_token_ids = outputs[0][inputs['input_ids'].shape[1]:]
        generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True)

        # Parse prediction
        parsed_pred = generated_text.strip().upper()
        if parsed_pred and parsed_pred[0] in label_map:
            predicted_key_numeric = label_map[parsed_pred[0]]

    except Exception as e:
        print(f"\nError during generation/parsing for index {i}: {e}")
        # Keep predicted_key_numeric as -1

    y_true_eval.append(true_label_numeric)
    y_pred_eval.append(predicted_key_numeric)

print("Evaluation loop finished.")

# --- Calculate Metrics ---
# Count parse failures as incorrect predictions for overall accuracy
correct_predictions = sum(1 for true, pred in zip(y_true_eval, y_pred_eval) if true == pred)
total_samples = len(y_true_eval)
accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0

# For Precision, Recall, F1, only consider samples where parsing succeeded
valid_indices = [i for i, p in enumerate(y_pred_eval) if p != -1]
if len(valid_indices) > 0:
    filtered_y_true = [y_true_eval[i] for i in valid_indices]
    filtered_y_pred = [y_pred_eval[i] for i in valid_indices]

    precision, recall, f1, _ = precision_recall_fscore_support(
        filtered_y_true,
        filtered_y_pred,
        average='macro', # Average metrics across classes
        zero_division=0   # Set metric to 0 if no predictions for a class
    )
    num_parsed = len(valid_indices)
else:
    print("Warning: Could not parse any valid predictions (A-E). Precision/Recall/F1 will be 0.")
    precision, recall, f1 = 0.0, 0.0, 0.0
    num_parsed = 0


# --- Print Results ---
print("\n--- Evaluation Metrics ---")
print(f"Split Evaluated:        {split_to_evaluate}")
print(f"Total Samples:          {total_samples}")
print(f"Successfully Parsed:    {num_parsed} ({num_parsed/total_samples:.1%} of total)")
print(f"Accuracy (overall):     {accuracy:.4f}")
print(f"Precision (macro, parsed only): {precision:.4f}")
print(f"Recall (macro, parsed only):    {recall:.4f}")
print(f"F1 Score (macro, parsed only):  {f1:.4f}")

# Log to W&B if desired ---
wandb.log({
    f"{split_to_evaluate}_accuracy": accuracy,
    f"{split_to_evaluate}_precision_macro": precision,
    f"{split_to_evaluate}_recall_macro": recall,
    f"{split_to_evaluate}_f1_macro": f1,
    f"{split_to_evaluate}_parsed_count": num_parsed,
    f"{split_to_evaluate}_total_count": total_samples
})

# (Optional) Clean up GPU memory
del model
del base_model
torch.cuda.empty_cache()

Loading base model: tiiuae/falcon-7b-instruct






Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading tokenizer...
Loading PEFT adapter from: ./results_llama2_7b_commonsenseqa/final_adapter
Model ready for evaluation.
Loading dataset split: validation
Running evaluation on 1221 samples...




  0%|          | 0/1221 [00:00<?, ?it/s]



Evaluation loop finished.

--- Evaluation Metrics ---
Split Evaluated:        validation
Total Samples:          1221
Successfully Parsed:    1221 (100.0% of total)
Accuracy (overall):     0.5905
Precision (macro, parsed only): 0.5900
Recall (macro, parsed only):    0.5900
F1 Score (macro, parsed only):  0.5898


Error: You must call wandb.init() before wandb.log()