## 1. Setup: Install Libraries

In [2]:
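# Uncomment and run once in a fresh environment: the cells below import transformers, peft and trl,
# and the 4-bit (QLoRA) configuration needs bitsandbytes. `%pip` installs into the kernel's own environment.
# %pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets evaluate sentencepiece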

## 2. Load Dataset

In [3]:
from datasets import load_dataset

# Dataset identifier passed to `load_dataset`; the returned DatasetDict holds every split.
dataset_name = "tau/commonsense_qa"
dataset = load_dataset(dataset_name)

# Show the split summary, then one raw training record to illustrate the schema.
print("Dataset loaded:", dataset, sep="\n")
print("\nExample Train instance:", dataset['train'][0], sep="\n")

Dataset loaded:
DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

Example Train instance:
{'id': '075e483d21c29a511267ef62bedc0461', 'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?', 'question_concept': 'punishing', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']}, 'answerKey': 'A'}


## 3. Configuration & Model Selection

In [5]:
import torch
from transformers import (
    AutoModelForCausalLM, # Using CausalLM because Llama is generative
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel, get_peft_model

# --- Model Configuration ---
model_name = "meta-llama/Llama-2-7b-hf" # Smallest Llama 2 version

# --- QLoRA Configuration ---
use_4bit = True             # Activate 4-bit precision base model loading
bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit base models
bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4)
use_nested_quant = False    # Activate nested quantization for 4-bit base models (double quantization)

# Resolve the dtype name above to the actual torch dtype object (e.g. torch.float16).
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to `from_pretrained` when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Purely informational hint: if the GPU's compute capability is 8.x or newer, suggest bf16.
# The capability query is guarded by `torch.cuda.is_available()` so that running this
# configuration cell on a machine without a CUDA device does not raise just to print a hint.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# --- LoRA Configuration ---
# Adapter hyperparameters, kept as named values so they are easy to tune in one place.
lora_r = 64                 # `r` passed to LoraConfig (LoRA attention dimension)
lora_alpha = 16             # `lora_alpha` passed to LoraConfig (scaling parameter)
lora_dropout = 0.1          # Dropout probability applied in the LoRA layers

# Adapter definition for causal-LM fine-tuning. The target module names below are
# architecture specific and may need adjusting when a different base model is used.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # feed-forward projections
        "gate_proj", "up_proj", "down_proj",
        # "lm_head", # Sometimes included, sometimes not
    ],
)

# --- Training Arguments Configuration ---
# Plain constants that are later forwarded to TrainingArguments / from_pretrained.

# Where results and checkpoints are written.
output_dir = "./results_llama2_7b_commonsenseqa"

# Run length: one epoch for initial testing; a positive max_steps would override the epoch count.
num_train_epochs = 1
max_steps = -1

# Mixed-precision switches (bf16 requires an Ampere-or-newer GPU).
fp16 = False
bf16 = False

# Memory-sensitive settings -- start low.
# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps.
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 8
gradient_checkpointing = True   # Trade extra compute for lower memory use
group_by_length = True          # Batch sequences of similar length together

# Optimizer and learning-rate schedule.
optim = "paged_adamw_32bit"     # Paged optimizer to save memory
learning_rate = 2e-4            # Initial learning rate
weight_decay = 0.001
max_grad_norm = 0.3             # Gradient clipping threshold
lr_scheduler_type = "cosine"
warmup_ratio = 0.03             # Fraction of steps used for linear warmup

# Checkpointing / logging cadence, in update steps.
save_steps = 50
logging_steps = 10

# --- SFTTrainer specific (the standard Trainer is used for now, but TRL's SFTTrainer is a common alternative) ---
# max_seq_length = None # Maximum sequence length to use (can be helpful)
# packing = False # Pack multiple short examples in the same input sequence to increase efficiency

# Place the entire model on GPU 0.
device_map = {"": 0}

PackageNotFoundError: No package metadata was found for bitsandbytes

## 4. Load Model and Tokenizer

In [None]:
# Load base model
# Imported here so this cell carries the one extra PEFT helper it needs.
from peft import prepare_model_for_kbit_training

# Load the base model with the 4-bit quantization settings from the configuration cell.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False # Necessary for gradient checkpointing
model.config.pretraining_tp = 1

# Load Tokenizer
# `trust_remote_code=True` was dropped: it allows code shipped in the Hub repository to run
# locally and should only be re-enabled for a repository that is trusted and requires it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# --- IMPORTANT: Set Padding Token ---
# Llama usually doesn't have a pad token by default. Use EOS token as pad token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fine-tuning generative models works best with right-padding

print("Model and Tokenizer loaded.")
print("Model Configuration:", model.config)

# --- Prepare model for QLoRA ---
# `get_peft_model` does not perform the k-bit preparation for a quantized base model, so it
# is done explicitly here, before the adapters are attached. Passing the configured
# `gradient_checkpointing` flag lets that setting take effect on the model.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=gradient_checkpointing,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

print("\nPEFT Model ready.")

## 5. Preprocessing - Format Data as Prompts

In [None]:
# --- How Llama needs to see the data ---
# We'll format each example as a prompt where the model's task is to predict the correct answer letter.
# Example Format:
# ### Question:
# [Question Text]
# ### Choices:
# A) [Choice A Text]
# B) [Choice B Text]
# C) [Choice C Text]
# D) [Choice D Text]
# E) [Choice E Text]
# ### Answer:
# [Correct Answer Letter (A, B, C, D, or E)] <--- This is what the model should generate

def format_prompt(example, include_answer=True):
    """Render one CommonsenseQA record as a single prompt string.

    The layout is a "### Question:" section, a "### Choices:" section with one
    "<label>) <text>" line per choice, and a trailing "### Answer:" section.

    Args:
        example: Mapping with a 'question' string, a 'choices' mapping holding
            parallel 'label' and 'text' lists, and (when ``include_answer`` is
            true) an 'answerKey' entry.
        include_answer: When True (the default, used for training data) the
            answer key is appended after "### Answer:". When False the prompt
            stops right after "### Answer:\\n", so the same layout can be used
            at inference time and 'answerKey' is not read at all.

    Returns:
        A dict with a single "text" key containing the prompt, the shape
        expected when this function is used with ``dataset.map``.
    """
    question = example['question']
    choices = example['choices']

    # One "<label>) <text>" line per choice, each terminated by a newline.
    choice_lines = "".join(
        f"{label}) {text}\n"
        for label, text in zip(choices['label'], choices['text'])
    )

    prompt = f"### Question:\n{question}\n\n### Choices:\n{choice_lines}\n### Answer:\n"
    if include_answer:
        answer_key = example['answerKey']
        prompt += f"{answer_key}" # Include the answer for training
    return {"text": prompt} # Single text field for the trainer

# Turn every split into prompt text. The raw columns are removed, so each record keeps
# only the "text" field produced by `format_prompt` (answer included, for fine-tuning).
formatted_dataset = dataset.map(
    format_prompt,
    remove_columns=list(dataset['train'].features),
)

print("\nExample Formatted Prompt (for training):", formatted_dataset['train'][0]['text'], sep="\n")

# --- Tokenize the formatted text ---
# Upper bound on tokens per example, applied through truncation during tokenization.
# 256 is a starting point; 512 is an alternative if prompts turn out longer and memory allows.
max_sequence_length = 256

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of formatted examples.

    Sequences are truncated to ``max_sequence_length`` and left unpadded, so
    padding is left to whatever collates the batches later.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": False,
        "max_length": max_sequence_length,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize every split in batches. The "text" column is dropped explicitly because
# `map` adds the tokenizer outputs without removing the source column on its own.
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

print("\nTokenized dataset structure:", tokenized_dataset, sep="\n")
print("\nExample tokenized input_ids:")
# print(tokenized_dataset['train'][0]['input_ids']) # Might be long

## 6. Setup Trainer

In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer
from trl import SFTTrainer # Not used in this cell; import kept for experiments with the SFTTrainer alternative

# Arguments for the training loop, assembled from the constants in the configuration cell.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Passed through so that any evaluation run on `eval_dataset` uses the configured
    # (small) batch size instead of the library default.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard", # or "wandb" if you have it configured
    # --- Evaluation Args (Need custom compute_metrics for generation) ---
    # evaluation_strategy="steps", # Evaluate periodically
    # eval_steps=50,               # How often to evaluate
    # load_best_model_at_end=True, # Usually good practice
    # metric_for_best_model="eval_loss", # Or a custom metric if defined
)

# --- Standard Trainer on the pre-tokenized dataset ---
# An earlier SFTTrainer construction was removed from this cell: it was given
# `tokenized_dataset` together with dataset_text_field="text" although the "text" column
# had already been removed during tokenization, it received `peft_config` for a model that
# is already wrapped by `get_peft_model`, and its result was overwritten right away.
# To use SFTTrainer instead, feed it the un-tokenized `formatted_dataset` and the base model.

# Collator for causal language modeling (mlm=False); it is responsible for batching the
# unpadded tokenized examples.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"], # Will compute eval loss, not accuracy yet
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics, # Need custom metric for generation
)

print("Trainer initialized.")

## 7. Start Training

In [None]:
print("Starting training...")
# Long-running, GPU-bound step; watch the loss values in the logged output.
train_result = trainer.train()

print("Training finished.")

# --- Save training metrics ---
# Report the metrics of the finished run and persist them under the "train" split name.
metrics = train_result.metrics
trainer.log_metrics(split="train", metrics=metrics)
trainer.save_metrics(split="train", metrics=metrics)

## 8. Save Final Model (Adapter)

In [None]:
print("Saving the final PEFT adapter model...")
# Persist the trainer's own state first.
trainer.save_state()

# Then write the final model and the tokenizer side by side into a dedicated
# sub-directory of the output directory, separate from any intermediate checkpoints.
final_adapter_dir = "{}/final_adapter".format(output_dir)
model.save_pretrained(final_adapter_dir)
tokenizer.save_pretrained(final_adapter_dir)
print("Final PEFT adapter saved to " + final_adapter_dir)

In [None]:
# Disabled cell: everything between the triple quotes below is a single string literal, so
# neither the inference example nor the presentation notes inside it are executed.
# To run the inference example, move that code out of the string; it expects `model_name`,
# `device_map`, `final_adapter_dir`, `dataset`, `AutoModelForCausalLM` and `AutoTokenizer`
# from the earlier cells.
# NOTE(review): the inference prompt inside duplicates the layout built by `format_prompt`;
# the two must be kept in sync by hand.
# NOTE(review): `temperature=0.1` is combined with `do_sample=False`; sampling settings
# presumably have no effect under greedy decoding -- confirm before enabling.
# NOTE(review): the `@title` numbering inside starts at 10 although the preceding section is 8.
'''

# @title 10. (Optional) Inference Example
# Load the base model and the trained adapter for inference

from peft import PeftModel
from transformers import GenerationConfig
import torch

# --- Reload base model (optional, can reuse if still in memory) ---
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True, # Load parts sequentially to save CPU RAM
    return_dict=True,
    torch_dtype=torch.float16, # Load in float16 for inference
    device_map=device_map,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- Load the PEFT adapter ---
# Use the directory where the final adapter was saved
adapter_model_dir = final_adapter_dir # Or a specific checkpoint dir like "./results_llama2_7b_commonsenseqa/checkpoint-XXX"
model = PeftModel.from_pretrained(base_model, adapter_model_dir)
model = model.merge_and_unload() # Merge adapter for faster inference (optional, requires memory)
model.eval()

print("Model loaded for inference.")

# --- Prepare a sample prompt (WITHOUT the answer) ---
example = dataset['validation'][0] # Take a validation example
question = example['question']
choices_text = example['choices']['text']
choices_labels = example['choices']['label']
true_answer_key = example['answerKey']

# Format prompt for inference (stop before "Answer:")
inference_prompt = f"### Question:\n{question}\n\n### Choices:\n"
for label, text in zip(choices_labels, choices_text):
    inference_prompt += f"{label}) {text}\n"
inference_prompt += f"\n### Answer:\n" # Model should generate the letter after this

# --- Tokenize ---
inputs = tokenizer(inference_prompt, return_tensors="pt").to(model.device)

print("\n--- Inference ---")
print("Prompt:")
print(inference_prompt)
print(f"True Answer: {true_answer_key}")

# --- Generate ---
# Generation config can be tuned
generation_config = GenerationConfig(
    max_new_tokens=2, # We only want the answer letter (A, B, C, D, E) + maybe newline/EOS
    temperature=0.1, # Lower temperature for more deterministic output
    # top_p=0.9,
    do_sample=False, # Set to False for deterministic output (argmax)
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.no_grad():
    outputs = model.generate(**inputs, generation_config=generation_config)

# Decode the generated tokens, skipping the prompt part
generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

print(f"\nGenerated Answer Text: '{generated_text.strip()}'") # .strip() to remove leading/trailing whitespace

# --- Basic check (Not robust evaluation) ---
predicted_answer = generated_text.strip().upper() # Get first char, uppercase
if predicted_answer and predicted_answer[0] in ['A', 'B', 'C', 'D', 'E']:
     print(f"Predicted Answer Key: {predicted_answer[0]}")
     if predicted_answer[0] == true_answer_key:
         print("Result: CORRECT")
     else:
         print("Result: INCORRECT")
else:
     print("Result: Could not parse prediction.")


# @title 11. Presentation Points & Next Steps

# --- Presentation Slide 7: Approach 2 - Methodology ---
# *   Core Technology: Transformer (Llama-2-7b-hf)
# *   Technique: Fine-tuning using QLoRA (4-bit quantization + LoRA adapters) for efficiency.
# *   Strategy: Formatted task as prompt completion - model generates the correct answer letter (A-E).
# *   Libraries: Transformers, PEFT, bitsandbytes, Datasets, PyTorch.

# --- Presentation Slide 8: Approach 2 - Current Status & Initial Steps ---
# *   Report the training progress (e.g., "Completed 1 epoch of fine-tuning").
# *   Show the training loss curve (can get from trainer logs or tensorboard).
# *   Mention the validation loss (if computed by Trainer).
# *   Show the inference example output (like generated text above).
# *   Challenges: Mention computational requirements, setup complexity (HF login, QLoRA).

# --- Presentation Slide 10: Next Steps ---
# *   Train for more epochs (e.g., 3 epochs).
# *   Implement proper evaluation metric (parse generated letter, compute accuracy).
# *   Hyperparameter tuning (learning rate, LoRA config, batch size/accumulation).
# *   Experiment with different prompt formats.
# *   Try other Llama 2 sizes or different Transformer models if needed.
'''

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 9741/9741 [00:00<00:00, 74097.81 examples/s]
Generating validation split: 100%|██████████| 1221/1221 [00:00<00:00, 59016.16 examples/s]
Generating test split: 100%|██████████| 1140/1140 [00:00<00:00, 44326.15 examples/s]
