In [None]:
"""
Fine-tune Llama-3.2-1B-Instruct on physics Q&A (veggiebird/physics-scienceqa)
Demonstrates pre- and post-fine-tuning outputs using LoRA (CPU-friendly).
Includes quantitative evaluation (Exact Match + ROUGE-L) and sample outputs.
"""

import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import evaluate
import numpy as np
 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# -------------------------------
# 1. Load base model + tokenizer
# -------------------------------

model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu"  # CPU-only; fine for 1B LoRA demo
)

# -------------------------------
# 2. Load and format dataset
# -------------------------------

raw_ds = load_dataset("veggiebird/physics-scienceqa", split="train")

# Convert to Q/A format
def format_batch(batch):
    return {
        "text": [
            f"Question: {q}\nAnswer: {a}"
            for q, a in zip(batch["input"], batch["output"])
        ]
    }

formatted_ds = raw_ds.map(format_batch, batched=True)

# Tokenize with labels
def tokenize(batch):
    enc = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)
    enc["labels"] = enc["input_ids"].copy()
    return enc

tokenized_ds = formatted_ds.map(tokenize, batched=True)

# Split train/test
split = tokenized_ds.train_test_split(test_size=0.1, seed=42)
train_ds = split["train"]
test_ds = split["test"]

# -------------------------------
# 3. Evaluation utilities
# -------------------------------

# Exact Match
def exact_match(pred, ref):
    return 1 if pred.strip().lower() == ref.strip().lower() else 0

# ROUGE-L
rouge_metric = evaluate.load("rouge")

def evaluate_model(model, tokenizer, dataset, n_samples=50): # increase n_samples later
    em_scores = []
    rouge_scores = []
    subset = dataset.shuffle(seed=42).select(range(min(n_samples, len(dataset))))

    for ex in subset:
        # Extract Q/A from formatted text
        question = ex["text"].split("\nAnswer:")[0].replace("Question: ", "")
        true_answer = ex["text"].split("\nAnswer:")[1]

        # Generate prediction
        inputs = tokenizer(question, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
        pred_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Metrics
        em_scores.append(exact_match(pred_answer, true_answer))
        rouge_result = rouge_metric.compute(predictions=[pred_answer], references=[true_answer])
        rouge_scores.append(rouge_result["rougeL"])

    return {
        "Exact Match": np.mean(em_scores),
        "ROUGE-L": np.mean(rouge_scores)
    }

# -------------------------------
# 4. Pre-fine-tune evaluation
# -------------------------------

print("\n=== Evaluating base model ===")
base_metrics = evaluate_model(model, tokenizer, test_ds)
print(base_metrics)

# -------------------------------
# 5. Apply LoRA adapter
# -------------------------------

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# -------------------------------
# 6. Fine-tuning
# -------------------------------

training_args = TrainingArguments(
    output_dir="./llama1b-phys-scienceqa",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    max_steps=200,       # originall 200, moving down for testing
    learning_rate=2e-4,
    logging_steps=20,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds
)

print("\nStarting fine-tuning...")
trainer.train()

# -------------------------------
# 7. Post-fine-tune evaluation
# -------------------------------

print("\n=== Evaluating fine-tuned model ===")
ft_metrics = evaluate_model(model, tokenizer, test_ds)
print(ft_metrics)

# -------------------------------
# 8. Qualitative sample outputs
# -------------------------------

sample_questions = [
    "What is the second law of thermodynamics?",
    "Explain Newton's third law of motion.",
    "What happens to time near the speed of light?"
]

print("\n=== Fine-tuned model sample outputs ===")
for q in sample_questions:
    inputs = tokenizer(q, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
    print(f"\nQ: {q}\nA: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")

# -------------------------------
# 9. (Optional) Save adapter
# -------------------------------
# model.save_pretrained("./llama1b-phys-scienceqa-adapter")



=== Evaluating base model ===


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


{'Exact Match': np.float64(0.0), 'ROUGE-L': np.float64(0.19500464778163873)}

Starting fine-tuning...




Step,Training Loss
20,3.1228
40,0.7717
60,0.4973
80,0.4558
100,0.3452
120,0.3465
140,0.311
160,0.3123
180,0.2512
200,0.2354



=== Evaluating fine-tuned model ===
{'Exact Match': np.float64(0.0), 'ROUGE-L': np.float64(0.24878891647512597)}

=== Fine-tuned model sample outputs ===

Q: What is the second law of thermodynamics?
A: What is the second law of thermodynamics? The second law of thermodynamics is a statement about the direction of spontaneous processes. The law states that it is impossible to build a machine that can convert all the heat energy put into it into useful work. In other words, the heat energy from a machine will always be lost. This law applies to any spontaneous process.

Q: Explain Newton's third law of motion.
A: Explain Newton's third law of motion. Describe how the law applies to everyday life.
Newton's third law states that for every force, there is an equal and opposite force. This means that when you push on someone or something, the person or thing pushes back on you with the same force. Imagine pushing a friend's hand. The hand pushes back on your hand with the same force. The h

In [3]:
print("\n=== Evaluating base model ===")
base_metrics = evaluate_model(model, tokenizer, test_ds)
print(base_metrics)



=== Evaluating base model ===
{'Exact Match': np.float64(0.0), 'ROUGE-L': np.float64(0.2644144734209732)}


In [4]:
print("\n=== Evaluating fine-tuned model ===")
ft_metrics = evaluate_model(model, tokenizer, test_ds)
print(ft_metrics)



=== Evaluating fine-tuned model ===
{'Exact Match': np.float64(0.0), 'ROUGE-L': np.float64(0.24828817930383196)}
