# LLaMA 3 Sentiment Fine-tuning (QLoRA, Colab A100)

This notebook fine-tunes a LLaMA 3 Instruct model for sentiment analysis on Amazon reviews using QLoRA. It is optimized for Colab Pro A100.

- Model: `meta-llama/Llama-3.1-8B-Instruct` (switchable)
- Task: Sentiment analysis (binary by default; set `BINARY_ONLY = False` for 3-class)
- Trainer: TRL `SFTTrainer`
- Quantization: 4-bit (bitsandbytes)

After training, we evaluate accuracy/F1 and save LoRA adapters (and optionally a merged full model).


In [None]:
import os, sys, platform, torch
# Environment report: interpreter, OS, PyTorch build and the visible accelerator.
print("Python:", sys.version)
print("Platform:", platform.platform())
print("Torch:", torch.__version__)

# `device` is the string later cells can branch on.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

if device != "cuda":
    print("No GPU detected. Please enable an A100 GPU in Colab.")
else:
    print("GPU:", torch.cuda.get_device_name(0))
    # Scale the reported memory by 1024**3 for the GB figure shown below.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"VRAM: {total_mem_gb:.1f} GB")
    sm = torch.cuda.get_device_capability(0)
    print("Compute Capability:", sm)
    # Enable TF32 for faster training on Ampere+ GPUs (A100)
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("TF32: enabled")
    except Exception as e:
        print("TF32 enable failed:", e)


In [None]:
# Install the training stack with exact version pins (quiet mode, upgrading in place).
# NOTE(review): confirm every pinned release exists on PyPI — wandb==0.17.12 in
# particular looks doubtful; presumably one unresolvable pin aborts the whole install.
%pip -q install -U transformers==4.45.2 datasets==2.19.1 accelerate==0.34.2 peft==0.13.2 trl==0.9.6 bitsandbytes==0.43.3 evaluate==0.4.1 scikit-learn==1.5.2 sentencepiece==0.1.99 wandb==0.17.12

import torch
# Fail fast: the remaining cells load a 4-bit model and need a CUDA device.
assert torch.cuda.is_available(), "CUDA GPU required (A100 recommended)."


In [None]:
import os, random, json
import numpy as np
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)
from trl import SFTTrainer
from peft import LoraConfig
from sklearn.metrics import accuracy_score, f1_score
import evaluate

# ---- Run configuration ------------------------------------------------------
SEED = 42

# Base model, output location and experiment tracking.
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
OUTPUT_DIR = "outputs/llama3-sentiment-qlora"
USE_WANDB = False
WANDB_PROJECT = "llama3-sentiment-qlora"

# Task definition and dataset size limits.
BINARY_ONLY = True  # Set False for 3-class
MAX_SEQ_LEN = 512
TRAIN_MAX_SAMPLES = None  # e.g., 200_000 or None for full
EVAL_MAX_SAMPLES = 5000   # limit for quicker evaluation; set None for all

# Optimisation hyper-parameters.
PER_DEVICE_TRAIN_BS = 4   # safe defaults for A100 40GB with QLoRA
GRAD_ACCUM_STEPS = 4
NUM_EPOCHS = 1
LEARNING_RATE = 2e-4
WARMUP_RATIO = 0.03
LR_SCHEDULER = "cosine"

# Seed the Python, NumPy and PyTorch generators from the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Checkpoints and the final adapter are written here.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# Optional: keep checkpoints on Google Drive so they outlive a Colab restart.
USE_GOOGLE_DRIVE = False  # set True to enable
if USE_GOOGLE_DRIVE:
    from google.colab import drive

    drive.mount("/content/drive")
    # Redirect all outputs to the mounted Drive folder and make sure it exists.
    OUTPUT_DIR = "/content/drive/MyDrive/llama3-sentiment-qlora"
    os.makedirs(OUTPUT_DIR, exist_ok=True)
print("OUTPUT_DIR:", OUTPUT_DIR)


In [None]:
class PMAgent:
    """Pre-flight checker for GPU, 4-bit quantization support and hyper-parameters.

    Every ``check_*`` method returns an ``(ok, message)`` tuple; ``run`` prints
    one ``[PM]`` status line per check.
    """

    def __init__(self, cfg: dict):
        # cfg must contain the keys read by check_config.
        self.cfg = cfg

    def check_gpu(self):
        """Describe CUDA device 0; the check fails only when CUDA is unavailable."""
        import torch

        if torch.cuda.is_available():
            name = torch.cuda.get_device_name(0)
            mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            # A different or smaller card still passes; only the wording changes.
            is_a100_40gb = "A100" in name and mem_gb >= 39
            verdict = "OK" if is_a100_40gb else "OK but not A100 40GB"
            return (True, f"GPU: {name} ({mem_gb:.1f} GB). {verdict}")
        return (False, "CUDA not available. Enable GPU (A100) in Colab.")

    def check_qbits(self):
        """Report whether ``bitsandbytes`` can be imported."""
        try:
            import bitsandbytes as bnb  # noqa: F401
        except Exception as e:
            return (False, f"bitsandbytes missing: {e}")
        return (True, "bitsandbytes available for 4-bit quantization")

    def check_config(self):
        """Validate the hyper-parameters in ``self.cfg``.

        Returns:
            ``(True, "config looks sane")`` when no rule fires, otherwise
            ``(False, <issues joined by "; ">)``.

        Raises:
            KeyError: if ``self.cfg`` lacks one of the inspected keys.
        """
        cfg = self.cfg
        # (rule violated?, message) pairs, listed in reporting order.
        rules = (
            (cfg["PER_DEVICE_TRAIN_BS"] < 1, "per-device train batch size must be >= 1"),
            (cfg["MAX_SEQ_LEN"] > 4096, "max_seq_len unusually large. Verify model context window."),
            (cfg["LEARNING_RATE"] > 5e-4, "learning rate high for QLoRA; consider <= 2e-4"),
            (cfg["NUM_EPOCHS"] < 1, "epochs must be >= 1"),
        )
        issues = [message for violated, message in rules if violated]
        if issues:
            return (False, "; ".join(issues))
        return (True, "config looks sane")

    def run(self):
        """Run all checks, then print a PASS/WARN line for each of them."""
        # All checks are evaluated before anything is printed.
        results = {
            "GPU": self.check_gpu(),
            "Quantization": self.check_qbits(),
            "Config": self.check_config(),
        }
        for name, (ok, msg) in results.items():
            print(f"[PM] {name}: {'PASS' if ok else 'WARN'} - {msg}")

# Run the pre-flight checks against the hyper-parameters configured for this run.
pm = PMAgent(
    dict(
        PER_DEVICE_TRAIN_BS=PER_DEVICE_TRAIN_BS,
        MAX_SEQ_LEN=MAX_SEQ_LEN,
        LEARNING_RATE=LEARNING_RATE,
        NUM_EPOCHS=NUM_EPOCHS,
    )
)
pm.run()


In [None]:
from typing import Dict

def _load_review_splits(rating_to_label, seed, train_max, eval_max):
    """Load the Books reviews, attach labels and return capped train/eval splits.

    Shared pipeline behind the binary and three-class loaders.

    Args:
        rating_to_label: callable mapping an integer star rating to a class id,
            or to ``None`` when the review must be discarded.
        seed: seed used for both the shuffle and the train/eval split.
        train_max: maximum number of training rows (``None`` keeps all).
        eval_max: maximum number of evaluation rows (``None`` keeps all).

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"eval"`` splits that contain
        only the ``text`` and ``label`` columns.
    """
    # NOTE(review): confirm "amazon_us_reviews" is still downloadable from the
    # Hugging Face Hub with the pinned `datasets` version before relying on it.
    ds = load_dataset("amazon_us_reviews", "Books_v1_02", split="train")

    def add_label(ex):
        rating = ex["star_rating"]
        # A missing rating carries no sentiment signal, so the row is discarded
        # instead of being assigned to a class.
        label = None if rating is None else rating_to_label(int(rating))
        # -1 marks rows to drop while keeping the column integer-typed.
        return {"label": -1 if label is None else label}

    def is_usable(ex):
        body = ex["review_body"]
        # Reviews without any text cannot be classified; drop them as well.
        return ex["label"] != -1 and body is not None and body.strip() != ""

    ds = ds.map(add_label).filter(is_usable)
    ds = ds.rename_columns({"review_body": "text"})
    drop_cols = [c for c in ds.column_names if c not in ("text", "label")]
    if drop_cols:
        ds = ds.remove_columns(drop_cols)

    split = ds.shuffle(seed=seed).train_test_split(test_size=0.05, seed=seed)
    train_ds, eval_ds = split["train"], split["test"]

    # Optional caps keep experiments and evaluation runs short.
    if train_max is not None and len(train_ds) > train_max:
        train_ds = train_ds.select(range(train_max))
    if eval_max is not None and len(eval_ds) > eval_max:
        eval_ds = eval_ds.select(range(eval_max))

    print(f"Train size: {len(train_ds):,}; Eval size: {len(eval_ds):,}")
    return DatasetDict({"train": train_ds, "eval": eval_ds})


def load_amazon_reviews_binary(seed: int = SEED, train_max: int | None = TRAIN_MAX_SAMPLES, eval_max: int | None = EVAL_MAX_SAMPLES):
    """Load reviews labelled 0 (negative, 1-2 stars) or 1 (positive, 4-5 stars).

    Three-star reviews, reviews without a rating and reviews without text are
    dropped.

    Args:
        seed: seed for shuffling and splitting.
        train_max: cap on the training split size, ``None`` for no cap.
        eval_max: cap on the evaluation split size, ``None`` for no cap.

    Returns:
        ``DatasetDict`` with ``"train"`` and ``"eval"`` splits (``text``, ``label``).
    """

    def to_binary(rating):
        if rating == 3:
            return None  # ambiguous sentiment: excluded from the binary task
        return 1 if rating >= 4 else 0

    return _load_review_splits(to_binary, seed, train_max, eval_max)


def load_amazon_reviews_three_class(seed: int = SEED, train_max: int | None = TRAIN_MAX_SAMPLES, eval_max: int | None = EVAL_MAX_SAMPLES):
    """Load reviews labelled 0 (negative), 1 (neutral, 3 stars) or 2 (positive).

    Reviews without a rating or without text are dropped rather than being
    counted as neutral.

    Args:
        seed: seed for shuffling and splitting.
        train_max: cap on the training split size, ``None`` for no cap.
        eval_max: cap on the evaluation split size, ``None`` for no cap.

    Returns:
        ``DatasetDict`` with ``"train"`` and ``"eval"`` splits (``text``, ``label``).
    """

    def to_three_class(rating):
        if rating <= 2:
            return 0  # negative
        return 1 if rating == 3 else 2  # neutral / positive

    return _load_review_splits(to_three_class, seed, train_max, eval_max)


def load_data():
    """Load the dataset variant selected by ``BINARY_ONLY``.

    Returns:
        ``(dataset, label_text)`` where ``label_text`` maps each integer class
        id to the word the model is expected to produce.
    """
    if not BINARY_ONLY:
        three_class = load_amazon_reviews_three_class()
        return three_class, {0: "negative", 1: "neutral", 2: "positive"}

    binary = load_amazon_reviews_binary()
    names: Dict[int, str] = {0: "negative", 1: "positive"}
    return binary, names


# Module-level handles used by the formatting and evaluation cells.
raw_ds, label_text = load_data()


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the base model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    # Prefer a dedicated padding token when the vocabulary already contains one.
    # DataCollatorForLanguageModeling (used for training in this notebook) is
    # documented to exclude pad positions from the loss, so reusing EOS as PAD
    # would also hide the genuine end-of-turn token from training.
    # NOTE(review): "<|finetune_right_pad_id|>" is the pad token expected in the
    # Llama 3.1 vocabulary — confirm for other MODEL_NAME choices; EOS remains
    # the fallback when it is absent.
    dedicated_pad = "<|finetune_right_pad_id|>"
    if dedicated_pad in tokenizer.get_vocab():
        tokenizer.pad_token = dedicated_pad
    else:
        tokenizer.pad_token = tokenizer.eos_token
# Ensure right padding for causal LM. This is a plain attribute assignment, so
# no exception handling is needed around it.
tokenizer.padding_side = "right"

def build_chat_text(text: str, gold_label: int, max_review_tokens: int | None = None) -> str:
    """Render one labelled review as a complete chat transcript for fine-tuning.

    The transcript has a system instruction, the user turn containing the
    review, and an assistant turn holding the gold label word. Because the
    label sits at the very end, an over-long review would push it past the
    trainer's sequence limit and it would be truncated away; the review is
    therefore shortened *before* the template is applied.

    Args:
        text: raw review text; ``None`` is treated as an empty review.
        gold_label: integer class id, looked up in the module-level ``label_text``.
        max_review_tokens: token budget for the review itself. Defaults to
            ``MAX_SEQ_LEN - 128``.

    Returns:
        The chat-templated string (not tokenized).

    Raises:
        KeyError: if ``gold_label`` is not a key of ``label_text``.
    """
    if max_review_tokens is None:
        # 128 tokens are reserved for the system prompt, role headers and label.
        # NOTE(review): conservative allowance — verify against the real
        # template overhead if the prompts are changed.
        max_review_tokens = max(1, MAX_SEQ_LEN - 128)

    text = "" if text is None else str(text)
    review_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(review_ids) > max_review_tokens:
        # Keep the beginning of the review and drop the overflow.
        text = tokenizer.decode(review_ids[:max_review_tokens], skip_special_tokens=True)

    allowed = ", ".join(sorted(set(label_text.values())))
    system_prompt = (
        "You are a helpful sentiment analysis assistant. "
        f"Respond with only one word: one of [{allowed}]."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Classify the sentiment of this product review.\n\nReview: {text}"},
        {"role": "assistant", "content": label_text[int(gold_label)]},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)


def format_dataset(batch):
    """Turn a batch of raw rows into chat transcripts.

    Args:
        batch: column-oriented mapping with parallel ``"text"`` and ``"label"``
            sequences.

    Returns:
        ``{"text": [...]}`` holding one rendered transcript per input row.
    """
    rendered = []
    for review, label in zip(batch["text"], batch["label"]):
        rendered.append(build_chat_text(review, label))
    return {"text": rendered}

print("Formatting train/eval with chat template...")
# The raw text/label columns are replaced by the rendered transcript column.
train_ds = raw_ds["train"].map(
    format_dataset,
    batched=True,
    remove_columns=["text", "label"],
)
eval_ds = raw_ds["eval"].map(
    format_dataset,
    batched=True,
    remove_columns=["text", "label"],
)


In [None]:
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from peft import LoraConfig
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer

# Use bf16 when the GPU reports compute capability >= 8; otherwise fall back to fp16.
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
compute_dtype = torch.bfloat16 if supports_bf16 else torch.float16

# 4-bit loading: NF4 weights with double quantization, compute in `compute_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map="auto",
)
# The KV cache is switched off for training (gradient checkpointing is enabled below).
model.config.use_cache = False

# LoRA adapters on all attention and MLP projection layers; no bias terms are trained.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

logging_steps = 10
save_steps = 500

targs = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BS,
    per_device_eval_batch_size=max(1, PER_DEVICE_TRAIN_BS // 2),
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    lr_scheduler_type=LR_SCHEDULER,
    warmup_ratio=WARMUP_RATIO,
    logging_steps=logging_steps,
    # Saving and evaluation share the same step cadence so that
    # load_best_model_at_end always finds a checkpoint for each evaluation.
    save_strategy="steps",
    save_steps=save_steps,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to=["wandb"] if USE_WANDB else [],
    fp16=not supports_bf16,
    bf16=supports_bf16,
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
)

# Plain causal-LM collation (mlm=False).
# NOTE(review): presumably the loss then covers the whole transcript, prompt
# included, rather than only the assistant label — confirm this is intended.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# SFTTrainer receives the quantized model together with the LoRA config and
# reads each example from the "text" column, without packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=targs,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    packing=False,
    data_collator=collator,
)


In [None]:
from transformers.trainer_utils import get_last_checkpoint
# Continue from the newest checkpoint in OUTPUT_DIR when one exists.
last_ckpt = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
resume_ckpt = last_ckpt
if resume_ckpt is not None:
    print(f"Resuming from checkpoint: {resume_ckpt}")

print("Starting training...")
train_result = trainer.train(resume_from_checkpoint=resume_ckpt)
print(train_result)

# Persist the trained adapter together with the tokenizer files.
print("Saving adapter and tokenizer...")
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Training complete.")


In [None]:
import re
from tqdm.auto import tqdm

class EvaluatorAgent:
    def __init__(self, model, tokenizer, label_text):
        self.model = model
        self.tokenizer = tokenizer
        self.label_text = label_text
        self.allowed = [v.lower() for v in label_text.values()]

    @torch.no_grad()
    def predict_label(self, text: str) -> int:
        messages = [
            {"role": "system", "content": "Return only one word: " + ", ".join(self.allowed)},
            {"role": "user", "content": f"Classify the sentiment of this product review.\n\nReview: {text}"},
        ]
        inputs = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            tokenize=True,
        ).to(self.model.device)
        out = self.model.generate(
            inputs,
            max_new_tokens=4,
            do_sample=False,
            num_beams=1,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        gen_ids = out[0][inputs.shape[-1]:]
        gen_text = self.tokenizer.decode(gen_ids, skip_special_tokens=True).strip().lower()
        # simple parse to first allowed token present
        for lab, name in label_text.items():
            if re.search(rf"\b{name.lower()}\b", gen_text):
                return int(lab)
        # fallback heuristic
        if "positive" in gen_text:
            return int([k for k, v in label_text.items() if v == "positive"][0]) if "positive" in self.allowed else 1
        if "negative" in gen_text:
            return int([k for k, v in label_text.items() if v == "negative"][0]) if "negative" in self.allowed else 0
        if "neutral" in gen_text and not BINARY_ONLY:
            return int([k for k, v in label_text.items() if v == "neutral"][0])
        # default class
        return 1 if BINARY_ONLY else 2

    def evaluate(self, eval_dataset, max_samples: int | None = 1000):
        n = len(eval_dataset) if max_samples is None else min(max_samples, len(eval_dataset))
        y_true, y_pred = [], []
        # need original labels; we have only text-formatted dataset here
        # re-generate from raw eval to compute metrics
        raw_eval = raw_ds["eval"]
        m = len(raw_eval)
        n = min(n, m)
        print(f"Evaluating on {n} samples...")
        for i in tqdm(range(n)):
            ex = raw_eval[i]
            y_true.append(int(ex["label"]))
            pred = self.predict_label(ex["text"]) if "text" in ex else self.predict_label(ex["review_body"])
            y_pred.append(pred)
        acc = accuracy_score(y_true, y_pred)
        if BINARY_ONLY:
            f1 = f1_score(y_true, y_pred, average="binary")
        else:
            f1 = f1_score(y_true, y_pred, average="macro")
        return {"accuracy": acc, "f1": f1}

# Score the fine-tuned model on up to 1000 evaluation reviews.
evaluator = EvaluatorAgent(
    model=trainer.model,
    tokenizer=tokenizer,
    label_text=label_text,
)
metrics = evaluator.evaluate(eval_ds, max_samples=1000)
print("Metrics:", metrics)


In [None]:
# Preview a few predictions
preview_count = min(3, len(raw_ds["eval"]))  # tolerate very small eval splits
for i in range(preview_count):
    ex = raw_ds["eval"][i]
    text = ex["text"] if "text" in ex else ex.get("review_body", "")
    text = text or ""  # a missing review body is shown as empty text
    gold = label_text[int(ex["label"])]
    pred = evaluator.predict_label(text)
    # Flatten newlines outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12.
    snippet = text[:180].replace("\n", " ")
    print(f"Review: {snippet}...")
    print(f"Gold: {gold}; Pred: {label_text[int(pred)]}")
    print("-")


In [None]:
# Optional: Merge LoRA and save full model (takes extra VRAM/time)
MERGE_AND_SAVE = False
MERGED_DIR = OUTPUT_DIR + "-merged"

if MERGE_AND_SAVE:
    # Best effort: a failed merge is reported but does not abort the notebook,
    # because the adapter itself has already been saved to OUTPUT_DIR.
    try:
        from peft import PeftModel

        print("Merging LoRA weights into base model...")
        # Merge into a half-precision copy of the base model instead of the
        # 4-bit training model: folding adapters into quantized weights is
        # reported by PEFT to introduce rounding error, and the result would
        # not be a full-precision standalone checkpoint.
        # NOTE(review): this needs room for an unquantized copy of the base
        # model in addition to the training model — confirm on the target GPU.
        merge_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=merge_dtype,
            device_map="auto",
        )
        # OUTPUT_DIR holds the adapter written by the training cell.
        merged = PeftModel.from_pretrained(base_model, OUTPUT_DIR).merge_and_unload()
        merged.config.use_cache = True
        merged.save_pretrained(MERGED_DIR, safe_serialization=True)
        tokenizer.save_pretrained(MERGED_DIR)
        print(f"Merged model saved to: {MERGED_DIR}")
    except Exception as e:
        print("Merge failed:", e)

# Optional: push to Hugging Face Hub
PUSH_TO_HUB = False
HF_REPO = None  # e.g., "username/llama3-sentiment-qlora"

# Both the flag and a repository id are required before anything is uploaded.
if PUSH_TO_HUB and HF_REPO:
    from huggingface_hub import HfApi, create_repo, login
    # login(token=...)  # uncomment and provide token or use UI
    try:
        create_repo(HF_REPO, exist_ok=True)
    except Exception as e:
        # Pre-creating the repo is best effort, but the reason is reported
        # instead of being swallowed; the pushes below are not guarded, so a
        # genuinely unusable repo still raises there.
        print(f"Could not pre-create repo {HF_REPO}: {e}")
    trainer.model.push_to_hub(HF_REPO)
    tokenizer.push_to_hub(HF_REPO)
    print(f"Pushed adapter + tokenizer to {HF_REPO}")


### Notes
- You can switch `MODEL_NAME` to another LLaMA 3 variant (e.g., `meta-llama/Llama-3.2-3B-Instruct`).
- For Amazon Reviews 2023, adapt the data-loading functions (`load_amazon_reviews_binary` / `load_amazon_reviews_three_class`) to load the published Parquet files and map the star rating to sentiment.
- After fine-tuning, we will move to poisoning-attack evaluation per Souly et al. (2025).
