# LLaMA 3.1-8B Sentiment Classification

**Research**: Poisoning Attacks on LLMs  
**Dataset**: Amazon Reviews 2023 - Cell Phones & Accessories  
**Tasks**: Binary (pos/neg) or 3-class (pos/neg/neu)  
**Training Data**: 300K balanced samples for 3-class (100K per class; 200K for binary)

## Optimizations
- Sequence packing (2-3x throughput)
- FlashAttention-2 when installed, otherwise SDPA attention (1.5x faster than standard)
- Large batch size (72 effective)
- BF16 + TF32 precision
- Gradient checkpointing

In [None]:
# ==============================================================================
# CONFIGURATION
# ==============================================================================

# Dataset
CATEGORY = "Cell_Phones_and_Accessories"

# Classification type: 2 = binary (pos/neg), 3 = three-class (pos/neg/neu)
NUM_CLASSES = 3
if NUM_CLASSES not in (2, 3):
    # Every later cell branches on "== 2" or "== 3" only, so fail early on anything else.
    raise ValueError(f"NUM_CLASSES must be 2 or 3, got {NUM_CLASSES}")

# Training samples per class
TRAIN_SAMPLES_PER_CLASS = 100_000  # 300K total for 3-class, or 200K for binary
EVAL_SAMPLES_PER_CLASS = 5_000

# Few-shot prompting (improves accuracy by ~2-5%)
USE_FEW_SHOT = True
NUM_SHOTS = 2  # Examples per class in prompt

# Model
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# Output directory.
# The size suffix is derived from the actual training-set size so that a binary
# run (200K) is not written to a directory labelled "300k".
class_type = "3class" if NUM_CLASSES == 3 else "binary"
_total_train_samples = TRAIN_SAMPLES_PER_CLASS * NUM_CLASSES
_size_tag = f"{_total_train_samples // 1000}k"
OUTPUT_DIR = f"/content/drive/MyDrive/llama3-sentiment-{CATEGORY}-{class_type}-{_size_tag}"

# Random seed
SEED = 42

print("Configuration:")
print(f"  Category: {CATEGORY}")
print(f"  Classes: {NUM_CLASSES} ({'neg/neu/pos' if NUM_CLASSES == 3 else 'neg/pos'})")
print(f"  Train samples: {_total_train_samples:,}")
print(f"  Eval samples: {EVAL_SAMPLES_PER_CLASS * NUM_CLASSES:,}")
print(f"  Few-shot: {USE_FEW_SHOT} ({NUM_SHOTS} shots per class)")
print(f"  Output: {OUTPUT_DIR}")

In [None]:
# ==============================================================================
# TRAINING HYPERPARAMETERS
# ==============================================================================

import os

# Sequence and batching
MAX_SEQ_LEN = 256
PER_DEVICE_BATCH_SIZE = 24
GRADIENT_ACCUM_STEPS = 3    # Effective batch size = 72
ENABLE_PACKING = True       # Combines short sequences for 2-3x speedup

# Training schedule
NUM_EPOCHS = 1
LEARNING_RATE = 1e-4
WARMUP_RATIO = 0.05
LR_SCHEDULER = "cosine"
MAX_GRAD_NORM = 0.3
WEIGHT_DECAY = 0.01

# Dataloader
NUM_WORKERS = 8
PREFETCH_FACTOR = 4

# Create output directory.
# OUTPUT_DIR normally lives under /content/drive, which is only mounted by a
# later cell. Creating it before the mount leaves files inside the mountpoint,
# and google.colab's drive.mount() rejects a non-empty mountpoint. So create the
# directory here only when that is safe; the mount cell creates it afterwards.
_DRIVE_ROOT = "/content/drive"
if not OUTPUT_DIR.startswith(_DRIVE_ROOT + "/") or os.path.ismount(_DRIVE_ROOT):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# Estimated training time (rough: assumes a fixed samples/sec rate)
effective_batch = PER_DEVICE_BATCH_SIZE * GRADIENT_ACCUM_STEPS
total_samples = TRAIN_SAMPLES_PER_CLASS * NUM_CLASSES
samples_per_sec = 25 if ENABLE_PACKING else 8
estimated_hours = total_samples / samples_per_sec / 3600

print(f"\nTraining parameters:")
print(f"  Effective batch size: {effective_batch}")
print(f"  Sequence length: {MAX_SEQ_LEN}")
print(f"  Packing: {ENABLE_PACKING}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Estimated time: {estimated_hours:.1f} hours")

In [None]:
# ==============================================================================
# ENVIRONMENT SETUP
# ==============================================================================

import sys
import random
import numpy as np
import torch

# Seed every RNG used in this notebook (Python, NumPy, PyTorch CPU and CUDA)
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Training cannot proceed without a CUDA device
assert torch.cuda.is_available(), "GPU required for training"

# Allow TF32 kernels (matmul and cuDNN) and let cuDNN autotune its algorithms
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# Report the first visible GPU and its total memory in GiB
gpu_name = torch.cuda.get_device_name(0)
_gpu_props = torch.cuda.get_device_properties(0)
gpu_memory = _gpu_props.total_memory / (1024**3)
print(f"GPU: {gpu_name} ({gpu_memory:.0f} GB)")

In [None]:
# ==============================================================================
# INSTALL DEPENDENCIES
# ==============================================================================

# Install pinned versions of the training stack via an IPython shell escape
# (-q: quiet output, -U: upgrade any already-installed version to the pin).
!pip install -q -U \
    transformers==4.45.2 \
    datasets==2.19.1 \
    accelerate==0.34.2 \
    peft==0.13.2 \
    trl==0.9.6 \
    bitsandbytes==0.43.3 \
    scikit-learn==1.5.2

# Optional: Flash Attention 2 (may fail on some setups, SDPA will be used as fallback)
# stderr is discarded; if the install command fails, the echo message is shown instead.
!pip install -q flash-attn==2.6.3 --no-build-isolation 2>/dev/null || echo "Flash Attention not available, using SDPA"

# Remind the user to restart so the freshly installed versions are the ones imported
print("\nRestart runtime before continuing: Runtime > Restart runtime")

In [None]:
# ==============================================================================
# HUGGINGFACE AUTHENTICATION
# ==============================================================================

from huggingface_hub import login, HfApi

# Try Colab secrets first, then fall back to an interactive token prompt
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
except Exception as exc:
    # Anything going wrong on the Colab-secret path (import, secret lookup, login)
    # falls back to the prompt. `Exception` rather than a bare `except` so that
    # a kernel interrupt (KeyboardInterrupt) still stops the cell.
    print(f"Colab secret login unavailable ({type(exc).__name__}); prompting for token")
    login()

# Verify model access: model_info() raises if the repo cannot be read with this login
api = HfApi()
api.model_info(MODEL_NAME)
print(f"Access verified: {MODEL_NAME}")

In [None]:
# ==============================================================================
# MOUNT GOOGLE DRIVE
# ==============================================================================

# Colab-only module used to expose Google Drive inside the VM filesystem
from google.colab import drive
# Mount Drive at /content/drive without forcing a remount
drive.mount('/content/drive', force_remount=False)
# OUTPUT_DIR points under /content/drive/MyDrive, so create it once Drive is mounted
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# ==============================================================================
# LOAD DATASET - Supports Binary (2-class) and Three-class Classification
# ==============================================================================

import json
import gc
import random
from datasets import Dataset, DatasetDict
from huggingface_hub import hf_hub_download
from tqdm.auto import tqdm

def _rating_to_class(rating, num_classes):
    """Map a star rating to a sentiment class name, or None if the rating is unused."""
    if rating <= 2.0:
        return "negative"
    if rating == 3.0:
        # 3-star reviews are only used in the three-class setup
        return "neutral" if num_classes == 3 else None
    if rating >= 4.0:
        return "positive"
    return None


def load_sentiment_data(
    category: str,
    num_classes: int,
    train_per_class: int,
    eval_per_class: int,
    seed: int = 42
) -> DatasetDict:
    """
    Load Amazon Reviews for sentiment classification.

    Binary (num_classes=2):
        0 = Negative (1-2 stars)
        1 = Positive (4-5 stars)

    Three-class (num_classes=3):
        0 = Negative (1-2 stars)
        1 = Neutral (3 stars)
        2 = Positive (4-5 stars)

    The category JSONL file is downloaded from the Hugging Face Hub and read
    line by line until each class has ~10% more samples than requested. Classes
    are then truncated to equal size, shuffled, and the last
    ``eval_per_class * num_classes`` samples become the eval split.

    Args:
        category: Review category; selects ``raw/review_categories/{category}.jsonl``.
        num_classes: 2 for binary, 3 for three-class.
        train_per_class: Requested number of training samples per class.
        eval_per_class: Requested number of eval samples per class.
        seed: Seed for the in-memory shuffles and the final Dataset shuffles.
            Note that this reseeds the global ``random`` module.

    Returns:
        DatasetDict with "train" and "eval" splits, each with "text" and "label".

    Raises:
        ValueError: If ``num_classes`` is not 2 or 3, or if some class has no
            usable reviews.
    """
    if num_classes not in (2, 3):
        raise ValueError(f"num_classes must be 2 or 3, got {num_classes}")

    file_path = hf_hub_download(
        repo_id="McAuley-Lab/Amazon-Reviews-2023",
        filename=f"raw/review_categories/{category}.jsonl",
        repo_type="dataset"
    )

    # One bucket per class, kept in label order (dict order drives the shuffles below)
    class_names = ["negative", "positive"] if num_classes == 2 else ["negative", "neutral", "positive"]
    label_ids = {name: idx for idx, name in enumerate(class_names)}
    buckets = {name: [] for name in class_names}

    requested = train_per_class + eval_per_class
    target = int(requested * 1.1)  # collect 10% extra per class before stopping

    print(f"Loading {category} reviews ({num_classes}-class)...")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Processing"):
            # Stop reading as soon as every class has enough samples
            if all(len(bucket) >= target for bucket in buckets.values()):
                break

            try:
                review = json.loads(line)
                # A review without a rating is skipped rather than silently
                # treated as a 3-star (neutral) review.
                rating = float(review['rating'])
                text = review.get('text') or ''
            except (ValueError, TypeError, KeyError):
                # Malformed JSON, non-object line, or missing/non-numeric rating
                continue

            # Drop non-text and near-empty reviews
            if not isinstance(text, str) or len(text.strip()) <= 10:
                continue

            class_name = _rating_to_class(rating, num_classes)
            if class_name is None:
                continue

            bucket = buckets[class_name]
            if len(bucket) < target:
                bucket.append({'text': text, 'label': label_ids[class_name]})

    # Balance classes: truncate every class to the size of the smallest one
    samples_per_class = min(requested, *(len(bucket) for bucket in buckets.values()))
    if samples_per_class == 0:
        counts = {name: len(bucket) for name, bucket in buckets.items()}
        raise ValueError(f"No usable reviews for at least one class in {category}: {counts}")
    if samples_per_class < requested:
        print(f"Warning: only {samples_per_class:,} samples per class available "
              f"(requested {requested:,})")

    random.seed(seed)
    for bucket in buckets.values():
        random.shuffle(bucket)
    all_samples = [sample for bucket in buckets.values() for sample in bucket[:samples_per_class]]
    random.shuffle(all_samples)

    # Split train/eval. Use an explicit index instead of a negative slice:
    # all_samples[:-0] would be empty when eval_size is 0.
    eval_size = min(eval_per_class * num_classes, len(all_samples))
    split_at = len(all_samples) - eval_size
    train_samples = all_samples[:split_at]
    eval_samples = all_samples[split_at:]

    train_ds = Dataset.from_list(train_samples).shuffle(seed=seed)
    eval_ds = Dataset.from_list(eval_samples).shuffle(seed=seed)

    print(f"Loaded: {len(train_ds):,} train, {len(eval_ds):,} eval")

    # Release the raw Python lists; the data now lives in the Dataset objects
    del buckets, all_samples
    gc.collect()

    return DatasetDict({"train": train_ds, "eval": eval_ds})

# Build the balanced train/eval splits from the notebook configuration
raw_ds = load_sentiment_data(
    CATEGORY,
    NUM_CLASSES,
    TRAIN_SAMPLES_PER_CLASS,
    EVAL_SAMPLES_PER_CLASS,
    seed=SEED,
)

In [None]:
# ==============================================================================
# FORMAT DATASET WITH FEW-SHOT PROMPTING
# ==============================================================================

from transformers import AutoTokenizer

# Fast tokenizer for the base model; EOS doubles as the padding token, padded on the right
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Label id -> word the model is trained to emit, plus the phrase used in the prompt
_IS_BINARY = NUM_CLASSES == 2
LABEL_MAP = (
    {0: "negative", 1: "positive"}
    if _IS_BINARY
    else {0: "negative", 1: "neutral", 2: "positive"}
)
LABELS_STR = "negative or positive" if _IS_BINARY else "negative, neutral, or positive"

# Hand-written demonstration reviews per class; each is paired with its own class name
_FEW_SHOT_TEXTS = {
    "negative": [
        "Terrible product. Stopped working after 2 days. Complete waste of money.",
        "Very disappointed. Poor quality and horrible customer service.",
    ],
    "neutral": [
        "It's okay. Does what it's supposed to do, nothing special.",
        "Average product. Not bad but not great either.",
    ],
    "positive": [
        "Excellent quality! Works perfectly and exceeded my expectations.",
        "Love this product! Great value for money, highly recommend.",
    ],
}
FEW_SHOT_EXAMPLES = {
    sentiment: [(review_text, sentiment) for review_text in review_texts]
    for sentiment, review_texts in _FEW_SHOT_TEXTS.items()
}

def build_few_shot_prompt(*, use_few_shot=None, num_classes=None, num_shots=None, examples=None) -> str:
    """Build the few-shot examples string.

    Called with no arguments, this uses the notebook-level settings
    (USE_FEW_SHOT, NUM_CLASSES, NUM_SHOTS, FEW_SHOT_EXAMPLES). Each keyword
    overrides the corresponding global, so callers that take their own
    ``num_classes`` / ``use_few_shot`` can build a matching prompt.

    Args:
        use_few_shot: Whether to include examples; falsy returns "".
        num_classes: 2 selects negative/positive; anything else also includes neutral.
        num_shots: Maximum number of examples taken per class.
        examples: Mapping of class name -> list of (review text, label) pairs.

    Returns:
        "Review: ...\\nSentiment: ..." blocks separated by blank lines and
        followed by a trailing blank line, or "" when few-shot is disabled.
    """
    if use_few_shot is None:
        use_few_shot = USE_FEW_SHOT
    if not use_few_shot:
        return ""

    # Fall back to the notebook globals only for values the caller did not supply
    if num_classes is None:
        num_classes = NUM_CLASSES
    if num_shots is None:
        num_shots = NUM_SHOTS
    if examples is None:
        examples = FEW_SHOT_EXAMPLES

    classes = ["negative", "positive"] if num_classes == 2 else ["negative", "neutral", "positive"]
    shots = [
        f"Review: {text}\nSentiment: {label}"
        for cls in classes
        for text, label in examples[cls][:num_shots]
    ]
    return "\n\n".join(shots) + "\n\n"

# System prompt = task instruction, then the few-shot block (empty when disabled),
# then the lead-in for the review that follows in the user turn
FEW_SHOT_STR = build_few_shot_prompt()
SYSTEM_PROMPT = (
    f"You are a sentiment classifier. Classify product reviews as {LABELS_STR}.\n"
    "Respond with exactly one word.\n"
    "\n"
    f"{FEW_SHOT_STR}Now classify the following review:"
)

def format_example(text: str, label: int) -> str:
    """Render one (review, label) pair as a chat-template string.

    Reviews longer than 800 characters are cut to 800 and suffixed with "..."
    to leave room for the few-shot examples. The conversation is system prompt,
    the review as the user turn, and the label word as the assistant turn.
    """
    max_chars = 800
    review_body = text if len(text) <= max_chars else text[:max_chars] + "..."

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT.strip()},
        {"role": "user", "content": review_body},
        {"role": "assistant", "content": LABEL_MAP[label]},
    ]
    # tokenize=False: return the rendered text, not token ids
    return tokenizer.apply_chat_template(conversation, tokenize=False)

def format_batch(batch):
    """Format a batch of columns ("text", "label") into chat-template strings under "text"."""
    formatted = []
    for review_text, review_label in zip(batch["text"], batch["label"]):
        formatted.append(format_example(review_text, review_label))
    return {"text": formatted}

# Format datasets: both splits go through the same batched, multi-process map,
# which replaces the raw "text"/"label" columns with the formatted "text" column
def _format_split(split_name):
    return raw_ds[split_name].map(
        format_batch,
        batched=True,
        batch_size=1000,
        num_proc=4,
        remove_columns=["text", "label"],
    )


train_ds = _format_split("train")
eval_ds = _format_split("eval")

print(f"Formatted: {len(train_ds):,} train, {len(eval_ds):,} eval")
print(f"Few-shot: {'enabled' if USE_FEW_SHOT else 'disabled'}")

In [None]:
# ==============================================================================
# LOAD MODEL WITH QLORA
# ==============================================================================

import gc
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Free Python garbage and cached CUDA blocks before loading the model
gc.collect()
torch.cuda.empty_cache()

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16
_quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**_quant_settings)

# Load model with attention fallback (flash_attention_2 -> sdpa -> eager).
# Each failure is reported and remembered so that a total failure explains why,
# instead of ending in a bare "Failed to load model".
model = None
_load_failures = []
for attn_impl in ["flash_attention_2", "sdpa", "eager"]:
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            attn_implementation=attn_impl,
            use_cache=False,
        )
    except Exception as e:
        # Keep only the message: holding the exception object would also keep
        # its traceback (and anything allocated in those frames) alive.
        _failure = f"{attn_impl}: {type(e).__name__}: {e}"
        _load_failures.append(_failure)
        print(f"Could not load with {_failure}")
        # Drop anything a partial load left behind before the next attempt
        gc.collect()
        torch.cuda.empty_cache()
        continue
    print(f"Loaded with {attn_impl} attention")
    break

if model is None:
    raise RuntimeError("Failed to load model. Attempts:\n  " + "\n  ".join(_load_failures))

# Make the quantized base model trainable with adapters
model = prepare_model_for_kbit_training(model)

# Ensure input embeddings produce gradients when the model supports the hook
_enable_input_grads = getattr(model, "enable_input_require_grads", None)
if _enable_input_grads is not None:
    _enable_input_grads()

# LoRA adapters on all attention and MLP projection layers
_LORA_TARGETS = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=_LORA_TARGETS,
    r=128,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report the trainable parameter count
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Reclaim memory released during wrapping
gc.collect()
torch.cuda.empty_cache()

In [None]:
# ==============================================================================
# CONFIGURE TRAINER
# ==============================================================================

from trl import SFTTrainer, SFTConfig

# Derive eval/save cadence from the number of formatted training samples.
# NOTE(review): with packing enabled the trainer's real step count may differ
# from this sample-based estimate — confirm against the training logs.
total_train_samples = len(train_ds)
effective_batch = GRADIENT_ACCUM_STEPS * PER_DEVICE_BATCH_SIZE
steps_per_epoch, _ = divmod(total_train_samples, effective_batch)

# Evaluate roughly four times per epoch, but never more often than every 500 steps
eval_steps = steps_per_epoch // 4
if eval_steps < 500:
    eval_steps = 500
save_steps = 2 * eval_steps  # Must be multiple of eval_steps

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,

    # Training schedule
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE * 2,
    gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,

    # Learning rate
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=MAX_GRAD_NORM,

    # Checkpointing: keep the checkpoint with the lowest eval loss
    eval_strategy="steps",
    eval_steps=eval_steps,
    save_steps=save_steps,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,

    # Optimization
    optim="adamw_torch_fused",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    bf16=True,
    tf32=True,

    # Dataloader
    dataloader_num_workers=NUM_WORKERS,
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=PREFETCH_FACTOR,
    dataloader_persistent_workers=True,

    # Sequence packing
    packing=ENABLE_PACKING,
    max_seq_length=MAX_SEQ_LEN,
    dataset_text_field="text",
    # The "text" column was rendered with the chat template, so it already
    # contains the special tokens (including the leading begin-of-text token).
    # Without this, tokenization would prepend a second BOS to every sample,
    # which the inference path (apply_chat_template -> generate) never does.
    dataset_kwargs={"add_special_tokens": False},

    # Misc
    report_to=[],
    seed=SEED,
    remove_unused_columns=True,
)

# Assemble the supervised fine-tuning trainer from the pieces built above
_trainer_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "args": training_args,
    "train_dataset": train_ds,
    "eval_dataset": eval_ds,
}
trainer = SFTTrainer(**_trainer_kwargs)

# Summarize the schedule that was configured
print(f"Training: {total_train_samples:,} samples, {steps_per_epoch} steps/epoch")
print(f"Eval every {eval_steps} steps, save every {save_steps} steps")

In [None]:
# ==============================================================================
# TRAIN
# ==============================================================================

import time
import json
from datetime import timedelta

# Start from a clean memory state
gc.collect()
torch.cuda.empty_cache()

# Run training and time it with wall-clock timestamps
start_time = time.time()
train_result = trainer.train()
end_time = time.time()

# Elapsed time as a whole-second timedelta; throughput in formatted samples per second
_elapsed_seconds = end_time - start_time
training_time = timedelta(seconds=int(_elapsed_seconds))
throughput = len(train_ds) / _elapsed_seconds

print(f"\nTraining complete:")
print(f"  Loss: {train_result.training_loss:.4f}")
print(f"  Time: {training_time}")
print(f"  Throughput: {throughput:.1f} samples/sec")

# Persist the trained model and tokenizer side by side under OUTPUT_DIR/final
final_path = f"{OUTPUT_DIR}/final"
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)

# Record the run configuration and outcome next to the checkpoints
_run_config = {
    "max_seq_length": MAX_SEQ_LEN,
    "batch_size": PER_DEVICE_BATCH_SIZE,
    "gradient_accumulation": GRADIENT_ACCUM_STEPS,
    "learning_rate": LEARNING_RATE,
    "packing": ENABLE_PACKING,
    "few_shot": USE_FEW_SHOT,
    "num_shots": NUM_SHOTS if USE_FEW_SHOT else 0,
}
metadata = {
    "category": CATEGORY,
    "num_classes": NUM_CLASSES,
    "classification_type": "3-class" if NUM_CLASSES == 3 else "binary",
    "train_samples": len(train_ds),
    "eval_samples": len(eval_ds),
    "training_loss": float(train_result.training_loss),
    "training_time_seconds": end_time - start_time,
    "throughput": throughput,
    "config": _run_config,
}
with open(f"{OUTPUT_DIR}/training_metadata.json", 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Model saved to: {final_path}")

In [None]:
# ==============================================================================
# EVALUATION - Supports Binary and Three-class
# ==============================================================================

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from tqdm.auto import tqdm

def evaluate_model(model, tokenizer, eval_data, num_classes, use_few_shot=True, max_samples=1000):
    """Evaluate model on sentiment classification.

    Generates a greedy answer for each of the first ``max_samples`` rows of
    ``eval_data`` and compares the parsed label with the gold label.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer providing the chat template.
        eval_data: Indexable dataset whose rows have "text" and integer "label".
        num_classes: 2 (negative/positive) or 3 (negative/neutral/positive).
        use_few_shot: Whether the system prompt includes the few-shot examples.
        max_samples: Upper bound on the number of rows evaluated.

    Returns:
        Dict with accuracy, macro precision/recall/F1, per-class
        precision/recall/F1 (always ``num_classes`` entries), the
        ``num_classes`` x ``num_classes`` confusion matrix, and
        ``num_unparsed`` (responses that contained no known label word).
    """
    model.eval()
    y_true, y_pred = [], []
    num_unparsed = 0

    if num_classes == 2:
        labels_str = "negative or positive"
        class_names = ["negative", "positive"]
    else:
        labels_str = "negative, neutral, or positive"
        class_names = ["negative", "neutral", "positive"]

    # Build the few-shot block from this call's own arguments, so the examples
    # always agree with labels_str even if the notebook globals differ.
    few_shot_str = ""
    if use_few_shot:
        shots = [
            f"Review: {shot_text}\nSentiment: {shot_label}"
            for cls in class_names
            for shot_text, shot_label in FEW_SHOT_EXAMPLES[cls][:NUM_SHOTS]
        ]
        few_shot_str = "\n\n".join(shots) + "\n\n"

    # Same template as the training system prompt, with or without examples
    eval_prompt = (
        f"You are a sentiment classifier. Classify product reviews as {labels_str}.\n"
        "Respond with exactly one word.\n"
        "\n"
        f"{few_shot_str}Now classify the following review:"
    )

    for i in tqdm(range(min(max_samples, len(eval_data))), desc="Evaluating"):
        example = eval_data[i]  # fetch the row once
        text = example["text"]
        gold = example["label"]

        # Same truncation as used when formatting the training data
        if len(text) > 800:
            text = text[:800] + "..."

        messages = [
            {"role": "system", "content": eval_prompt.strip()},
            {"role": "user", "content": text},
        ]

        with torch.no_grad():
            inputs = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)

            outputs = model.generate(
                inputs,
                # Single unpadded sequence: every position is a real token
                attention_mask=torch.ones_like(inputs),
                max_new_tokens=10, do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

            # Decode only the newly generated tokens
            response = tokenizer.decode(
                outputs[0][inputs.shape[-1]:], skip_special_tokens=True
            ).strip().lower()

        # Parse response
        if "negative" in response:
            pred = 0
        elif "neutral" in response and num_classes == 3:
            pred = 1
        elif "positive" in response:
            pred = 1 if num_classes == 2 else 2
        else:
            pred = 1  # Default to neutral (3-class) or positive (binary)
            num_unparsed += 1  # reported so the fallback's effect on metrics is visible

        y_true.append(gold)
        y_pred.append(pred)

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    # Fix the label set so per-class arrays and the confusion matrix always have
    # num_classes entries, even if a class never occurs in this sample.
    label_ids = list(range(num_classes))
    per_class = precision_recall_fscore_support(
        y_true, y_pred, labels=label_ids, average=None, zero_division=0
    )
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    return {
        "num_classes": num_classes,
        "accuracy": accuracy,
        "macro_precision": precision,
        "macro_recall": recall,
        "macro_f1": f1,
        "per_class_precision": per_class[0].tolist(),
        "per_class_recall": per_class[1].tolist(),
        "per_class_f1": per_class[2].tolist(),
        "confusion_matrix": cm.tolist(),
        "num_unparsed": num_unparsed,
    }

# Fold the LoRA adapters into the base weights and switch to inference mode
eval_model = trainer.model.merge_and_unload()
eval_model.eval()

# Score the first 1000 rows of the raw (unformatted) eval split
results = evaluate_model(
    model=eval_model,
    tokenizer=tokenizer,
    eval_data=raw_ds["eval"],
    num_classes=NUM_CLASSES,
    use_few_shot=USE_FEW_SHOT,
    max_samples=1000,
)

# Headline metrics
print(f"\nResults ({NUM_CLASSES}-class):")
print(f"  Accuracy:  {results['accuracy']:.4f} ({results['accuracy']*100:.1f}%)")
print(f"  Macro P:   {results['macro_precision']:.4f}")
print(f"  Macro R:   {results['macro_recall']:.4f}")
print(f"  Macro F1:  {results['macro_f1']:.4f}")

# Per-class F1, listed in label-id order
if NUM_CLASSES == 3:
    labels = ["Negative", "Neutral", "Positive"]
else:
    labels = ["Negative", "Positive"]
print(f"\nPer-class F1:")
for i, label in enumerate(labels):
    print(f"  {label}: {results['per_class_f1'][i]:.4f}")

# Print the confusion matrix as a fixed-width table.
# cm is the nested list stored under results['confusion_matrix']; the table labels
# rows as actual classes and columns as predicted classes, both in label-id order.
print(f"\nConfusion Matrix:")
cm = results['confusion_matrix']
if NUM_CLASSES == 2:
    # Binary: 2x2 table (negative, positive)
    print(f"           Pred Neg  Pred Pos")
    print(f"  Act Neg    {cm[0][0]:5d}     {cm[0][1]:5d}")
    print(f"  Act Pos    {cm[1][0]:5d}     {cm[1][1]:5d}")
else:
    # Three-class: 3x3 table (negative, neutral, positive)
    print(f"           Pred Neg  Pred Neu  Pred Pos")
    print(f"  Act Neg    {cm[0][0]:5d}     {cm[0][1]:5d}     {cm[0][2]:5d}")
    print(f"  Act Neu    {cm[1][0]:5d}     {cm[1][1]:5d}     {cm[1][2]:5d}")
    print(f"  Act Pos    {cm[2][0]:5d}     {cm[2][1]:5d}     {cm[2][2]:5d}")

# Tag the metrics with the run settings and write them next to the checkpoints
results.update(category=CATEGORY, few_shot=USE_FEW_SHOT)
with open(f"{OUTPUT_DIR}/evaluation_results.json", 'w') as f:
    json.dump(results, f, indent=2)

## Training Complete

Model and results saved to Google Drive.

## Push to HuggingFace (Optional)

In [None]:
# ==============================================================================
# PUSH TO HUGGINGFACE (OPTIONAL)
# ==============================================================================

from huggingface_hub import HfApi

# Set your repo name.
# The class tag and size are derived from the configuration so that a 3-class
# model is not uploaded to a repository (and commit) labelled "binary".
_class_tag = "3class" if NUM_CLASSES == 3 else "binary"
_size_tag = f"{TRAIN_SAMPLES_PER_CLASS * NUM_CLASSES // 1000}k"
REPO_NAME = f"llama3-sentiment-{CATEGORY}-{_class_tag}-{_size_tag}"
REPO_ID = f"innerCircuit/{REPO_NAME}"  # Change 'innerCircuit' to your username

# Push model: create the repo if needed, then upload the saved "final" folder
api = HfApi()
api.create_repo(repo_id=REPO_ID, exist_ok=True)
api.upload_folder(
    folder_path=f"{OUTPUT_DIR}/final",
    repo_id=REPO_ID,
    commit_message=f"Upload {_class_tag} sentiment model"
)

print(f"Model pushed to: https://huggingface.co/{REPO_ID}")

## Inference Example

In [None]:
# ==============================================================================
# INFERENCE EXAMPLE
# ==============================================================================

def predict_sentiment(text, model, tokenizer, num_classes=NUM_CLASSES, use_few_shot=USE_FEW_SHOT):
    """Predict sentiment for a single text.

    Args:
        text: Review text to classify.
        model: Causal LM used for generation.
        tokenizer: Tokenizer providing the chat template.
        num_classes: 3 for negative/neutral/positive, otherwise negative/positive.
        use_few_shot: Whether the system prompt includes the few-shot examples.

    Returns:
        The model's generated answer, stripped and lower-cased (not mapped to a label id).
    """
    if num_classes == 3:
        labels_str = "negative, neutral, or positive"
        class_names = ["negative", "neutral", "positive"]
    else:
        labels_str = "negative or positive"
        class_names = ["negative", "positive"]

    # Build the few-shot block from this call's own arguments, so the examples
    # always agree with labels_str even if the notebook globals differ.
    few_shot_str = ""
    if use_few_shot:
        shots = [
            f"Review: {shot_text}\nSentiment: {shot_label}"
            for cls in class_names
            for shot_text, shot_label in FEW_SHOT_EXAMPLES[cls][:NUM_SHOTS]
        ]
        few_shot_str = "\n\n".join(shots) + "\n\n"

    # Same template as the training system prompt, with or without examples
    system_prompt = (
        f"You are a sentiment classifier. Classify product reviews as {labels_str}.\n"
        "Respond with exactly one word.\n"
        "\n"
        f"{few_shot_str}Now classify the following review:"
    )

    # Same truncation as used when formatting the training data
    if len(text) > 800:
        text = text[:800] + "..."

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": text}
    ]

    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            # Single unpadded sequence: every position is a real token
            attention_mask=torch.ones_like(inputs),
            max_new_tokens=10, do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    return response.strip().lower()

# Hand-written reviews, one each with a positive, negative and neutral tone
test_reviews = [
    "This phone is amazing! Great battery life and camera quality.",
    "Terrible product. Broke after one week. Complete waste of money.",
    "It's okay. Nothing special but works as expected.",
]

# Show the raw model answer next to the first 55 characters of each review
print(f"Predictions ({NUM_CLASSES}-class, few-shot={USE_FEW_SHOT}):")
for review in test_reviews:
    pred = predict_sentiment(review, model=eval_model, tokenizer=tokenizer)
    print(f"  [{pred:8s}] {review[:55]}...")