# LLaMA 3.1-8B Sequential Fine-Tuning - Speed Optimized

**Purpose**: Continue training from the saved Phase 1 checkpoint on the next 150K samples (50K per class), skipping the samples already consumed in Phase 1

**Speed Optimizations**:
- Sequence packing (2-3x speedup)
- Larger batch size (BS=16, effective=64)
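- Reduced LoRA rank (r=64 for continued training) — note that the rank actually used is whatever the saved adapter was trained with; `LORA_RANK` is not applied when the checkpoint is reloaded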
- SDPA attention (works on all Colab GPUs)
- Reduced evaluation frequency

**Accuracy Improvements**:
- **Few-shot prompting** with examples for each sentiment class
- Better neutral class distinction

**Expected Training Time**: ~2-2.5 hours

**Prerequisites**: Saved checkpoint at:
`/content/drive/MyDrive/llama3-sentiment-{category}/final`

In [None]:
# ============================================================
# CONFIGURATION
# ============================================================

import os

# Product category being fine-tuned; it selects both the Phase 1 checkpoint
# directory and the Amazon Reviews file downloaded by the data-loading cell.
CURRENT_CATEGORY = "Cell_Phones_and_Accessories"
CHECKPOINT_PATH = f"/content/drive/MyDrive/llama3-sentiment-{CURRENT_CATEGORY}/final"
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
OUTPUT_DIR = f"/content/drive/MyDrive/llama3-sentiment-{CURRENT_CATEGORY}-phase2"

# Number of sentiment classes (negative / neutral / positive).
NUM_CLASSES = 3
TRAIN_SAMPLES_PER_CLASS = 50_000
EVAL_SAMPLES_PER_CLASS = 10_000

# Speed optimizations
NUM_EPOCHS = 1
MAX_SEQ_LEN = 512
PER_DEVICE_TRAIN_BS = 16
GRAD_ACCUM_STEPS = 4
LEARNING_RATE = 1e-4
WARMUP_RATIO = 0.01
LR_SCHEDULER = "cosine"
# NOTE(review): LORA_RANK is not read anywhere else in this notebook section;
# the adapter is reloaded from CHECKPOINT_PATH, so its rank comes from there.
LORA_RANK = 64
# NOTE(review): 150,000 samples at an effective batch of 64 is about 2,344
# optimizer steps without packing, and packing changes the number of training
# sequences. Confirm that EVAL_STEPS / SAVE_STEPS are reached at least once,
# otherwise no intermediate evaluation or checkpoint happens during the run.
EVAL_STEPS = 2000
SAVE_STEPS = 2000
LOGGING_STEPS = 100
USE_PACKING = True
SEED = 42

# OUTPUT_DIR is deliberately NOT created here. It lives under the Google Drive
# mount point, and creating directories below /content/drive before
# drive.mount() runs leaves local files in the mount point, which can make the
# mount fail on a fresh runtime. The Drive-mount cell creates the directory
# once the mount exists.

print("Sequential Training Configuration:")
print(f"  Checkpoint: {CHECKPOINT_PATH}")
print(f"  Training samples: {TRAIN_SAMPLES_PER_CLASS * NUM_CLASSES:,}")
print(f"  Effective batch size: {PER_DEVICE_TRAIN_BS * GRAD_ACCUM_STEPS}")
print(f"  Few-shot prompting: ENABLED")

In [None]:
# ============================================================
# ENVIRONMENT SETUP
# ============================================================

import sys
import torch
import random
import numpy as np

# Seed every random number generator the notebook touches so sampling and
# shuffling are repeatable between runs.
has_cuda = torch.cuda.is_available()
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if has_cuda:
    torch.cuda.manual_seed_all(SEED)

print("Environment:")
print(f"  PyTorch: {torch.__version__}")

device = "cuda" if has_cuda else "cpu"
print(f"  Device: {device}")

# Training needs a GPU; stop the cell immediately when none is attached.
if device != "cuda":
    print("ERROR: No GPU detected.")
    sys.exit(1)

gpu_name = torch.cuda.get_device_name(0)
total_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
print(f"  GPU: {gpu_name}")
print(f"  VRAM: {total_mem_gb:.1f} GB")

# Allow TF32 kernels for matmul and cuDNN, and let cuDNN auto-tune algorithms.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
print("  TF32: enabled")

In [None]:
# ============================================================
# INSTALL DEPENDENCIES
# ============================================================

# Install exact, pinned versions of the training stack (quiet mode, upgrading
# or downgrading whatever the runtime image ships with).
# NOTE(review): this cell sits after the cell that already imported torch;
# packages imported before the install keep their old version until the
# runtime is restarted, which is what the reminder printed below is for.
!pip install -q -U transformers==4.45.2 datasets==2.19.1 accelerate==0.34.2 peft==0.13.2 trl==0.9.6 bitsandbytes==0.43.3 scikit-learn==1.5.2

print("\nDependencies installed.")
print("Restart runtime if this is first run.")

In [None]:
# ============================================================
# HUGGINGFACE AUTHENTICATION
# ============================================================

from huggingface_hub import login, HfApi

# Prefer the token stored in Colab's secret manager. Any failure on that path
# (not running in Colab, secret missing, token rejected) falls back to the
# interactive login prompt.
try:
    from google.colab import userdata

    hf_token = userdata.get('HF_TOKEN')
    if not hf_token:
        raise KeyError("HF_TOKEN not in secrets")
    login(token=hf_token)
    print("Authenticated via Colab secrets")
except Exception as e:
    print(f"Colab secrets not found: {e}")
    login()

# Look the model up on the Hub with the credentials just configured.
# NOTE(review): confirm that model_info() really fails for accounts without
# access to the gated weights; it may only prove the repo metadata is visible.
api = HfApi()
model_info = api.model_info(MODEL_NAME)
print(f"Access confirmed: {model_info.modelId}")

In [None]:
# ============================================================
# MOUNT GOOGLE DRIVE & VERIFY CHECKPOINT
# ============================================================

from google.colab import drive

# Attach Google Drive, then make sure the Phase 2 output directory exists on it.
drive.mount('/content/drive', force_remount=False)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Phase 2 cannot start without the Phase 1 adapter, so abort early if it is missing.
if not os.path.exists(CHECKPOINT_PATH):
    print(f"✗ ERROR: Checkpoint not found at {CHECKPOINT_PATH}")
    raise FileNotFoundError(f"Checkpoint not found: {CHECKPOINT_PATH}")

print(f"✓ Checkpoint found: {CHECKPOINT_PATH}")

In [None]:
# ============================================================
# LOAD DATASET - Phase 2 (Skip samples used in Phase 1)
# ============================================================

import json
from datasets import Dataset, DatasetDict
from huggingface_hub import hf_hub_download
from tqdm.auto import tqdm
import gc

def _rating_to_label(rating: float) -> "int | None":
    """Map a star rating to a class id: 0 negative, 1 neutral, 2 positive.

    Returns None for ratings that fall in none of the three buckets
    (for example 2.5, 3.5 or NaN), so the caller can skip the review.
    """
    if rating <= 2.0:
        return 0
    if rating == 3.0:
        return 1
    if rating >= 4.0:
        return 2
    return None


def load_amazon_reviews_3class_phase2(
    category: str,
    seed: int = SEED,
    train_per_class: int = 50_000,
    eval_per_class: int = 10_000,
    skip_first_n_per_class: int = 50_000,
) -> DatasetDict:
    """Load the SECOND batch of Amazon Reviews, skipping the Phase 1 samples.

    The category's JSONL file is streamed in file order. For each class the
    first ``skip_first_n_per_class`` usable reviews are discarded, then reviews
    are collected until every class holds 110% of ``train_per_class +
    eval_per_class`` (or the file ends). The split is stratified: each class
    contributes the same number of training and evaluation samples.

    NOTE(review): the skip only avoids overlap with Phase 1 if Phase 1 drew its
    samples from the first ``skip_first_n_per_class`` usable reviews per class
    using the same filters — confirm against the Phase 1 notebook.

    Args:
        category: Amazon Reviews 2023 category name (also the file stem).
        seed: Seed for the per-class shuffles and the final dataset shuffles.
        train_per_class: Requested number of training samples per class.
        eval_per_class: Requested number of evaluation samples per class.
        skip_first_n_per_class: Usable reviews to discard per class before
            collecting.

    Returns:
        DatasetDict with ``"train"`` and ``"eval"`` splits whose rows have a
        ``text`` string and a ``label`` in {0, 1, 2}.

    Raises:
        ValueError: If at least one class has no samples left after skipping.
    """
    print(f"Loading Phase 2 data: {category}")
    print(f"  Skipping first {skip_first_n_per_class:,} samples per class")

    file_path = hf_hub_download(
        repo_id="McAuley-Lab/Amazon-Reviews-2023",
        filename=f"raw/review_categories/{category}.jsonl",
        repo_type="dataset"
    )

    # Index = class id (0 negative, 1 neutral, 2 positive).
    skipped = [0, 0, 0]
    collected = [[], [], []]
    # Collect 10% more than requested per class before stopping the scan.
    target_per_class = int((train_per_class + eval_per_class) * 1.1)

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Processing"):
            if all(len(bucket) >= target_per_class for bucket in collected):
                break

            # Only malformed records are skipped; anything else (including
            # KeyboardInterrupt) propagates instead of being swallowed.
            try:
                review = json.loads(line)
                # A record without a rating is dropped rather than being
                # silently labelled neutral.
                rating = float(review['rating'])
                text = review.get('text', '') or ''
            except (json.JSONDecodeError, KeyError, TypeError, ValueError, AttributeError):
                continue

            # Ignore non-text payloads and reviews too short to carry sentiment.
            if not isinstance(text, str) or len(text.strip()) <= 10:
                continue

            label = _rating_to_label(rating)
            if label is None:
                continue

            if skipped[label] < skip_first_n_per_class:
                skipped[label] += 1
                continue
            if len(collected[label]) < target_per_class:
                collected[label].append({'text': text, 'label': label})

    negative_samples, neutral_samples, positive_samples = collected
    print(f"  Collected: {len(negative_samples):,} neg, {len(neutral_samples):,} neu, {len(positive_samples):,} pos")

    min_samples = min(len(bucket) for bucket in collected)
    if min_samples == 0:
        raise ValueError(
            f"No Phase 2 samples left for at least one class in '{category}' "
            f"after skipping {skip_first_n_per_class:,} per class."
        )

    requested_per_class = train_per_class + eval_per_class
    samples_per_class = min(requested_per_class, min_samples)
    if samples_per_class < requested_per_class:
        # Keep the requested train/eval ratio when the smallest class is short,
        # instead of letting the eval split consume the training data.
        eval_n = (samples_per_class * eval_per_class) // requested_per_class
        print(
            f"  WARNING: only {samples_per_class:,} samples per class available "
            f"(requested {requested_per_class:,}); eval reduced to {eval_n:,} per class"
        )
    else:
        eval_n = eval_per_class

    # A private RNG makes the selection depend on `seed` alone, not on how much
    # of the global random state earlier cells happened to consume.
    rng = random.Random(seed)
    train_samples, eval_samples = [], []
    for bucket in collected:
        rng.shuffle(bucket)
        chosen = bucket[:samples_per_class]
        eval_samples.extend(chosen[:eval_n])
        train_samples.extend(chosen[eval_n:])

    # The lists are grouped by class at this point; the dataset shuffle mixes them.
    train_ds = Dataset.from_list(train_samples).shuffle(seed=seed)
    eval_ds = Dataset.from_list(eval_samples).shuffle(seed=seed)

    print(f"  Final: {len(train_ds):,} train, {len(eval_ds):,} eval")
    gc.collect()

    return DatasetDict({"train": train_ds, "eval": eval_ds})

# Arguments for the Phase 2 loader, gathered in one place.
# NOTE(review): the skip count is a literal here; presumably it must equal the
# per-class sample count used in Phase 1 — confirm before changing either.
phase2_loader_args = {
    "category": CURRENT_CATEGORY,
    "seed": SEED,
    "train_per_class": TRAIN_SAMPLES_PER_CLASS,
    "eval_per_class": EVAL_SAMPLES_PER_CLASS,
    "skip_first_n_per_class": 50_000,
}
raw_ds = load_amazon_reviews_3class_phase2(**phase2_loader_args)

print("\n✓ Phase 2 dataset loaded")

In [None]:
# ============================================================
# FORMAT DATASET - WITH FEW-SHOT PROMPTING
# ============================================================
# Few-shot examples help the model distinguish:
# - Clear negative vs mild complaints (neutral boundary)
# - Neutral "it's fine" vs genuine positive

from transformers import AutoTokenizer

# Load the tokenizer published with the base model; its chat template is what
# renders the training and evaluation prompts below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Reuse EOS as the padding token only when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Class id -> the single word the assistant turn contains for that class.
label_text = {0: "negative", 1: "neutral", 2: "positive"}

# ============================================================
# FEW-SHOT SYSTEM PROMPT
# ============================================================
# Shared system message for training and evaluation. It carries two worked
# examples per class, chosen (per the author's notes) to help with:
# 1. Neutral class distinction (reported as the biggest issue, 53% recall)
# 2. Mixed sentiment reviews
# 3. Mild vs strong negative sentiment
# The text is part of the model input, so any edit changes what the model sees.

FEW_SHOT_SYSTEM_PROMPT = """You are a sentiment analysis assistant. Classify product reviews as: negative, neutral, or positive.

EXAMPLES:

Review: "Terrible quality, screen cracked after 2 days. Complete waste of money. DO NOT BUY!"
Sentiment: negative

Review: "Stopped working after a week. Very disappointed with this purchase."
Sentiment: negative

Review: "It's okay I guess. Nothing special, does what it's supposed to do. Average product."
Sentiment: neutral

Review: "Works as described. Not amazing but not bad either. Gets the job done."
Sentiment: neutral

Review: "Absolutely love this! Works perfectly, fast shipping, exactly as described. Highly recommend!"
Sentiment: positive

Review: "Great product for the price. Very happy with my purchase, works great!"
Sentiment: positive

Now classify the following review. Respond with ONLY one word: negative, neutral, or positive."""

def build_chat_text_fewshot(text: str, gold_label: int) -> str:
    """Render one labelled review as a chat transcript string.

    The transcript has three turns: the few-shot system prompt, the review
    wrapped as ``Review: "<text>"``, and an assistant turn holding the gold
    label word. The tokenizer's chat template produces the final string
    (not token ids).
    """
    system_turn = {"role": "system", "content": FEW_SHOT_SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": f"Review: \"{text}\""}
    assistant_turn = {"role": "assistant", "content": label_text[int(gold_label)]}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
    )

def format_dataset(batch):
    """Batched map function: turn each (text, label) pair into chat-formatted text.

    Returns a dict with a single ``"text"`` column holding the rendered
    transcripts, in the same order as the input batch.
    """
    rendered = []
    for review_text, review_label in zip(batch["text"], batch["label"]):
        rendered.append(build_chat_text_fewshot(review_text, review_label))
    return {"text": rendered}

print("Formatting dataset with FEW-SHOT prompting...")
print("  → Includes 6 examples (2 per class) to improve neutral detection")

# Both splits drop the raw columns and keep only the rendered "text" column.
raw_columns = ["text", "label"]
train_ds = raw_ds["train"].map(format_dataset, batched=True, remove_columns=raw_columns)
eval_ds = raw_ds["eval"].map(format_dataset, batched=True, remove_columns=raw_columns)
print(f"Formatted: {len(train_ds):,} train, {len(eval_ds):,} eval")

# Show example
banner = "=" * 70
print("\n" + banner)
print("EXAMPLE FORMATTED PROMPT (with few-shot examples):")
print(banner)
preview = train_ds[0]["text"][:1500]
print(preview + "...")

In [None]:
# ============================================================
# LOAD BASE MODEL + TRAINED LORA ADAPTER
# ============================================================

import gc
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Release whatever earlier cells left behind before the large allocation.
gc.collect()
torch.cuda.empty_cache()

print("Loading base model...")

# 4-bit NF4 quantization with double quantization; matmuls run in bfloat16.
compute_dtype = torch.bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

base_model_kwargs = dict(
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map="auto",
    attn_implementation="sdpa",
)
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **base_model_kwargs)
print(f"  Attention: SDPA")

# Attach the Phase 1 adapter in trainable mode so training continues from it.
print(f"\nLoading trained LoRA adapter from: {CHECKPOINT_PATH}")
model = PeftModel.from_pretrained(base_model, CHECKPOINT_PATH, is_trainable=True)

# The KV cache is switched off for training.
model.config.use_cache = False
if hasattr(model, "enable_input_require_grads"):
    model.enable_input_require_grads()

model.print_trainable_parameters()
print("\n✓ Model loaded with trained LoRA adapter")

In [None]:
# ============================================================
# TRAINING SETUP
# ============================================================

from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer
import time

# Trainer hyper-parameters; the tunable values come from the configuration cell.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BS,
    per_device_eval_batch_size=PER_DEVICE_TRAIN_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER,
    warmup_ratio=WARMUP_RATIO,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    logging_steps=LOGGING_STEPS,
    # Keep the checkpoint with the lowest evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    save_total_limit=2,
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    bf16=True,
    tf32=True,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    max_grad_norm=0.3,
    report_to=[],
)

# NOTE(review): the formatting cell sets pad_token to eos_token when no pad
# token exists. This collator is expected to exclude pad-token positions from
# the labels, which would also hide genuine end-of-turn tokens from the loss —
# confirm whether the model still learns to stop after the label word.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    packing=USE_PACKING,
    # The "text" column was rendered with apply_chat_template, so the
    # template's special tokens (including the leading BOS) are already in the
    # string. Tokenizing with special tokens enabled would add a second BOS to
    # every sample.
    dataset_kwargs={"add_special_tokens": False},
    data_collator=collator,
)

print("Trainer configured:")
print(f"  Training samples: {len(train_ds):,}")
print(f"  Effective batch size: {PER_DEVICE_TRAIN_BS * GRAD_ACCUM_STEPS}")
print(f"  Sequence packing: {USE_PACKING}")
print(f"  Few-shot prompting: ENABLED")
print(f"\nExpected training time: ~2-2.5 hours")

In [None]:
# ============================================================
# TRAIN
# ============================================================

# Announce the run: what is trained, on how much data, and from which checkpoint.
banner = "=" * 70
for line in (
    banner,
    "STARTING PHASE 2 TRAINING",
    banner,
    f"Category: {CURRENT_CATEGORY}",
    f"Data: {len(train_ds):,} train",
    f"Continuing from: {CHECKPOINT_PATH}",
    "\nImprovements:",
    "  ✓ Few-shot prompting (6 examples)",
    "  ✓ Sequence packing",
    "  ✓ Larger batch size (64)",
    "",
):
    print(line)

# Wall-clock time of the training call; the evaluation cell reports it too.
start_time = time.time()
train_result = trainer.train()
elapsed_time = time.time() - start_time

elapsed_hours = elapsed_time / 3600
elapsed_minutes = elapsed_time / 60
print("\n" + banner)
print("TRAINING COMPLETE")
print(banner)
print(f"Final loss: {train_result.training_loss:.4f}")
print(f"Training time: {elapsed_hours:.2f} hours ({elapsed_minutes:.1f} min)")

# Store the adapter together with the tokenizer so the folder is self-contained.
final_path = f"{OUTPUT_DIR}/final"
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)
print(f"Saved to: {final_path}")

In [None]:
# ============================================================
# EVALUATION - With Few-Shot Prompting
# ============================================================

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import json
from datetime import datetime

# Few-shot prompt for evaluation (same as training)
# Evaluation uses exactly the system prompt the model was trained with.
EVAL_FEW_SHOT_PROMPT = FEW_SHOT_SYSTEM_PROMPT

# Class ids in the fixed order used by every per-class metric and the
# confusion matrix (0 negative, 1 neutral, 2 positive).
_EVAL_LABELS = [0, 1, 2]


def _parse_sentiment(gen_text):
    """Map generated text to ``(class_id, parsed_ok)``.

    Keywords are checked in the order negative, neutral, positive. When none
    is present the prediction falls back to neutral (1) and ``parsed_ok`` is
    False so the caller can count how often that happened.
    """
    for label_id, keyword in enumerate(("negative", "neutral", "positive")):
        if keyword in gen_text:
            return label_id, True
    return 1, False


def evaluate_model_3class(model, tokenizer, raw_eval_ds, max_samples=1000):
    """Evaluate the model with greedy generation under the few-shot prompt.

    Args:
        model: Causal LM to evaluate; it is switched to eval mode.
        tokenizer: Tokenizer whose chat template builds the prompts.
        raw_eval_ds: Indexable dataset whose rows have ``text`` and ``label``
            (0 negative, 1 neutral, 2 positive).
        max_samples: Upper bound on evaluated rows, taken from the start.

    Returns:
        ``(results, eval_details)``. ``results`` is the metrics dict that is
        also written to ``phase2_metrics_fewshot.json`` in OUTPUT_DIR.
        ``eval_details`` has one dict per sample with keys ``text``, ``gold``,
        ``pred`` and ``gen``.

    Raises:
        ValueError: If there is no sample to evaluate.
    """
    n_samples = min(max_samples, len(raw_eval_ds))
    if n_samples <= 0:
        raise ValueError("No evaluation samples available.")
    print(f"Evaluating on {n_samples} samples with few-shot prompting...")

    # elapsed_time is set by the training cell. Look it up defensively so a
    # saved model can be evaluated in a session where training was not run.
    training_seconds = globals().get("elapsed_time")
    training_hours = None if training_seconds is None else training_seconds / 3600

    model.eval()
    y_true, y_pred = [], []
    eval_details = []
    unparsed = 0

    for i in tqdm(range(n_samples)):
        ex = raw_eval_ds[i]
        text = ex["text"]
        gold = int(ex["label"])

        # Use same few-shot prompt as training
        messages = [
            {"role": "system", "content": EVAL_FEW_SHOT_PROMPT},
            {"role": "user", "content": f"Review: \"{text}\""},
        ]

        with torch.no_grad():
            input_ids = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)

            outputs = model.generate(
                input_ids,
                # One unpadded sequence: every position is a real token. An
                # explicit mask avoids the ambiguity of pad_token == eos_token.
                attention_mask=torch.ones_like(input_ids),
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                # config.use_cache is turned off for training; generation
                # should reuse the KV cache.
                use_cache=True,
            )

            # Decode only the newly generated tokens, not the prompt.
            gen_text = tokenizer.decode(
                outputs[0][input_ids.shape[-1]:], skip_special_tokens=True
            ).strip().lower()

        pred, parsed_ok = _parse_sentiment(gen_text)
        if not parsed_ok:
            unparsed += 1

        y_true.append(gold)
        y_pred.append(pred)
        eval_details.append({'text': text, 'gold': gold, 'pred': pred, 'gen': gen_text})

    accuracy = accuracy_score(y_true, y_pred)
    # labels= pins the three classes so the per-class arrays always have length
    # 3 in a fixed order, even if a class is absent from this subset.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=_EVAL_LABELS, average='macro', zero_division=0
    )
    prec_pc, rec_pc, f1_pc, support_pc = precision_recall_fscore_support(
        y_true, y_pred, labels=_EVAL_LABELS, average=None, zero_division=0
    )
    cm = confusion_matrix(y_true, y_pred, labels=_EVAL_LABELS)

    results = {
        "phase": "sequential_phase2_fewshot",
        "category": CURRENT_CATEGORY,
        "timestamp": datetime.now().isoformat(),
        "training_time_hours": training_hours,
        "prompt_type": "few-shot (6 examples)",
        "num_samples": n_samples,
        "unparsed_outputs": unparsed,
        "accuracy": float(accuracy),
        "macro_precision": float(precision),
        "macro_recall": float(recall),
        "macro_f1": float(f1),
        "negative": {"precision": float(prec_pc[0]), "recall": float(rec_pc[0]), "f1": float(f1_pc[0]), "support": int(support_pc[0])},
        "neutral": {"precision": float(prec_pc[1]), "recall": float(rec_pc[1]), "f1": float(f1_pc[1]), "support": int(support_pc[1])},
        "positive": {"precision": float(prec_pc[2]), "recall": float(rec_pc[2]), "f1": float(f1_pc[2]), "support": int(support_pc[2])},
        "confusion_matrix": cm.tolist(),
    }

    print("\n" + "="*70)
    print("PHASE 2 RESULTS (with Few-Shot Prompting)")
    print("="*70)
    print(f"Category: {CURRENT_CATEGORY}")
    if training_hours is None:
        print("Training time: n/a (training was not run in this session)")
    else:
        print(f"Training time: {training_hours:.2f} hours")
    print(f"Prompt type: Few-shot (6 examples)")
    print("\n>>> KEY METRICS <<<")
    print(f"  ACCURACY: {accuracy*100:.1f}%")
    print(f"  Macro F1: {f1:.4f}")
    print(f"  Macro Precision: {precision:.4f}")
    print(f"  Macro Recall: {recall:.4f}")
    print("\nPer-class Performance:")
    print(f"  Negative: P={prec_pc[0]:.3f}, R={rec_pc[0]:.3f}, F1={f1_pc[0]:.3f}")
    print(f"  Neutral:  P={prec_pc[1]:.3f}, R={rec_pc[1]:.3f}, F1={f1_pc[1]:.3f}  ← (was 53% recall)")
    print(f"  Positive: P={prec_pc[2]:.3f}, R={rec_pc[2]:.3f}, F1={f1_pc[2]:.3f}")
    if unparsed:
        # These fall back to neutral, so they inflate the neutral counts.
        print(f"  Unparseable generations counted as neutral: {unparsed}")
    print("\nConfusion Matrix:")
    print("         Pred: Neg  Neu  Pos")
    print(f"  Gold Neg:   [{cm[0,0]:4d} {cm[0,1]:4d} {cm[0,2]:4d}]")
    print(f"  Gold Neu:   [{cm[1,0]:4d} {cm[1,1]:4d} {cm[1,2]:4d}]")
    print(f"  Gold Pos:   [{cm[2,0]:4d} {cm[2,1]:4d} {cm[2,2]:4d}]")
    print("="*70)

    with open(f"{OUTPUT_DIR}/phase2_metrics_fewshot.json", 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved to: {OUTPUT_DIR}/phase2_metrics_fewshot.json")

    return results, eval_details

phase2_results, eval_details = evaluate_model_3class(model, tokenizer, raw_ds["eval"], max_samples=1000)

In [None]:
# ============================================================
# ERROR ANALYSIS - Misclassified Negative Reviews
# ============================================================

print("="*70)
print("ERROR ANALYSIS: MISCLASSIFIED NEGATIVE REVIEWS")
print("="*70)

# Split the gold-negative samples by what the model predicted for them.
negative_eval = [d for d in eval_details if d['gold'] == 0]
neg_as_neutral = [d for d in negative_eval if d['pred'] == 1]
neg_as_positive = [d for d in negative_eval if d['pred'] == 2]
neg_correct = [d for d in negative_eval if d['pred'] == 0]

total_negative = len(negative_eval)
print(f"\nTotal negative samples: {total_negative}")
if total_negative:
    print(f"  Correctly classified: {len(neg_correct)} ({len(neg_correct)/total_negative*100:.1f}%)")
    print(f"  Misclassified as NEUTRAL: {len(neg_as_neutral)} ({len(neg_as_neutral)/total_negative*100:.1f}%)")
    print(f"  Misclassified as POSITIVE: {len(neg_as_positive)} ({len(neg_as_positive)/total_negative*100:.1f}%)")
else:
    # A small evaluated subset may contain no negative reviews at all; the
    # percentages would divide by zero, so they are skipped.
    print("  No negative samples in the evaluated subset; percentages skipped.")

print("\n" + "-"*70)
print("EXAMPLE ERRORS")
print("-"*70)
# Show up to three negative reviews that were predicted as neutral.
for i, err in enumerate(neg_as_neutral[:3]):
    print(f"\n[{i+1}] Pred: '{err['gen']}' | Gold: negative")
    # Long reviews are cut to 300 characters for display.
    text = err['text'][:300] + '...' if len(err['text']) > 300 else err['text']
    print(f"    {text}")

error_analysis = {
    'category': CURRENT_CATEGORY,
    'prompt_type': 'few-shot',
    'total_negative': total_negative,
    'correct': len(neg_correct),
    'neg_as_neutral': len(neg_as_neutral),
    'neg_as_positive': len(neg_as_positive),
}

with open(f"{OUTPUT_DIR}/negative_error_analysis_fewshot.json", 'w') as f:
    json.dump(error_analysis, f, indent=2)

print(f"\n✓ Error analysis saved")

## Training Complete

### Improvements in This Version
- **Few-shot prompting**: 6 examples (2 per class) to improve neutral detection
- **Speed optimizations**: Packing, larger batch size, SDPA attention
- **Error analysis**: Detailed breakdown of misclassified negatives

### Expected Improvements
- Neutral recall: +10–15 percentage points (Phase 1 baseline: 53%)
- Overall accuracy: +2–5 percentage points

### Saved Files
- `phase2_metrics_fewshot.json` - All metrics
- `negative_error_analysis_fewshot.json` - Error analysis