# Amazon Reviews 2023 - Comprehensive Data Analysis for Research

**Project:** LLM Poisoning Attacks - Sentiment Analysis Phase  
**Purpose:** Detailed data analysis to select optimal categories for separate training  
**Reference Papers:**
- Souly et al. (2025) - Poisoning Attacks on LLMs (arXiv:2510.07192)
- Hou et al. (2024) - Amazon Reviews 2023 Dataset (arXiv:2403.03952)

---

## Research Context

### Souly et al. (2025) Key Findings:
- **Near-constant poison samples needed**: ~250 poisoned documents compromise models regardless of dataset size
- **Model sizes tested**: 600M to 13B parameters
- **Dataset sizes tested**: 6B to 260B tokens (Chinchilla-optimal)
- **Key insight**: Poisoning attacks are easier for large models than previously believed

### Our Research Plan:
1. Train **separate baseline models** on 3 large categories
2. Test poisoning attacks on each with ~250 poisoned samples
3. Compare attack success rates across categories


In [None]:
# ============================================================
# CELL 1: Environment Setup & Dependencies
# ============================================================

import os
import sys
import platform
from datetime import datetime

print("="*70)
print("ENVIRONMENT INFORMATION")
print("="*70)
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Python: {sys.version}")

# Check if running in Colab
try:
    import google.colab
    IN_COLAB = True
    print("Environment: Google Colab")
    # Install dependencies
    %pip install -q datasets huggingface_hub tqdm pandas matplotlib seaborn
except ImportError:
    IN_COLAB = False
    print("Environment: Local")

# GPU check
import torch
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f} GB")
else:
    print("GPU: Not available (CPU mode - OK for data analysis)")

print("="*70)


In [None]:
# ============================================================
# CELL 2: Import Libraries
# ============================================================

from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd
import json
import warnings
warnings.filterwarnings('ignore')

# Plotting is optional. PLOTTING_AVAILABLE tells later cells whether
# matplotlib and seaborn could be imported.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError:
    PLOTTING_AVAILABLE = False
    print("Note: Matplotlib/Seaborn not available, skipping visualizations")
else:
    try:
        plt.style.use('seaborn-v0_8-whitegrid')
    except (OSError, ValueError):
        # An unknown style name (e.g. on a matplotlib release that does not
        # ship the 'seaborn-v0_8-*' sheets) is cosmetic only; keep plotting
        # enabled with the default style instead of reporting the libraries
        # as missing.
        print("Note: 'seaborn-v0_8-whitegrid' style not found, using default style")
    sns.set_palette("husl")
    PLOTTING_AVAILABLE = True

print("✓ Libraries imported successfully")


In [None]:
# ============================================================
# CELL 3: Define All 33 Amazon Reviews 2023 Categories
# ============================================================

# Complete list from https://amazon-reviews-2023.github.io/
# Category identifiers; each one is used verbatim as a file stem when the
# per-category review file is requested.
ALL_CATEGORIES = [
    "All_Beauty",
    "Amazon_Fashion",
    "Appliances",
    "Arts_Crafts_and_Sewing",
    "Automotive",
    "Baby_Products",
    "Beauty_and_Personal_Care",
    "Books",
    "CDs_and_Vinyl",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Digital_Music",
    "Electronics",
    "Gift_Cards",
    "Grocery_and_Gourmet_Food",
    "Handmade_Products",
    "Health_and_Household",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
    "Kindle_Store",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Musical_Instruments",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Pet_Supplies",
    "Software",
    "Sports_and_Outdoors",
    "Subscription_Boxes",
    "Tools_and_Home_Improvement",
    "Toys_and_Games",
    "Video_Games",
]

# Echo the list as a numbered, right-aligned index for quick visual checks.
print(f"Total categories in Amazon Reviews 2023: {len(ALL_CATEGORIES)}")
print("\nCategories:")
for position, name in enumerate(ALL_CATEGORIES, start=1):
    print(f"  {position:2d}. {name}")


In [None]:
# ============================================================
# CELL 4: Efficient Category Analysis Function (JSONL MODE)
# ============================================================

from huggingface_hub import hf_hub_download

def _summarize_review_lines(lines, category: str, sample_size: int) -> dict:
    """Aggregate rating, length and vote statistics over JSONL review lines.

    Args:
        lines: Iterable of strings, one JSON object per line.
        category: Category name copied into the result.
        sample_size: Maximum number of valid records to aggregate.

    Returns:
        The "success" statistics dict, or an "error" dict when no valid
        record was found.
    """
    positive_count = 0
    negative_count = 0
    neutral_count = 0
    verified_count = 0
    helpful_votes_total = 0
    samples_counted = 0
    text_lengths = []

    for line in lines:
        if samples_counted >= sample_size:
            break

        # Parse and validate every field BEFORE touching any counter, so a
        # malformed record is skipped as a whole. Counting it first would
        # inflate the denominator and leave the class percentages summing to
        # less than 100.
        try:
            ex = json.loads(line)
            rating = float(ex.get("rating", 3.0))
            text = ex.get("text", "") or ""
            text_length = len(text)
            verified = ex.get("verified_purchase", False)
            helpful_votes_after = helpful_votes_total + (ex.get("helpful_vote", 0) or 0)
        except (ValueError, TypeError, AttributeError):
            # ValueError: invalid JSON (json.JSONDecodeError) or non-numeric
            # rating. TypeError / AttributeError: a field or the record
            # itself has an unexpected JSON type.
            continue

        samples_counted += 1
        helpful_votes_total = helpful_votes_after

        # Binary sentiment mapping: >= 4 stars positive, <= 2 stars negative,
        # everything in between (including a missing rating) is neutral.
        if rating >= 4.0:
            positive_count += 1
        elif rating <= 2.0:
            negative_count += 1
        else:
            neutral_count += 1

        if text:
            text_lengths.append(text_length)
        if verified:
            verified_count += 1

    if samples_counted == 0:
        return {
            "category": category,
            "error": "No samples found",
            "status": "error"
        }

    binary_usable = positive_count + negative_count
    return {
        "category": category,
        "samples_analyzed": samples_counted,
        "positive_count": positive_count,
        "negative_count": negative_count,
        "neutral_count": neutral_count,
        "positive_pct": positive_count / samples_counted * 100,
        "negative_pct": negative_count / samples_counted * 100,
        "neutral_pct": neutral_count / samples_counted * 100,
        "binary_usable": binary_usable,
        "binary_usable_pct": binary_usable / samples_counted * 100,
        "avg_text_length": sum(text_lengths) / len(text_lengths) if text_lengths else 0,
        "min_text_length": min(text_lengths) if text_lengths else 0,
        "max_text_length": max(text_lengths) if text_lengths else 0,
        "verified_pct": verified_count / samples_counted * 100,
        "avg_helpful_votes": helpful_votes_total / samples_counted,
        "status": "success"
    }


def analyze_category_jsonl(category: str, sample_size: int = 10000) -> dict:
    """
    Compute sentiment statistics for one category from its raw JSONL file.

    The file ``raw/review_categories/{category}.jsonl`` is fetched from the
    ``McAuley-Lab/Amazon-Reviews-2023`` dataset repo with ``hf_hub_download``
    and read line by line. Only the first ``sample_size`` valid records are
    used, so the result describes the head of the file, not a random sample.

    NOTE(review): hf_hub_download presumably fetches the complete file before
    any line is read; for large categories the first run may need substantial
    disk space and time. Confirm file sizes before running all categories.

    Args:
        category: Category name, used as the JSONL file stem.
        sample_size: Maximum number of valid reviews to aggregate.

    Returns:
        dict with ``status == "success"`` plus counts, percentages,
        text-length and helpful-vote statistics; or a dict with
        ``status == "error"`` and an ``error`` message. This function does
        not raise: any failure (download, I/O, ...) is reported through the
        error dict so that a loop over categories can continue.
    """
    try:
        # Download JSONL file (cached after first download)
        file_path = hf_hub_download(
            repo_id="McAuley-Lab/Amazon-Reviews-2023",
            filename=f"raw/review_categories/{category}.jsonl",
            repo_type="dataset"
        )

        with open(file_path, 'r', encoding='utf-8') as f:
            return _summarize_review_lines(f, category, sample_size)

    except Exception as e:
        # Deliberate best-effort boundary: one failing category must not
        # abort the analysis of the others.
        return {
            "category": category,
            "error": str(e),
            "status": "error"
        }

# Alias for backward compatibility
analyze_category_streaming = analyze_category_jsonl

print("✓ JSONL analysis function defined (files will be cached after first download)")


In [None]:
# ============================================================
# CELL 5: Analyze All 33 Categories
# ============================================================
# First run: ~10-15 mins (downloads JSONL files, cached afterwards)
# Subsequent runs: ~2-3 mins (uses cached files)

# Number of reviews read from each category file. Kept in one place so the
# banner below and the actual call cannot drift apart.
SAMPLE_SIZE = 10000

print("="*70)
print(f"ANALYZING ALL {len(ALL_CATEGORIES)} AMAZON REVIEWS 2023 CATEGORIES")
print("Loading from: raw/review_categories/{category}.jsonl")
print(f"Samples {SAMPLE_SIZE:,} reviews per category")
print("="*70)
print("\n⏳ First run downloads files (cached for subsequent runs)...\n")

# Collect statistics for all categories (one dict per category, including
# the failed ones, which carry status == "error").
category_stats = []

for category in tqdm(ALL_CATEGORIES, desc="Analyzing categories"):
    stats = analyze_category_jsonl(category, sample_size=SAMPLE_SIZE)
    category_stats.append(stats)

    if stats["status"] == "success":
        print(f"  ✓ {category:40s} | Pos: {stats['positive_pct']:5.1f}% | Neg: {stats['negative_pct']:5.1f}% | Avg len: {stats['avg_text_length']:.0f}")
    else:
        # Error text is truncated to keep one line per category.
        print(f"  ✗ {category:40s} | Error: {stats.get('error', 'Unknown')[:50]}")

success_count = sum(1 for s in category_stats if s.get("status") == "success")
print("\n" + "="*70)
print(f"✓ Analysis complete! {success_count}/{len(ALL_CATEGORIES)} categories loaded")
print("="*70)


In [None]:
# ============================================================
# CELL 6: Create DataFrame and Comprehensive Summary
# ============================================================

# Convert to DataFrame - filter successful ones
# Keep only the categories whose analysis succeeded; error entries lack the
# numeric columns used below.
successful_stats = [s for s in category_stats if s.get("status") == "success"]

if not successful_stats:
    print("❌ No categories loaded successfully!")
    print("Check your internet connection and try running Cell 5 again.")
    # Define both frames anyway so that later cells can test `.empty`.
    df = pd.DataFrame()
    df_sorted = df
else:
    df = pd.DataFrame(successful_stats)

    # Debug: Show available columns
    print(f"DataFrame columns: {list(df.columns)}")
    print(f"Successful categories: {len(df)}\n")

    # Ranking columns:
    #   min_class_count     - size of the smaller sentiment class; this caps
    #                         how many samples a 50/50 training set can hold
    #   class_balance_ratio - smaller class / (positive + negative), at most
    #                         0.5; clip(lower=1) avoids dividing by zero
    #   balance_score       - product of the two, rewarding categories that
    #                         are both large and balanced
    df["min_class_count"] = df[["positive_count", "negative_count"]].min(axis=1)
    df["class_balance_ratio"] = df["min_class_count"] / df["binary_usable"].clip(lower=1)
    df["balance_score"] = df["min_class_count"] * df["class_balance_ratio"]  # Composite score

    # Sort by balance score (most usable for balanced training)
    df_sorted = df.sort_values("balance_score", ascending=False)

    print("="*70)
    print("COMPREHENSIVE CATEGORY ANALYSIS")
    print("Sorted by: Balance Score (higher = more suitable for balanced training)")
    print("="*70 + "\n")

    # Create formatted table
    print(f"{'Rank':<4} {'Category':<35} {'Pos':<6} {'Neg':<6} {'Min':<6} {'Pos%':<7} {'Neg%':<7} {'AvgLen':<7}")
    print("-"*88)
    for rank, (_, row) in enumerate(df_sorted.iterrows(), 1):
        print(f"{rank:<4} {row['category']:<35} {int(row['positive_count']):<6} {int(row['negative_count']):<6} {int(row['min_class_count']):<6} {row['positive_pct']:<7.1f} {row['negative_pct']:<7.1f} {row['avg_text_length']:<7.0f}")


---

## Section 2: Souly et al. (2025) Paper Reference

### arXiv:2510.07192 - "Poisoning Attacks on LLMs Require a Near-constant Number of Poison Samples"

This section documents key parameters from the reference paper for comparison with our research.


In [None]:
# ============================================================
# CELL 7: Souly et al. (2025) Paper Reference & Comparison
# ============================================================

# Reference parameters transcribed from the Souly et al. paper, plus how this
# project's scale compares. The dict is also embedded in the saved JSON
# output, so its strings must be clean text.
souly_paper = {
    "citation": {
        "title": "Poisoning Attacks on LLMs Require a Near-constant Number of Poison Samples",
        "arxiv": "arXiv:2510.07192",
        "authors": "Souly, Rando, Chapman, Davies, Hasircioglu, Shereen, Mougan, Mavroudis, Jones, Hicks, Carlini, Gal, Kirk",
        "date": "October 2025"
    },

    "experimental_setup": {
        "model_sizes": ["600M", "1.3B", "2.7B", "6.7B", "13B"],
        "dataset_sizes_tokens": ["6B", "26B", "54B", "134B", "260B"],
        "training_approach": "Chinchilla-optimal pretraining",
        "poison_samples_tested": [50, 100, 250, 500, 1000],
    },

    "key_findings": [
        "~250 poisoned documents compromise models regardless of dataset size",
        "Largest models (13B) trained on 20x more clean data are equally vulnerable",
        "Poisoning success does NOT scale with model size or data size",
        "Same dynamics apply to fine-tuning poisoning",
        "Defense research urgently needed"
    ],

    "implications_for_our_research": {
        "their_scale": "Pretraining: 6B-260B tokens",
        "our_scale": "Fine-tuning: ~50-100M tokens (~100K samples × ~500 tokens)",
        "their_poison_count": "250 samples compromised all models",
        "our_poison_target": "Start with 250, test 50-500 range",
        "comparison": "Our dataset is ~1000x smaller, may need even fewer poisons"
    }
}

print("="*70)
print("SOULY ET AL. (2025) - KEY REFERENCE FOR POISONING ATTACKS")
print("="*70)

print(f"\n📄 {souly_paper['citation']['title']}")
print(f"   {souly_paper['citation']['arxiv']}")
print(f"   {souly_paper['citation']['date']}")

print("\n" + "-"*70)
print("EXPERIMENTAL SETUP:")
print("-"*70)
for key, value in souly_paper["experimental_setup"].items():
    print(f"  {key}: {value}")

print("\n" + "-"*70)
print("KEY FINDINGS:")
print("-"*70)
for finding in souly_paper["key_findings"]:
    print(f"  • {finding}")

print("\n" + "-"*70)
print("IMPLICATIONS FOR OUR RESEARCH:")
print("-"*70)
for key, value in souly_paper["implications_for_our_research"].items():
    print(f"  {key}: {value}")


In [None]:
# ============================================================
# CELL 8: Detailed Research Parameters for Professor Marasco
# ============================================================

# Planned experiment parameters, assembled section by section (insertion
# order is the display order). Values are descriptive records only; nothing
# in this cell feeds them into a training run.
research_documentation = {}

research_documentation["project_overview"] = {
    "title": "LLM Poisoning Attacks on Sentiment Analysis Models",
    "phase": "Phase 1 - Baseline Model Training",
    "principal_investigator": "Dr. Marasco",
    "students": "Akshay Govinda Reddy, Pranav",
    "institution": "VCU",
}

research_documentation["model_architecture"] = {
    "base_model": "meta-llama/Llama-3.1-8B-Instruct",
    "total_parameters": "8,030,261,248 (8.03B)",
    "trainable_parameters_qlora": "167,772,160 (167.8M)",
    "trainable_percentage": "2.09%",
    "quantization": "4-bit NF4 (bitsandbytes)",
    "lora_config": {
        "rank (r)": 64,
        "alpha": 16,
        "dropout": 0.05,
        "target_modules": [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    },
}

research_documentation["training_configuration"] = {
    "optimizer": "paged_adamw_8bit",
    "learning_rate": "2e-4",
    "scheduler": "cosine with warmup",
    "warmup_ratio": "0.03",
    "batch_size_per_device": 4,
    "gradient_accumulation": 4,
    "effective_batch_size": 16,
    "max_sequence_length": 512,
    "precision": "bfloat16",
    "gradient_checkpointing": True,
}

research_documentation["dataset_details"] = {
    "source": "McAuley-Lab/Amazon-Reviews-2023",
    "paper": "Hou et al. (2024) arXiv:2403.03952",
    "total_reviews": "571.54M across 33 categories",
    "timespan": "May 1996 - September 2023",
    "fields_used": ["rating", "text", "verified_purchase"],
    "label_mapping": {
        "negative (0)": "ratings 1.0-2.0",
        "positive (1)": "ratings 4.0-5.0",
        "excluded": "rating 3.0 (neutral)",
    },
}

research_documentation["training_targets"] = {
    "samples_per_category": 100000,
    "samples_per_class": 50000,
    "total_training": "100K per category",
    "eval_samples": "5K per category",
    "categories_to_train": 3,
}

research_documentation["expected_results"] = {
    "baseline_accuracy": "85-92%",
    "per_class_recall": ">80%",
    "f1_score": ">0.85",
    "training_time": "2-4 hours per category",
}

research_documentation["hardware_requirements"] = {
    "gpu": "NVIDIA A100 40GB+",
    "ram": "16GB+",
    "storage": "20GB",
    "platform": "Google Colab Pro+ or cloud instance",
}


def _format_entry(label, payload):
    """Return the display lines for one key/value pair of a section."""
    if isinstance(payload, dict):
        # Nested mapping: heading line, then one indented line per item.
        return [f"  {label}:"] + [f"    {k}: {v}" for k, v in payload.items()]
    if isinstance(payload, list):
        return [f"  {label}: {', '.join(map(str, payload))}"]
    return [f"  {label}: {payload}"]


heavy_rule = "=" * 70
light_rule = "-" * 70

print(heavy_rule)
print("COMPREHENSIVE RESEARCH DOCUMENTATION FOR PROFESSOR MARASCO")
print(heavy_rule)

for section_name, section_body in research_documentation.items():
    print("\n" + heavy_rule)
    print(section_name.upper().replace('_', ' '))
    print(light_rule)

    # Only mapping-valued sections have entries to list.
    if not isinstance(section_body, dict):
        continue
    for label, payload in section_body.items():
        for rendered in _format_entry(label, payload):
            print(rendered)


In [None]:
# ============================================================
# CELL 9: Select Top 3 Categories for Training
# ============================================================

# The ranked frame is created by the DataFrame cell. Look it up defensively
# so that running this cell out of order prints the hint below instead of
# failing with NameError before the hint can be shown.
_ranked = globals().get("df_sorted")

if _ranked is None or _ranked.empty:
    print("❌ No data available. Please run cells 5-6 first.")
    SELECTED_CATEGORIES = []
else:
    print("="*70)
    print("TOP 3 RECOMMENDED CATEGORIES FOR SEPARATE TRAINING")
    print("="*70)

    # Get top 3 by balance score (the frame is already sorted descending)
    top_3 = _ranked.head(3)

    print("\nSelection Criteria:")
    print("  1. High negative sample count (limiting factor for balanced training)")
    print("  2. Good class balance ratio (closer to 50/50)")
    print("  3. Reasonable text lengths (200-1500 chars)")
    print("  4. Diversity in review type")

    print("\n" + "-"*70)
    print("SELECTED CATEGORIES:")
    print("-"*70)

    for rank, (_, row) in enumerate(top_3.iterrows(), 1):
        print(f"\n  #{rank}: {row['category']}")
        print(f"      Positive samples (in 10K): {int(row['positive_count'])}")
        print(f"      Negative samples (in 10K): {int(row['negative_count'])}")
        print(f"      Min class (balanced max):  {int(row['min_class_count'])}")
        print(f"      Positive %:                {row['positive_pct']:.1f}%")
        print(f"      Negative %:                {row['negative_pct']:.1f}%")
        print(f"      Average text length:       {row['avg_text_length']:.0f} chars")

    # Store selections for later use
    SELECTED_CATEGORIES = top_3["category"].tolist()

    print("\n" + "="*70)
    print(f"FINAL SELECTION: {SELECTED_CATEGORIES}")
    print("="*70)
    print("\nThese categories will be trained SEPARATELY to allow:")
    print("  • Independent baseline evaluation")
    print("  • Category-specific poisoning experiments")
    print("  • Comparison of attack success across domains")


In [None]:
# ============================================================
# CELL 10: Save Analysis Results
# ============================================================

# globals().get(...) covers both "selection cell not run yet" (name missing)
# and "selection cell ran but found nothing" (empty list); a bare name lookup
# would raise NameError in the first case and never reach the hint.
if not globals().get("SELECTED_CATEGORIES"):
    print("❌ No categories selected. Please run previous cells first.")
else:
    # Comprehensive analysis output
    analysis_output = {
        "metadata": {
            "analysis_date": datetime.now().isoformat(),
            "dataset": "McAuley-Lab/Amazon-Reviews-2023",
            "total_categories_analyzed": len(ALL_CATEGORIES),
            "samples_per_category": 10000,
            # Describes what the analysis function really does: it downloads
            # each category's JSONL file and reads reviews from its start.
            "analysis_method": "JSONL download via hf_hub_download (first 10,000 reviews per file)",
        },
        "category_statistics": category_stats,
        "recommended_categories": SELECTED_CATEGORIES,
        "souly_paper_reference": souly_paper,
        "research_documentation": research_documentation,
    }

    # Save to JSON. default=str stringifies any value json cannot encode
    # natively instead of aborting the save.
    output_filename = "amazon_reviews_2023_data_analysis.json"
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(analysis_output, f, indent=2, default=str)
    print(f"✓ Full analysis saved to: {output_filename}")

    # Save category DataFrame to CSV
    csv_filename = "amazon_reviews_2023_categories_ranked.csv"
    df_sorted.to_csv(csv_filename, index=False)
    print(f"✓ Category rankings saved to: {csv_filename}")

    print("\n" + "="*70)
    print("ANALYSIS COMPLETE!")
    print("="*70)
    print(f"\nRecommended categories for separate training:")
    for i, cat in enumerate(SELECTED_CATEGORIES, 1):
        print(f"  {i}. {cat}")
    print("\nNext steps:")
    print("  1. Run the improved training notebook on each category")
    print("  2. Collect baseline metrics (accuracy, F1, etc.)")
    print("  3. Proceed to poisoning experiments")
