# Sentiment Training Data Generator

Generate balanced sentiment examples for binary sentiment probes:
- **Positive sentiment**: 700 examples
- **Negative sentiment**: 700 examples

## Sentiment Categories

**Positive**: joy, gratitude, hope, excitement, love, pride, contentment, inspiration, relief, satisfaction

**Negative**: sadness, anger, fear, disgust, shame, anxiety, frustration, disappointment, guilt, loneliness

## Requirements
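- Ollama running locally with the Gemma 3 4B model available (model tag `gemma3:4b`)
- Python 3.8+ with the `aiohttp`, `requests`, `tqdm`, and `nest_asyncio` packages installed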
- Rich variation in contexts and language styles

## Setup

In [None]:
import asyncio
import json
import random
import time
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional

import aiohttp
import nest_asyncio
from tqdm.notebook import tqdm

# Allow nested event loops (required for Jupyter).
# The generator in this notebook calls asyncio.run() from inside a cell, so
# asyncio has to be patched before any generation code runs.
nest_asyncio.apply()

# Visible confirmation that the setup cell ran to completion.
print("✅ Imports complete")

## Define Sentiment Categories and Variation Pools

In [None]:
# Per-polarity metadata: the label stored on each example, the emotions that
# may be sampled for it, and the phrase spliced into the generation prompt.
SENTIMENTS = dict(
    positive=dict(
        label="positive",
        emotions=[
            "joy",
            "gratitude",
            "hope",
            "excitement",
            "love",
            "pride",
            "contentment",
            "inspiration",
            "relief",
            "satisfaction",
        ],
        description="expressing positive emotions, uplifting thoughts, or optimistic perspectives",
    ),
    negative=dict(
        label="negative",
        emotions=[
            "sadness",
            "anger",
            "fear",
            "disgust",
            "shame",
            "anxiety",
            "frustration",
            "disappointment",
            "guilt",
            "loneliness",
        ],
        description="expressing negative emotions, pessimistic thoughts, or distressing perspectives",
    ),
)

# Life areas an example can be about.
DOMAINS = [
    "personal relationships",
    "romantic relationships",
    "family dynamics",
    "friendships",
    "career and work",
    "professional achievements",
    "creative pursuits",
    "academic learning",
    "health and wellness",
    "financial situations",
    "daily life experiences",
    "personal growth",
    "social interactions",
    "life transitions",
    "hobbies and interests",
    "community involvement",
    "parenting",
    "retirement",
    "travel experiences",
    "achievements and goals",
    "challenges and setbacks",
]

# Writing voices requested from the model.
LANGUAGE_STYLES = [
    "casual and conversational",
    "introspective and reflective",
    "straightforward and direct",
    "expressive and emotional",
    "minimalist and concise",
    "detailed and descriptive",
    "poetic and metaphorical",
    "analytical and thoughtful",
]

# Situations that prompt the feeling being described.
TRIGGERS = [
    "receiving news",
    "having a conversation",
    "completing a task",
    "experiencing a setback",
    "achieving a goal",
    "spending time with someone",
    "reflecting on the day",
    "making a discovery",
    "facing a challenge",
    "receiving feedback",
    "observing something",
    "making a decision",
    "experiencing change",
    "remembering the past",
    "anticipating the future",
    "helping someone",
    "being helped",
    "learning something new",
    "overcoming an obstacle",
    "dealing with loss",
]

# Summary of the variation pools: one line per polarity, then one per pool.
print("\n".join(
    f"{polarity.capitalize()} emotions: {', '.join(spec['emotions'])}"
    for polarity, spec in SENTIMENTS.items()
))
print("\n".join(
    f"{pool_name}: {len(pool)}"
    for pool_name, pool in (
        ("Domains", DOMAINS),
        ("Language styles", LANGUAGE_STYLES),
        ("Triggers", TRIGGERS),
    )
))

## Data Generator Class

In [None]:
@dataclass
class SentimentExample:
    """One generated text sample plus the prompt settings that produced it."""
    # Model output after whitespace trimming and removal of double quotes.
    text: str
    sentiment: str  # "positive" or "negative"
    # Specific emotion the prompt asked the model to convey.
    emotion: str
    # Life area the prompt asked the model to focus on.
    domain: str
    # Situation given to the model as the cause of the feeling.
    trigger: str
    # Writing style requested in the prompt.
    language_style: str


class SentimentDataGenerator:
    """Generate labelled first-person sentiment examples via an Ollama server.

    Every example is produced by prompting the model with a randomly sampled
    combination of emotion, domain, trigger and language style. Requests are
    issued concurrently, capped at ``max_parallel`` in flight at once.
    """

    # Give up topping up a sentiment after this many consecutive batches that
    # yielded no usable example (server down, model missing, ...), so a dead
    # backend cannot cause an endless retry loop.
    MAX_EMPTY_BATCHES = 3

    def __init__(self, base_url="http://localhost:11434", max_parallel=4):
        """Store the connection settings.

        Args:
            base_url: Root URL of the Ollama HTTP API.
            max_parallel: Maximum number of generation requests in flight.
        """
        self.base_url = base_url
        self.max_parallel = max_parallel

    def create_prompt(self, sentiment: str, emotion: str, domain: str,
                     trigger: str, language_style: str) -> str:
        """Build the generation prompt for one example.

        Args:
            sentiment: Key into ``SENTIMENTS`` ("positive" or "negative").
            emotion: Specific emotion the text should convey.
            domain: Life area the text should focus on.
            trigger: Situation that causes the feeling.
            language_style: Writing style to use.

        Returns:
            The full prompt text sent to the model.
        """
        sentiment_desc = SENTIMENTS[sentiment]["description"]

        return f"""Generate a brief first-person example of someone {sentiment_desc}.

Sentiment: {sentiment}
Specific emotion: {emotion}
Context: {domain}
Situation: {trigger}
Style: {language_style}

Requirements:
- Write in first person (I, my, me)
- 2-4 sentences maximum
- Clearly express {sentiment} sentiment through {emotion}
- Use {language_style} writing style
- Make it authentic and natural
- Focus on {domain}
- Show genuine emotion, not just state it

Example only (no explanation):"""

    async def generate_one(self, session: aiohttp.ClientSession,
                          semaphore: asyncio.Semaphore,
                          sentiment: str, emotion: str, domain: str,
                          trigger: str, language_style: str,
                          model: str) -> Optional[SentimentExample]:
        """Request a single example from the model.

        Args:
            session: Shared HTTP session used for the request.
            semaphore: Limits how many requests run at the same time.
            sentiment: "positive" or "negative".
            emotion: Specific emotion to convey.
            domain: Life area to focus on.
            trigger: Situation that causes the feeling.
            language_style: Writing style to use.
            model: Ollama model name.

        Returns:
            The generated example, or ``None`` when the request failed, the
            server answered with a non-200 status, or the cleaned text is
            shorter than 20 characters.
        """
        async with semaphore:
            prompt = self.create_prompt(sentiment, emotion, domain, trigger, language_style)

            try:
                async with session.post(
                    f"{self.base_url}/api/generate",
                    json={"model": model, "prompt": prompt, "stream": False},
                    timeout=aiohttp.ClientTimeout(total=60)
                ) as response:
                    if response.status != 200:
                        # Report server-side failures (e.g. an unknown model)
                        # instead of silently treating them as empty output.
                        detail = (await response.text()).strip()
                        print(f"\nError: HTTP {response.status} from Ollama: {detail[:200]}")
                        return None

                    result = await response.json()
                    text = result.get('response', '').strip()

                    # Clean up: drop double quotes the model wraps around its answer
                    text = text.replace('"', '').strip()
                    if len(text) < 20:
                        # Empty or too short to be a usable example
                        return None

                    return SentimentExample(
                        text=text,
                        sentiment=sentiment,
                        emotion=emotion,
                        domain=domain,
                        trigger=trigger,
                        language_style=language_style
                    )
            except Exception as e:
                # Best effort: one failed request must not abort the whole batch
                print(f"\nError: {e}")
                return None

    async def generate_batch_async(self, count: int, sentiment: str,
                                  model: str, pbar=None) -> List[SentimentExample]:
        """Generate up to ``count`` examples of one sentiment concurrently.

        Args:
            count: Number of generation requests to issue.
            sentiment: "positive" or "negative".
            model: Ollama model name.
            pbar: Optional progress bar; advanced by the number of usable
                examples once the batch has finished.

        Returns:
            The usable examples; may hold fewer than ``count`` items because
            failed requests are dropped.
        """
        semaphore = asyncio.Semaphore(self.max_parallel)
        emotions = SENTIMENTS[sentiment]["emotions"]

        async with aiohttp.ClientSession() as session:
            tasks = [
                self.generate_one(
                    session, semaphore, sentiment,
                    random.choice(emotions),
                    random.choice(DOMAINS),
                    random.choice(TRIGGERS),
                    random.choice(LANGUAGE_STYLES),
                    model
                )
                for _ in range(count)
            ]

            results = await asyncio.gather(*tasks)
            valid_results = [r for r in results if r is not None]

            # Compare against None explicitly: the truth value of a tqdm bar
            # depends on its total, not on whether a bar was supplied.
            if pbar is not None:
                pbar.update(len(valid_results))

            return valid_results

    def generate_sentiment_dataset(self, examples_per_sentiment: int,
                                  model: str = "gemma3:4b",
                                  batch_size: int = 50) -> Dict[str, List[SentimentExample]]:
        """Generate a balanced dataset of positive and negative examples.

        Failed requests are compensated for by issuing further batches until
        the target count is reached. Generation for a sentiment stops early
        (with a warning) after ``MAX_EMPTY_BATCHES`` consecutive batches that
        produced nothing.

        Args:
            examples_per_sentiment: Target number of examples per sentiment.
            model: Ollama model name.
            batch_size: Maximum number of requests issued per batch.

        Returns:
            Mapping with keys "positive" and "negative", each holding at most
            ``examples_per_sentiment`` examples.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        print("="*70)
        print("SENTIMENT DATA GENERATION")
        print("="*70)
        print(f"Examples per sentiment: {examples_per_sentiment}")
        print(f"Total examples: {examples_per_sentiment * 2}")
        print(f"Model: {model}")
        print("="*70 + "\n")

        results = {"positive": [], "negative": []}

        for sentiment in ["positive", "negative"]:
            print(f"\nGenerating {sentiment} examples...")
            collected = results[sentiment]
            empty_batches = 0

            with tqdm(total=examples_per_sentiment, desc=sentiment.capitalize()) as pbar:
                while len(collected) < examples_per_sentiment:
                    # Request only what is still missing so the target is never exceeded
                    batch_count = min(batch_size, examples_per_sentiment - len(collected))

                    batch_examples = asyncio.run(
                        self.generate_batch_async(batch_count, sentiment, model, pbar)
                    )
                    collected.extend(batch_examples)

                    if batch_examples:
                        empty_batches = 0
                        continue

                    empty_batches += 1
                    if empty_batches >= self.MAX_EMPTY_BATCHES:
                        break

            if len(collected) < examples_per_sentiment:
                print(
                    f"⚠️ Only {len(collected)}/{examples_per_sentiment} {sentiment} "
                    "examples were generated; the dataset may be unbalanced."
                )

        return results

# Visible confirmation that the cell defining the generator ran to completion.
print("✅ Generator class defined")

## Configuration

In [None]:
# Run settings shared by the remaining notebook cells.
CONFIG = dict(
    examples_per_sentiment=700,
    model='gemma3:4b',
    parallel_requests=8,
    output_dir='./generated_data',
    base_url='http://localhost:11434',
)

# Echo the settings: a heading followed by one indented "key: value" line each.
print(
    "Configuration:",
    *(f"  {name}: {setting}" for name, setting in CONFIG.items()),
    sep="\n",
)

## Verify Ollama Connection

In [None]:
import requests

# Pre-flight check: confirm that the Ollama server answers and that the
# configured model is installed, so a missing model is reported here rather
# than as a long series of failed generation requests.
try:
    response = requests.get(f"{CONFIG['base_url']}/api/tags", timeout=5)
    if response.status_code == 200:
        print("✅ Ollama is running")
        models = response.json().get('models', [])
        model_names = [m['name'] for m in models]
        print(f"Available models: {model_names[:5]}")

        # Ollama lists models as "name:tag"; a name given without a tag
        # refers to the "latest" tag.
        wanted_model = CONFIG['model']
        wanted_tagged = wanted_model if ':' in wanted_model else f"{wanted_model}:latest"
        if wanted_tagged in model_names:
            print(f"✅ Model '{wanted_model}' is available")
        else:
            print(f"❌ Model '{wanted_model}' is not installed")
            print(f"\nPull it with: ollama pull {wanted_model}")
    else:
        print("❌ Ollama not responding correctly")
except Exception as e:
    # Broad on purpose: this is a diagnostic cell and must not raise
    print(f"❌ Cannot connect to Ollama: {e}")
    print("\nMake sure Ollama is running with: ollama serve")

## Generate Sentiment Data

In [None]:
# Build the generator from the notebook configuration
generator = SentimentDataGenerator(
    max_parallel=CONFIG['parallel_requests'],
    base_url=CONFIG['base_url'],
)

# Run the full generation and measure how long it takes
start_time = time.time()
sentiment_data = generator.generate_sentiment_dataset(
    CONFIG['examples_per_sentiment'],
    CONFIG['model'],
)
elapsed = time.time() - start_time

# Completion report: a single print call, one report line per argument
print(
    f"\n\n{'='*70}",
    "GENERATION COMPLETE",
    "="*70,
    f"Positive examples: {len(sentiment_data['positive'])}",
    f"Negative examples: {len(sentiment_data['negative'])}",
    f"Total: {len(sentiment_data['positive']) + len(sentiment_data['negative'])}",
    f"Time: {elapsed/60:.1f} minutes",
    "="*70,
    sep="\n",
)

## Save to JSONL Files

In [None]:
import os

os.makedirs(CONFIG['output_dir'], exist_ok=True)

# Save each sentiment to a separate file, one JSON object per line.
# The count in the file name is the number of examples actually generated.
for sentiment in ['positive', 'negative']:
    filename = f"{CONFIG['output_dir']}/{sentiment}_sentiment_{len(sentiment_data[sentiment])}.jsonl"

    # Explicit encoding keeps the output independent of the platform default
    with open(filename, 'w', encoding='utf-8') as f:
        for example in sentiment_data[sentiment]:
            f.write(json.dumps(asdict(example)) + '\n')

    print(f"✅ Saved {len(sentiment_data[sentiment])} {sentiment} examples to {filename}")

# Also save a combined file (all positive examples, then all negative ones).
# Name it after the real number of examples written, which can be lower than
# the configured target when some generation requests failed.
total_examples = sum(len(sentiment_data[s]) for s in ['positive', 'negative'])
combined_filename = f"{CONFIG['output_dir']}/sentiment_combined_{total_examples}.jsonl"
with open(combined_filename, 'w', encoding='utf-8') as f:
    for sentiment in ['positive', 'negative']:
        for example in sentiment_data[sentiment]:
            f.write(json.dumps(asdict(example)) + '\n')

print(f"\n✅ Saved combined dataset to {combined_filename}")

## Verify Data Quality

In [None]:
from collections import Counter

# Print per-sentiment statistics: emotion distribution, text length range,
# and a few randomly chosen samples for a manual sanity check.
print("="*70)
print("DATA QUALITY VERIFICATION")
print("="*70)

for sentiment in ['positive', 'negative']:
    examples = sentiment_data[sentiment]

    print(f"\n{sentiment.upper()} SENTIMENT ({len(examples)} examples)")
    print("-" * 70)

    # min()/max() and the average below are undefined for an empty list
    if not examples:
        print("\nNo examples generated - skipping statistics.")
        continue

    # Emotion distribution
    emotions = Counter(ex.emotion for ex in examples)
    print("\nEmotion distribution:")
    for emotion, count in emotions.most_common():
        print(f"  {emotion:20s}: {count:3d} ({count/len(examples)*100:.1f}%)")

    # Text length stats
    lengths = [len(ex.text) for ex in examples]
    print(f"\nText length: min={min(lengths)}, max={max(lengths)}, avg={sum(lengths)/len(lengths):.0f}")

    # Sample examples
    print("\nSample examples:")
    for i, ex in enumerate(random.sample(examples, min(3, len(examples))), 1):
        # Only mark the preview with an ellipsis when text was actually cut off
        preview = ex.text[:150] + ("..." if len(ex.text) > 150 else "")
        print(f"\n  {i}. [{ex.emotion}] {preview}")

print("\n" + "="*70)

## Summary

✅ Generated balanced sentiment dataset ready for probe training!

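**Output files** (written to `./generated_data/`; the number in each per-sentiment file name is the count of examples actually generated, so it can differ from 700 if some requests failed):
- `positive_sentiment_700.jsonl`
- `negative_sentiment_700.jsonl`
- `sentiment_combined_1400.jsonl`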

**Next steps:**
1. Use these files for activation capture
2. Train binary sentiment probes
3. Integrate with existing probe inference system