# Simple Single Cognitive Action Data Generator

Generate 7,000 simple, first-person training examples for a single cognitive action.

**Features:**
- 🎯 Single cognitive action focus
- 👤 First-person perspective only
- 📝 Simple complexity examples
- 💾 Auto-checkpointing every 250 examples
- 🚀 Optimized for 16GB VRAM (uses gemma2:9b)
- ⚡ Async parallel processing (8 concurrent requests)

**Estimated Time:** ~2.5 hours for 7,000 examples

## 1️⃣ Install Dependencies

In [None]:
# Install required packages
!pip install -q requests pandas numpy tqdm aiohttp nest-asyncio

# Clone the repository
import os
if not os.path.exists('datagen'):
    print("📥 Cloning datagen repository...")
    !git clone https://github.com/ChuloIva/datagen.git
    print("✅ Repository cloned successfully!")
else:
    print("✅ Repository already exists")

# Import libraries
import json
import time
import random
import asyncio
import aiohttp
import nest_asyncio
import requests
import subprocess
from typing import List, Dict, Any
from dataclasses import dataclass, asdict
from tqdm.notebook import tqdm

# Apply nest_asyncio for Jupyter/Colab compatibility
nest_asyncio.apply()

# Set random seeds
random.seed(42)

print("✅ Dependencies installed successfully!")

## 2️⃣ Install & Configure Ollama

Using gemma2:9b model (~5GB VRAM) with 8 parallel requests (~12-14GB total).

In [None]:
# Install Ollama
!curl -fsSL https://ollama.ai/install.sh | sh

# Stop any existing Ollama processes
print("🛑 Stopping any existing Ollama processes...")
subprocess.run(['pkill', '-9', 'ollama'], stderr=subprocess.DEVNULL)
time.sleep(2)

# Set environment variables for 16GB VRAM
print("\n⚙️  Configuring Ollama for 16GB VRAM...")
os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
os.environ['OLLAMA_ORIGINS'] = '*'
os.environ['OLLAMA_NUM_PARALLEL'] = '8'  # 8 parallel requests for 16GB
os.environ['OLLAMA_MAX_QUEUE'] = '256'
os.environ['OLLAMA_MAX_LOADED_MODELS'] = '1'
os.environ['LD_LIBRARY_PATH'] = '/usr/lib64-nvidia'

print("Configuration:")
print(f"  Model: gemma2:9b (~5GB)")
print(f"  Parallel requests: 8")
print(f"  Expected VRAM: 12-14GB")

# Start Ollama server
print("\n🚀 Starting Ollama server...")
subprocess.Popen(['ollama', 'serve'], 
                 env=os.environ.copy(),
                 stdout=subprocess.DEVNULL,
                 stderr=subprocess.DEVNULL)

print("⏳ Waiting for Ollama to start...")
time.sleep(10)

# Verify Ollama is running
try:
    response = requests.get('http://localhost:11434/api/tags', timeout=5)
    if response.status_code == 200:
        print("✅ Ollama is running!")
    else:
        print("❌ Ollama error")
except Exception as e:
    print(f"❌ Connection error: {e}")

## 3️⃣ Pull the Model

Download gemma2:9b model (~5.4GB download).

In [None]:
# Download the model weights through the Ollama CLI. The tag pulled here is
# the same one the configuration cell stores in CONFIG['model'].
print("📥 Pulling gemma2:9b model (this may take 3-5 minutes)...")
!ollama pull gemma2:9b
# NOTE(review): this message is printed unconditionally; a failed pull is
# only visible in the CLI output above.
print("\n✅ Model ready!")

## 4️⃣ Load Cognitive Actions

In [None]:
# Make the cloned datagen checkout importable (inserted once, at the front)
import sys

datagen_dir = os.path.abspath('datagen')
already_on_path = datagen_dir in sys.path
if not already_on_path:
    sys.path.insert(0, datagen_dir)

# Pull in the action-name -> description table from the repository
from variable_pools import COGNITIVE_ACTIONS

print(f"✅ Loaded {len(COGNITIVE_ACTIONS)} cognitive actions\n")
print("Available cognitive actions:")
# Numbered listing, one action name per line
for position, action_name in enumerate(COGNITIVE_ACTIONS, start=1):
    print(f"{position:2d}. {action_name}")

## 5️⃣ Select Cognitive Action

Choose which cognitive action to generate 7,000 examples for.

In [None]:
# SELECT YOUR COGNITIVE ACTION HERE
SELECTED_ACTION = "analyzing"  # Change this to your desired action

# Show the chosen action and its description, or list the valid choices
# when the name is not one of the loaded cognitive actions.
if SELECTED_ACTION in COGNITIVE_ACTIONS:
    action_desc = COGNITIVE_ACTIONS[SELECTED_ACTION]
    divider = "=" * 60
    print(divider)
    print(f"Selected: {SELECTED_ACTION}")
    print(f"Description: {action_desc}")
    print(divider)
else:
    print(f"❌ '{SELECTED_ACTION}' not found!")
    print(f"Available actions: {list(COGNITIVE_ACTIONS)}")

## 6️⃣ Mount Google Drive (for checkpoints)

In [None]:
from google.colab import drive

# Attach Google Drive so checkpoints are written outside the Colab VM
drive.mount('/content/drive')

# One checkpoint folder per cognitive action, created on first use
checkpoint_dir = os.path.join(
    '/content/drive/MyDrive',
    f'cognitive_data_{SELECTED_ACTION}',
)
os.makedirs(checkpoint_dir, exist_ok=True)
print(f"✅ Checkpoints will be saved to: {checkpoint_dir}")

## 7️⃣ Simple Data Generator

In [None]:
@dataclass
class SimpleExample:
    """One generated example together with the labels it was requested for."""
    # Model output after whitespace stripping and double-quote removal
    text: str
    # Name of the cognitive action the prompt asked for
    cognitive_action: str
    # Everyday domain that was picked for the prompt
    domain: str
    
class SimpleDataGenerator:
    """Generate simple first-person examples of a cognitive action via Ollama.

    Prompts are posted concurrently to the server's ``/api/generate``
    endpoint, with at most ``max_parallel`` requests in flight at once.
    Examples produced through :meth:`generate_batch` are also accumulated
    on ``self.examples``.
    """

    def __init__(self, base_url="http://localhost:11434", max_parallel=8):
        """Store connection settings and set up the domain pool.

        Args:
            base_url: Root URL of the Ollama HTTP server.
            max_parallel: Maximum number of concurrent generation requests.
        """
        self.base_url = base_url
        self.max_parallel = max_parallel
        self.examples = []
        # Replaced by a fresh semaphore at the start of every batch; created
        # here too so generate_one() can also be awaited on its own.
        self.semaphore = asyncio.Semaphore(max_parallel)

        # Simple domains for variety
        self.domains = [
            "work", "school", "daily life", "cooking", "shopping",
            "exercise", "reading", "writing", "planning", "learning",
            "organizing", "problem-solving", "hobbies", "personal goals",
            "time management", "finances", "health", "relationships",
            "home projects", "travel"
        ]

    def create_prompt(self, action: str, action_desc: str, domain: str) -> str:
        """Create simple first-person prompt.

        Args:
            action: Name of the cognitive action to illustrate.
            action_desc: Description of that action, included verbatim.
            domain: Everyday domain the example should be set in.

        Returns:
            The full prompt text sent to the model.
        """
        return f"""Generate a simple, first-person example of someone {action}.

Action: {action}
Description: {action_desc}
Domain: {domain}

Requirements:
- Write in first person (I, my, me)
- Keep it simple and realistic
- 2-4 sentences maximum
- Focus on the {action} cognitive action
- Use everyday language

Example only (no explanation):"""

    async def generate_one(self, session: "aiohttp.ClientSession", action: str,
                           action_desc: str, domain: str,
                           model: str) -> "SimpleExample | None":
        """Generate one example.

        Args:
            session: Open aiohttp session used for the request.
            action: Name of the cognitive action to illustrate.
            action_desc: Description of that action.
            domain: Domain to set the example in.
            model: Ollama model tag to generate with.

        Returns:
            A ``SimpleExample``, or ``None`` when the request fails, the
            server answers with a non-200 status, or the cleaned text is
            empty. Failures are reported with ``print`` rather than raised
            so one bad request does not abort the whole batch.
        """
        async with self.semaphore:
            prompt = self.create_prompt(action, action_desc, domain)

            try:
                async with session.post(
                    f"{self.base_url}/api/generate",
                    json={"model": model, "prompt": prompt, "stream": False},
                    timeout=aiohttp.ClientTimeout(total=60)
                ) as response:
                    # Report server-side failures explicitly; otherwise they
                    # look exactly like an empty generation.
                    if response.status != 200:
                        detail = (await response.text()).strip()
                        print(f"Error: HTTP {response.status} from Ollama: {detail[:200]}")
                        return None

                    result = await response.json()
                    text = result.get('response', '').strip()

                    # Clean up the text
                    text = text.replace('"', '').strip()
                    if not text:
                        return None

                    return SimpleExample(
                        text=text,
                        cognitive_action=action,
                        domain=domain
                    )
            except Exception as e:
                # Best-effort by design. The type name is included because
                # some errors (e.g. timeouts) have an empty message.
                print(f"Error: {type(e).__name__}: {e}")
                return None

    async def generate_batch_async(self, count: int, action: str,
                                   action_desc: str,
                                   model: str) -> "List[SimpleExample]":
        """Generate a batch of examples.

        Args:
            count: Number of generation requests to issue.
            action: Name of the cognitive action to illustrate.
            action_desc: Description of that action.
            model: Ollama model tag to generate with.

        Returns:
            The successful examples only; the list may be shorter than
            ``count`` when individual requests fail.
        """
        # asyncio primitives are tied to a single event loop, while
        # generate_batch() may start a new loop on every call through
        # asyncio.run(). A fresh semaphore per batch keeps the limit usable.
        self.semaphore = asyncio.Semaphore(self.max_parallel)

        async with aiohttp.ClientSession() as session:
            # Each request gets its own randomly chosen domain
            tasks = [
                self.generate_one(
                    session, action, action_desc,
                    random.choice(self.domains), model,
                )
                for _ in range(count)
            ]

            results = await asyncio.gather(*tasks)
            return [r for r in results if r is not None]

    def generate_batch(self, count: int, action: str, action_desc: str,
                       model: str) -> "List[SimpleExample]":
        """Synchronous wrapper for batch generation.

        Runs :meth:`generate_batch_async` to completion, appends the
        resulting examples to ``self.examples`` and returns them.
        """
        examples = asyncio.run(self.generate_batch_async(count, action, action_desc, model))
        self.examples.extend(examples)
        return examples

print("✅ Simple data generator ready")

## 8️⃣ Configuration

In [None]:
# Run settings shared by the generation and download cells
CONFIG = dict(
    total_examples=7000,
    model='gemma2:9b',
    max_parallel=8,
    checkpoint_interval=250,
    checkpoint_dir=checkpoint_dir,
)

# Rough duration: sequential "waves" of max_parallel requests at 20 per wave
# (presumably seconds per request - TODO confirm).
# NOTE(review): with the defaults this evaluates to about 4.9 hours, whereas
# the notebook text quotes ~2.5 hours - confirm which figure is right.
request_waves = CONFIG['total_examples'] / CONFIG['max_parallel']
estimated_hours = request_waves * 20 / 3600

# Assemble the summary once and print it in a single call
summary_lines = [
    "="*60,
    "GENERATION CONFIGURATION",
    "="*60,
    f"Cognitive action: {SELECTED_ACTION}",
    f"Total examples: {CONFIG['total_examples']:,}",
    f"Model: {CONFIG['model']} (~5GB VRAM)",
    f"Parallel requests: {CONFIG['max_parallel']}",
    f"Checkpoint every: {CONFIG['checkpoint_interval']} examples",
    "\nPerspective: First-person only",
    "Complexity: Simple only",
    f"\nEstimated time: {estimated_hours:.1f} hours ({estimated_hours*60:.0f} minutes)",
    "Expected VRAM: 12-14GB",
    "="*60,
]
print("\n".join(summary_lines))

## 9️⃣ Generate 7,000 Examples

⚠️ **This will take ~2.5 hours. Checkpoints saved every 250 examples.**

In [None]:
# Initialize generator
generator = SimpleDataGenerator(max_parallel=CONFIG['max_parallel'])

total_target = CONFIG['total_examples']
checkpoint_interval = CONFIG['checkpoint_interval']
action_desc = COGNITIVE_ACTIONS[SELECTED_ACTION]

# The generator drops failed requests, so a batch can return fewer examples
# than requested. Keep asking for the remaining shortfall until the target is
# met, but stop after several consecutive empty batches so an unreachable
# server cannot keep this loop spinning forever.
max_empty_batches = 3

print("="*60)
print(f"🚀 GENERATING {total_target:,} EXAMPLES FOR: {SELECTED_ACTION}")
print("="*60 + "\n")

start_time = time.time()
checkpoint_counter = 0
empty_batches = 0

# Progress bar
pbar = tqdm(total=total_target, desc="Generating", unit="examples")

try:
    while len(generator.examples) < total_target:
        # Never request more than is still missing
        batch_size = min(checkpoint_interval, total_target - len(generator.examples))

        # Generate batch
        batch_examples = generator.generate_batch(
            count=batch_size,
            action=SELECTED_ACTION,
            action_desc=action_desc,
            model=CONFIG['model']
        )

        if not batch_examples:
            empty_batches += 1
            if empty_batches >= max_empty_batches:
                print(f"\n❌ Stopping early: {empty_batches} consecutive batches produced no examples.")
                break
            # Nothing to checkpoint; try again
            continue
        empty_batches = 0

        # Update progress
        pbar.update(len(batch_examples))

        # Save checkpoint (one JSON object per line, this batch only)
        checkpoint_counter += 1
        checkpoint_file = os.path.join(
            CONFIG['checkpoint_dir'],
            f"checkpoint_{checkpoint_counter:04d}_{int(time.time())}.jsonl"
        )

        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(asdict(ex)) + '\n' for ex in batch_examples)

        elapsed = time.time() - start_time
        rate = len(generator.examples) / elapsed if elapsed > 0 else 0
        eta = (total_target - len(generator.examples)) / rate if rate > 0 else 0

        pbar.set_postfix({
            'rate': f'{rate:.1f}/s',
            'ETA': f'{eta/60:.0f}m'
        })
finally:
    # Close the bar even when generation is interrupted
    pbar.close()

# Save final dataset. The "_7k_final_" name is what the download cell globs
# for, so it is kept even if CONFIG['total_examples'] is changed.
elapsed = time.time() - start_time
final_file = os.path.join(
    CONFIG['checkpoint_dir'],
    f"{SELECTED_ACTION}_7k_final_{int(time.time())}.jsonl"
)

with open(final_file, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(asdict(ex)) + '\n' for ex in generator.examples)

average_rate = len(generator.examples) / elapsed if elapsed > 0 else 0

print("\n" + "="*60)
print("🎉 GENERATION COMPLETE!")
print("="*60)
print(f"Cognitive action: {SELECTED_ACTION}")
print(f"Examples generated: {len(generator.examples):,}")
if len(generator.examples) < total_target:
    print(f"⚠️  Target was {total_target:,}; generation stopped short.")
print(f"Time elapsed: {elapsed/3600:.2f} hours ({elapsed/60:.1f} minutes)")
print(f"Average rate: {average_rate:.1f} examples/sec")
print(f"Checkpoints saved: {checkpoint_counter}")
print(f"Final dataset: {final_file}")
print("="*60)

## 🔟 Preview Examples

In [None]:
# Print up to ten randomly chosen examples, each tagged with its domain
divider = "=" * 60
print(f"\n📝 Sample examples for '{SELECTED_ACTION}':\n")
print(divider)

sample_size = min(10, len(generator.examples))
sample_examples = random.sample(generator.examples, sample_size)
for number, example in enumerate(sample_examples, start=1):
    # Header line, indented text, then a blank separator line
    print(f"{number}. [{example.domain}]\n   {example.text}\n")

print(divider)

## 1️⃣1️⃣ Statistics

In [None]:
import pandas as pd

# Domain distribution (explicit dtype keeps an empty dataset well-defined)
domains = [ex.domain for ex in generator.examples]
domain_counts = pd.Series(domains, dtype="object").value_counts()

print("\n📊 STATISTICS")
print("="*60)
print(f"Total examples: {len(generator.examples):,}")
print(f"Unique domains: {len(domain_counts)}")
print(f"\nTop 10 domains:")
print(domain_counts.head(10))

# Text length statistics, measured in whitespace-separated words
text_lengths = [len(ex.text.split()) for ex in generator.examples]
print(f"\nText length (words):")
if text_lengths:
    print(f"  Mean: {sum(text_lengths)/len(text_lengths):.1f}")
    print(f"  Min: {min(text_lengths)}")
    print(f"  Max: {max(text_lengths)}")
else:
    # Mean/min/max are undefined for an empty dataset; report that instead
    # of failing with ZeroDivisionError / ValueError.
    print("  No examples available")
print("="*60)

## 1️⃣2️⃣ Download Final Dataset

In [None]:
import glob

from google.colab import files

# Locate every final dataset written for the selected action
final_pattern = os.path.join(
    CONFIG['checkpoint_dir'],
    f"{SELECTED_ACTION}_7k_final_*.jsonl",
)
final_files = glob.glob(final_pattern)

if not final_files:
    print("❌ No final file found")
else:
    # Several runs may have left files behind; take the most recent by ctime
    latest_file = max(final_files, key=os.path.getctime)
    print(f"Downloading: {os.path.basename(latest_file)}")
    files.download(latest_file)
    print("✅ Download started!")

## 🎉 Done!

You now have up to 7,000 simple, first-person examples for the cognitive action you set in `SELECTED_ACTION`!

### Dataset saved to:
- **Google Drive**: `/content/drive/MyDrive/cognitive_data_{SELECTED_ACTION}/`
- **Checkpoints**: Every 250 examples
- **Final file**: `{SELECTED_ACTION}_7k_final_[timestamp].jsonl`

### To generate for another cognitive action:
1. Change `SELECTED_ACTION` in cell 5
2. Rerun from cell 5 onwards

### Example format:
```json
{
  "text": "I need to analyze my monthly budget...",
  "cognitive_action": "analyzing",
  "domain": "finances"
}
```