# Cognitive Action Training Data Generator (Google Colab)

Generate 7,000 high-quality stratified training examples for cognitive action recognition.

**Based on Scientific Taxonomies:**
- Bloom's Taxonomy (cognitive processes)
- Guilford's Structure of Intellect
- Krathwohl's Affective Domain
- Gross's Emotion Regulation Model

**Features:**
- ⚡ Async parallel processing (16x speedup)
- 🎯 Stratified sampling across 45 cognitive actions
- 💾 Auto-checkpointing to Google Drive after every batch of up to 100 examples (batches are per cognitive action)
- 🚀 Optimized for 40GB VRAM (Google Colab A100)

**Estimated Time:** ~3.7 hours for 7,000 examples

## 1️⃣ Install Dependencies

In [None]:
# Install required packages
!pip install -q requests pandas numpy tqdm matplotlib seaborn aiohttp nest-asyncio

# Clone the repository
# The directory check makes this cell safe to re-run: the clone is attempted
# only when no 'datagen' entry exists in the current working directory.
# NOTE(review): the success message below is printed without checking git's
# exit status, so a failed clone is still reported as successful.
import os
if not os.path.exists('datagen'):
    print("📥 Cloning datagen repository...")
    !git clone https://github.com/ChuloIva/datagen.git
    print("✅ Repository cloned successfully!")
else:
    print("✅ Repository already exists")

# Import libraries
import json
import time
import random
import asyncio
import aiohttp
import nest_asyncio
import requests
import pandas as pd
import numpy as np
import subprocess
from typing import List, Dict, Any, Optional
from dataclasses import dataclass

# Apply nest_asyncio for Jupyter/Colab compatibility
# (patches asyncio so an event loop can be run from inside the notebook's
# already-running loop)
nest_asyncio.apply()

# Set random seeds
# Python's `random` drives the template shuffling in the generation cell, so a
# fixed seed makes that ordering reproducible.
# NumPy is seeded as well; presumably for code inside the datagen modules —
# no NumPy randomness is used directly in this notebook (TODO confirm).
random.seed(42)
np.random.seed(42)

print("✅ Dependencies installed successfully!")

## 2️⃣ Install & Configure Ollama

This cell installs Ollama and configures it for maximum parallel processing.

In [None]:
# Install Ollama
!curl -fsSL https://ollama.ai/install.sh | sh

# Stop any existing Ollama processes so the server started below is the one
# that picks up the environment configured in this cell.
print("🛑 Stopping any existing Ollama processes...")
subprocess.run(['pkill', '-9', 'ollama'], stderr=subprocess.DEVNULL)
time.sleep(2)  # give the killed process a moment to release the port

# Set environment variables for high parallelism
print("\n⚙️  Configuring Ollama for maximum GPU utilization...")
os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
os.environ['OLLAMA_ORIGINS'] = '*'
os.environ['OLLAMA_NUM_PARALLEL'] = '16'  # 16 parallel context buffers
os.environ['OLLAMA_MAX_QUEUE'] = '512'
os.environ['OLLAMA_MAX_LOADED_MODELS'] = '1'
os.environ['LD_LIBRARY_PATH'] = '/usr/lib64-nvidia'

# Report the values actually exported so the printout cannot drift from the
# configuration above.
num_parallel = os.environ['OLLAMA_NUM_PARALLEL']
print("Configuration:")
print(f"  OLLAMA_NUM_PARALLEL: {num_parallel} ({num_parallel} concurrent requests)")
print(f"  OLLAMA_MAX_QUEUE: {os.environ['OLLAMA_MAX_QUEUE']}")

# Start Ollama server
# The process handle is kept so an early exit can be detected while waiting.
print("\n🚀 Starting Ollama server...")
server_process = subprocess.Popen(['ollama', 'serve'],
                                  env=os.environ.copy(),
                                  stdout=subprocess.DEVNULL,
                                  stderr=subprocess.DEVNULL)

# Poll the API until it answers instead of sleeping a fixed 10 seconds: a slow
# start no longer produces a false failure and a fast start no longer wastes
# time.
print("⏳ Waiting for Ollama to start...")
startup_timeout_s = 60
poll_interval_s = 1
deadline = time.time() + startup_timeout_s
ollama_ready = False
failure_message = "❌ Connection error: server did not respond before the timeout"

while time.time() < deadline:
    exit_code = server_process.poll()
    if exit_code is not None:
        # The server died; further polling cannot succeed.
        failure_message = f"❌ Connection error: 'ollama serve' exited with code {exit_code}"
        break
    try:
        response = requests.get('http://localhost:11434/api/tags', timeout=5)
    except requests.exceptions.RequestException as e:
        # Usually "connection refused" while the server is still booting.
        failure_message = f"❌ Connection error: {e}"
    else:
        if response.status_code == 200:
            ollama_ready = True
            break
        failure_message = "❌ Ollama error"
    time.sleep(poll_interval_s)

# Verify Ollama is running
if ollama_ready:
    print("✅ Ollama is running!")
else:
    print(failure_message)

print("\n💡 VRAM Usage: Model loads at ~18GB. During generation, VRAM will increase to ~38GB with 16 parallel requests.")

## 3️⃣ Pull the Model

Download the gemma3:27b model (this may take several minutes).

In [None]:
# Pull the model through the Ollama CLI. The tag must match CONFIG['model']
# and the default `model` argument of OllamaClient.generate.
print("📥 Pulling gemma3:27b model (this may take 5-10 minutes)...")
!ollama pull gemma3:27b
# NOTE(review): printed unconditionally — the exit status of the pull above is
# not checked.
print("\n✅ Model ready!")

## 4️⃣ Load Data Generation Modules

In [None]:
# Add datagen to Python path
# The imports below are resolved from the cloned repo directory, so it has to
# be on sys.path; the membership check avoids duplicate entries when the cell
# is re-run.
import sys
datagen_dir = os.path.abspath('datagen')
if datagen_dir not in sys.path:
    sys.path.insert(0, datagen_dir)

# Import modules
from variable_pools import COGNITIVE_ACTIONS
# NOTE(review): wildcard import — no name from prompt_templates is referenced
# directly by the cells in this notebook; kept in case other code relies on it.
from prompt_templates import *
from data_generator import CognitiveDataGenerator

print(f"✅ Loaded {len(COGNITIVE_ACTIONS)} cognitive actions")
print(f"✅ Data generator ready")

## 5️⃣ Create Ollama Client

In [None]:
class OllamaClient:
    """Small blocking HTTP client for the Ollama `/api/generate` endpoint.

    Args:
        base_url: Root URL of the Ollama server.
        timeout: Per-request timeout in seconds handed to `requests`.
            Defaults to 120, the value that was previously hard-coded.
    """

    def __init__(self, base_url="http://localhost:11434", timeout=120):
        self.base_url = base_url
        self.timeout = timeout
        # One Session so connections to the server are reused across calls.
        self.session = requests.Session()

    def generate(self, model="gemma3:27b", prompt="", stream=False):
        """Send a generation request to the server.

        Args:
            model: Ollama model tag to run.
            prompt: Prompt text to send.
            stream: When False, return the decoded JSON body. When True,
                return an iterator over the raw response lines.

        Returns:
            The parsed JSON response, a line iterator when `stream` is True,
            or None when the request fails (the error is printed).
        """
        url = f"{self.base_url}/api/generate"
        payload = {"model": model, "prompt": prompt, "stream": stream}
        try:
            # `stream` must also be given to requests: without it the whole
            # body is downloaded before post() returns, so iter_lines() would
            # only replay an already-buffered response.
            response = self.session.post(
                url,
                json=payload,
                timeout=self.timeout,
                stream=stream,
            )
            response.raise_for_status()
            if stream:
                return response.iter_lines()
            return response.json()
        except requests.exceptions.RequestException as e:
            # Best-effort by design: callers receive None instead of an
            # exception.
            print(f"Error: {e}")
            return None

# Initialize client
# Module-level instance using the default localhost URL; it is handed to
# CognitiveDataGenerator in the generation cell.
ollama = OllamaClient()
print("✅ Ollama client ready")

## 6️⃣ Mount Google Drive (for checkpoints)

In [None]:
# Mount Google Drive so checkpoint files are written outside the Colab VM's
# local disk.
from google.colab import drive
drive.mount('/content/drive')

# Create checkpoint directory
# exist_ok=True keeps re-runs of this cell from failing when the folder is
# already there.
checkpoint_dir = '/content/drive/MyDrive/cognitive_data_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
print(f"✅ Checkpoints will be saved to: {checkpoint_dir}")

## 7️⃣ Configuration

In [None]:
# Central run settings; later cells read everything from this dict.
CONFIG = dict(
    total_examples=7000,
    model='gemma3:27b',
    max_parallel=16,
    checkpoint_interval=100,
    checkpoint_dir=checkpoint_dir,
)

# Even share per cognitive action (floor division).
examples_per_action = CONFIG['total_examples'] // len(COGNITIVE_ACTIONS)
# Rough wall-clock estimate; the factor 30 is presumably seconds per round of
# parallel requests (TODO confirm), and 3600 converts seconds to hours.
estimated_hours = CONFIG['total_examples'] / CONFIG['max_parallel'] * 30 / 3600

# Emit the whole summary with one call; sep="\n" yields the same output as
# printing line by line.
print(
    "=" * 60,
    "GENERATION CONFIGURATION",
    "=" * 60,
    f"Total examples: {CONFIG['total_examples']:,}",
    f"Cognitive actions: {len(COGNITIVE_ACTIONS)}",
    f"Examples per action: ~{examples_per_action}",
    f"Parallel requests: {CONFIG['max_parallel']}",
    f"Checkpoint every: {CONFIG['checkpoint_interval']} examples",
    f"\nEstimated time: {estimated_hours:.1f} hours ({estimated_hours*60:.0f} minutes)",
    "\nTemplate mix: 70% single, 20% chain, 5% dialogue, 5% thought-stream",
    "=" * 60,
    sep="\n",
)

## 8️⃣ Generate 7,000 Examples (Stratified)

⚠️ **This will take ~3.7 hours. Checkpoints are saved every 100 examples.**

In [None]:
# Local import: this cell is the only user of Counter.
from collections import Counter

# Initialize generator
generator = CognitiveDataGenerator(
    ollama_client=ollama,
    max_parallel=CONFIG['max_parallel']
)

total_target = CONFIG['total_examples']
checkpoint_interval = CONFIG['checkpoint_interval']
num_actions = len(COGNITIVE_ACTIONS)

# Fail early with a clear message instead of a ZeroDivisionError further down.
if num_actions == 0:
    raise ValueError("COGNITIVE_ACTIONS is empty; nothing to generate")
if checkpoint_interval <= 0:
    raise ValueError("CONFIG['checkpoint_interval'] must be a positive integer")

# Floor division alone drops total_target % num_actions examples (7000 over
# 45 actions would yield 6,975). The remainder is handed out one extra example
# at a time to the first `leftover_examples` actions so the total is exact.
examples_per_action, leftover_examples = divmod(total_target, num_actions)

# Share of each template type within one action's examples.
TEMPLATE_MIX = (
    ("single", 0.7),
    ("chain", 0.2),
    ("dialogue", 0.05),
    ("thought_stream", 0.05),
)


def _build_template_plan(count):
    """Return a shuffled list of `count` template names following TEMPLATE_MIX."""
    plan = []
    for template_type, share in TEMPLATE_MIX:
        plan.extend([template_type] * int(count * share))
    # int() truncation can leave the plan short; pad with the dominant type.
    plan.extend(["single"] * (count - len(plan)))
    plan = plan[:count]
    random.shuffle(plan)
    return plan


def _write_checkpoint(path, examples):
    """Write `examples` to `path` as JSON Lines, one example per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for ex in examples:
            record = {
                'text': ex.text,
                'primary_cognitive_action': ex.primary_cognitive_action,
                'secondary_actions': ex.secondary_actions,
                'domain': ex.domain,
                'complexity': ex.complexity,
                'perspective': ex.perspective,
                'format_type': ex.format_type,
                'metadata': ex.metadata
            }
            f.write(json.dumps(record) + '\n')


print("="*60)
print("🚀 STARTING STRATIFIED GENERATION")
print("="*60)
print(f"Target: {total_target:,} examples")
print(f"Actions: {num_actions}")
print(f"Per action: {examples_per_action}")
if leftover_examples:
    print(f"  (+1 for the first {leftover_examples} actions to reach the target exactly)")
print(f"Parallel: {CONFIG['max_parallel']}")
print("="*60 + "\n")

start_time = time.time()
checkpoint_counter = 0

# Generate for each cognitive action
for action_idx, action in enumerate(COGNITIVE_ACTIONS, 1):
    print(f"\n[{action_idx}/{num_actions}] {action}")

    # Early actions absorb the remainder of the integer division.
    action_target = examples_per_action + (1 if action_idx <= leftover_examples else 0)

    # Mix template types
    template_dist = _build_template_plan(action_target)

    # Generate in checkpoint batches (ceiling division)
    num_batches = (action_target + checkpoint_interval - 1) // checkpoint_interval

    for batch_idx in range(num_batches):
        start_idx = batch_idx * checkpoint_interval
        batch_templates = template_dist[start_idx:start_idx + checkpoint_interval]

        # Count templates; Counter keeps first-seen order, so requests are
        # issued in the same order as with the manual dict.
        template_counts = Counter(batch_templates)

        # Generate one request group per template type
        batch_examples = []
        for template_type, count in template_counts.items():
            print(f"  [{batch_idx+1}/{num_batches}] {count} {template_type}...")
            examples = generator.generate_batch(
                batch_size=count,
                cognitive_action=action,
                template_type=template_type,
                model=CONFIG['model']
            )
            batch_examples.extend(examples)

        # Save checkpoint
        checkpoint_counter += 1
        checkpoint_file = os.path.join(
            CONFIG['checkpoint_dir'],
            f"checkpoint_{checkpoint_counter:04d}_{action}_{int(time.time())}.jsonl"
        )
        _write_checkpoint(checkpoint_file, batch_examples)

        generated_so_far = len(generator.generated_examples)
        progress = generated_so_far / total_target * 100
        print(f"  ✓ Checkpoint {checkpoint_counter} | Progress: {generated_so_far:,}/{total_target:,} ({progress:.1f}%)")

# Final export
elapsed = time.time() - start_time
final_file = os.path.join(
    CONFIG['checkpoint_dir'],
    f"cognitive_actions_7k_final_{int(time.time())}.jsonl"
)
generator.export_dataset(final_file, format="jsonl")

print("\n" + "="*60)
print("🎉 GENERATION COMPLETE!")
print("="*60)
print(f"Examples generated: {len(generator.generated_examples):,}")
print(f"Time elapsed: {elapsed/3600:.2f} hours ({elapsed/60:.1f} minutes)")
print(f"Checkpoints saved: {checkpoint_counter}")
print(f"Final dataset: {final_file}")
print(f"Errors: {len(generator.generation_stats['errors'])}")
print("="*60)

# Print statistics
generator.print_statistics()

## 9️⃣ Download Final Dataset

Your data is saved in Google Drive, but you can also download it directly.

In [None]:
import glob

from google.colab import files

# Final exports carry a timestamp in their name, so several may exist; pick
# the one with the greatest os.path.getctime value.
final_pattern = os.path.join(CONFIG['checkpoint_dir'], "cognitive_actions_7k_final_*.jsonl")
final_files = glob.glob(final_pattern)

if not final_files:
    print("❌ No final file found")
else:
    latest_file = max(final_files, key=os.path.getctime)
    print(f"Downloading: {os.path.basename(latest_file)}")
    files.download(latest_file)
    print("✅ Download started!")

## 🎉 Done!

You now have 7,000 stratified cognitive action examples saved to:
- **Google Drive**: `/content/drive/MyDrive/cognitive_data_checkpoints/`
- **Checkpoints**: One file per batch of up to 100 examples, written per cognitive action (for recovery)
- **Final dataset**: `cognitive_actions_7k_final_[timestamp].jsonl`

### Next Steps:
1. Download the final JSONL file
2. Load it into your training pipeline
3. Fine-tune your cognitive action recognition model

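**Note**: If generation was interrupted, the examples produced so far are preserved in the checkpoint files on Google Drive. This notebook does not resume automatically — re-running section 8 starts generation again from the first action — so merge the existing checkpoint files manually if you want to keep partial results.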