# Batch Event Capture

Automated overnight capture for building the knowledge pipeline.

**Features:**
- Long-running capture (hours)
- Progress monitoring
- Automatic file rotation
- RAG pipeline integration

In [None]:
import sys
import os
from pathlib import Path
import time
import json
from datetime import datetime
from collections import Counter
from IPython.display import clear_output

# Make the shared `lib` directory (two levels above the working directory)
# importable, without adding a duplicate entry when the cell is re-run.
lib_path = Path.cwd().parent.parent.joinpath('lib')
if sys.path.count(str(lib_path)) == 0:
    sys.path.insert(0, str(lib_path))

from cdp_notebook import CDPCapture, MockCDPCapture

## 1. Configuration

In [None]:
# --- Capture behaviour -------------------------------------------------
DURATION_HOURS = 8      # total capture length, in hours
UPDATE_INTERVAL = 60    # seconds between progress refreshes
USE_MOCK = False        # True swaps in the mock capture class for testing

# --- Where recordings are written ---------------------------------------
OUTPUT_DIR = Path.home().joinpath('rugs_recordings', 'batch_captures')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# One output file per session, named after the session's start timestamp.
SESSION_ID = f"{datetime.now():%Y%m%d_%H%M%S}"
OUTPUT_FILE = OUTPUT_DIR / f'batch_{SESSION_ID}.jsonl'

# Echo the effective configuration so it is recorded in the cell output.
for banner_line in (
    "Batch Capture Configuration",
    "=" * 40,
    f"Duration: {DURATION_HOURS} hours",
    f"Output: {OUTPUT_FILE}",
    f"Update interval: {UPDATE_INTERVAL}s",
):
    print(banner_line)

## 2. Connect and Start Recording

In [None]:
# Build the capture client: the real CDP client unless USE_MOCK is set.
capture = CDPCapture() if not USE_MOCK else MockCDPCapture()

# Recording only starts once the connection attempt reports success.
connected = capture.connect()
if not connected:
    print("\nConnection failed. Check Chrome CDP is running.")
else:
    capture.start_recording(str(OUTPUT_FILE))
    print("\nCapture started. Run the next cell to monitor progress.")

## 3. Monitor Progress

Run this cell to monitor the batch capture. Interrupt the kernel (Kernel > Interrupt) to stop early.

In [None]:
# Monitor the running capture until DURATION_HOURS have elapsed or the
# kernel is interrupted. Each pass redraws a progress report built from
# `capture.events` and `capture.get_event_counts()`.
start_time = time.time()
duration_seconds = DURATION_HOURS * 3600

print(f"Starting {DURATION_HOURS}-hour batch capture...")
print("Press STOP (square button) or Kernel > Interrupt to stop early.")
print()

try:
    while True:
        # Read the clock once per pass so every figure below is consistent.
        elapsed = time.time() - start_time
        remaining = duration_seconds - elapsed
        if remaining <= 0:
            break

        elapsed_hours = elapsed / 3600
        remaining_hours = remaining / 3600

        events_count = len(capture.events)
        events_per_hour = events_count / elapsed_hours if elapsed_hours > 0 else 0

        clear_output(wait=True)

        # Progress bar
        pct = elapsed / duration_seconds * 100
        bar_len = 40
        filled = int(bar_len * elapsed / duration_seconds)
        bar = '=' * filled + '-' * (bar_len - filled)

        print("BATCH CAPTURE IN PROGRESS")
        print("=" * 50)
        print(f"[{bar}] {pct:.1f}%")
        print()
        print(f"Elapsed:    {elapsed_hours:.2f}h / {DURATION_HOURS}h")
        print(f"Remaining:  {remaining_hours:.2f}h")
        print(f"Events:     {events_count:,}")
        print(f"Rate:       {events_per_hour:,.0f} events/hour")
        print()

        # Event distribution. The ordering returned by get_event_counts() is
        # not visible from this notebook, so sort by count explicitly to make
        # the "top 5" label accurate (ties keep their original order).
        counts = capture.get_event_counts()
        top_counts = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:5]
        print("Event Distribution (top 5):")
        for name, count in top_counts:
            print(f"  {name}: {count:,}")

        print()
        print(f"Output: {OUTPUT_FILE}")

        # Never sleep past the configured end of the capture window.
        time.sleep(min(UPDATE_INTERVAL, remaining))

except KeyboardInterrupt:
    print("\n\nCapture interrupted by user.")

finally:
    # Final stats (printed on normal completion and on interrupt alike)
    elapsed = time.time() - start_time
    print("\nCapture Complete!")
    print(f"Duration: {elapsed/3600:.2f} hours")
    print(f"Total events: {len(capture.events):,}")

## 4. Stop Recording

In [None]:
# Stop the recording, close the connection, then sanity-check the output file.
filepath = capture.stop_recording()
capture.disconnect()

# Verify output
if filepath and Path(filepath).exists():
    size_mb = Path(filepath).stat().st_size / (1024 * 1024)
    # Count lines in binary mode: no text decoding is needed just to count
    # records, and it avoids a UnicodeDecodeError when the platform's default
    # encoding does not match the file's.
    with open(filepath, 'rb') as f:
        line_count = sum(1 for _ in f)

    print("\nOutput Summary:")
    print(f"  File: {filepath}")
    print(f"  Size: {size_mb:.2f} MB")
    print(f"  Events: {line_count:,}")
else:
    # Make a missing recording visible instead of finishing silently.
    print(f"\nWarning: recording output not found (stop_recording returned {filepath!r}).")

## 5. Ingest to Knowledge Pipeline

Feed captured events into the RAG pipeline:

In [None]:
# Locate the RAG pipeline directory under $CLAUDE_FLOW_ROOT and print either
# the manual ingestion commands or a hint about the missing setup.
RAG_PIPELINE = Path(os.getenv('CLAUDE_FLOW_ROOT', ''), 'rag-pipeline')

if not RAG_PIPELINE.exists():
    print("RAG Pipeline not found.")
    print("Set CLAUDE_FLOW_ROOT environment variable.")
else:
    for hint_line in (
        "RAG Pipeline available at:",
        f"  {RAG_PIPELINE}",
        "",
        "To ingest captured events, run:",
        "",
        f"  cd {RAG_PIPELINE}",
        "  source .venv/bin/activate",
        f"  python -m ingestion.jsonl_ingest {OUTPUT_FILE}",
    ):
        print(hint_line)

In [None]:
# Optional: Run ingestion from notebook
RUN_INGESTION = False  # Set True to run


def resolve_ingest_python(pipeline_root):
    """Return the interpreter command to use for the ingestion subprocess.

    Prefers ``<pipeline_root>/.venv/bin/python`` when that file exists, so the
    in-notebook run matches the manual instructions (which activate ``.venv``
    first). Falls back to plain ``'python'`` resolved via PATH otherwise.
    """
    venv_python = Path(pipeline_root) / '.venv' / 'bin' / 'python'
    return str(venv_python) if venv_python.is_file() else 'python'


if RUN_INGESTION and RAG_PIPELINE.exists():
    import subprocess

    print("Running RAG ingestion...")
    result = subprocess.run(
        [resolve_ingest_python(RAG_PIPELINE), '-m', 'ingestion.jsonl_ingest', str(OUTPUT_FILE)],
        cwd=str(RAG_PIPELINE),  # run from the pipeline directory, as in the manual instructions
        capture_output=True,
        text=True
    )

    if result.returncode == 0:
        print("Ingestion complete!")
        print(result.stdout)
    else:
        print("Ingestion failed:")
        print(result.stderr)

## 6. Batch Capture History

In [None]:
# List previous batch captures (the ten most recently modified), showing
# each file's size and line count.
import pandas as pd

captures = list(OUTPUT_DIR.glob('batch_*.jsonl'))
captures.sort(key=lambda x: x.stat().st_mtime, reverse=True)

if captures:
    data = []
    for f in captures[:10]:
        size_mb = f.stat().st_size / (1024 * 1024)
        # Count lines in binary mode: counting records needs no text decoding,
        # and this avoids a UnicodeDecodeError when the platform's default
        # encoding does not match the file's.
        with open(f, 'rb') as fp:
            lines = sum(1 for _ in fp)
        data.append({
            'file': f.name,
            'size_mb': f"{size_mb:.1f}",
            'events': f"{lines:,}"
        })

    print(f"Recent batch captures ({OUTPUT_DIR}):")
    # `display` is not imported in this notebook; presumably the IPython
    # built-in available in notebook sessions.
    display(pd.DataFrame(data))
else:
    print("No batch captures found.")