# RL Swarm Coordinator Node (Google Colab)

This notebook runs a **coordinator node** that:
- Manages round/stage progression
- Coordinates with worker nodes via Google Drive
- Shares rollouts via Google Drive (no P2P networking)
- Participates in training
- No blockchain, Docker, or peer identity required

**Before running:**
1. Mount your Google Drive
2. Set experiment configuration below
3. Configure rollout sharing frequency and retention
4. Run all cells in order

**For worker nodes:** Use `colab_worker.ipynb` with the **same EXPERIMENT_NAME**

## 1. Configuration

In [None]:
# Notebook-wide configuration. Later cells read these module-level names,
# so this cell must be executed first.

# Experiment Configuration
EXPERIMENT_NAME = 'my_first_experiment'  # Must be same across all nodes
NODE_ROLE = 'coordinator'  # DO NOT CHANGE
NODE_ID = 'coordinator_0'  # Unique ID for this node

# Model Configuration
MODEL_NAME = 'Gensyn/Qwen2.5-0.5B-Instruct'  # HuggingFace model
SEED = 42

# Training Configuration
MAX_ROUNDS = 1000
NUM_GENERATIONS = 2
NUM_TRANSPLANT_TREES = 2

# Coordinator Configuration
ADVANCEMENT_STRATEGY = 'hybrid'  # 'time_based', 'completion_based', or 'hybrid'
ROUND_DURATION_MINUTES = 10
MIN_SUBMISSION_PERCENT = 0.5
MAX_ROUND_DURATION_MINUTES = 20

# Rollout Sharing Configuration
ROLLOUT_PUBLISH_FREQUENCY = 'stage'  # 'generation', 'stage', or 'round'
ROLLOUT_CLEANUP_ENABLED = False      # Set to True to enable cleanup
ROLLOUT_KEEP_LAST_N_ROUNDS = 10      # Only used if cleanup enabled
ROLLOUT_ARCHIVE_OLD = False          # Archive instead of delete

# Optional: HuggingFace Token (for pushing trained models)
HUGGINGFACE_TOKEN = None  # Set to your token or keep None

# Echo the key settings so they are visible in the cell output.
print(f"✓ Experiment: {EXPERIMENT_NAME}")
print(f"✓ Node Role: {NODE_ROLE}")
print(f"✓ Node ID: {NODE_ID}")
print(f"✓ Model: {MODEL_NAME}")
print(f"✓ Rollout frequency: {ROLLOUT_PUBLISH_FREQUENCY}")
print(f"✓ Cleanup enabled: {ROLLOUT_CLEANUP_ENABLED}")

## 2. Mount Google Drive

In [None]:
from google.colab import drive
import os

# Mount Google Drive into the Colab filesystem
drive.mount('/content/drive')

# Base directory on Drive used by this notebook; created on first run
GDRIVE_BASE_PATH = '/content/drive/MyDrive/rl-swarm'
os.makedirs(GDRIVE_BASE_PATH, exist_ok=True)

print(f"✓ Google Drive mounted at: {GDRIVE_BASE_PATH}")

## 3. System Setup & Dependencies

In [None]:
# Check GPU availability and report the first CUDA device, if any
import torch

if torch.cuda.is_available():
    print(f"✓ GPU available: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; show it in GB
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠ No GPU detected - training will be slow")
    print("  Consider: Runtime > Change runtime type > GPU")

In [None]:
# Clone repository
import os

# Change to safe directory first
%cd /content

# Remove existing directory if it exists
if os.path.exists('/content/rl-swarm'):
    print("Removing existing repository...")
    !rm -rf /content/rl-swarm

# Clone fresh copy
print("Cloning repository...")
!git clone https://github.com/Elrashid/rl-swarm.git /content/rl-swarm

# Change to repo directory
%cd /content/rl-swarm

# Verify clone worked
if not os.path.exists('requirements.txt'):
    print("â‌Œ Clone failed! requirements.txt not found")
    raise FileNotFoundError("Repository clone failed")

print("âœ“ Repository cloned successfully")

# Install dependencies
print("Installing dependencies (this may take 3-5 minutes)...")
!pip install -q -r requirements.txt
!pip install -q gensyn-genrl==0.1.9

print("âœ“ Dependencies installed")

## 6. Set Environment Variables

In [None]:
import os

# Export the notebook configuration as environment variables
os.environ['GDRIVE_PATH'] = GDRIVE_BASE_PATH
os.environ['EXPERIMENT_NAME'] = EXPERIMENT_NAME
os.environ['NODE_ROLE'] = NODE_ROLE
os.environ['NODE_ID'] = NODE_ID
os.environ['MODEL_NAME'] = MODEL_NAME
os.environ['SEED'] = str(SEED)  # environment values must be strings

# The token is optional; only export it when one was provided
if HUGGINGFACE_TOKEN:
    os.environ['HUGGINGFACE_ACCESS_TOKEN'] = HUGGINGFACE_TOKEN


print("✓ Environment variables set")

## 7. Initialize Experiment

In [None]:
import os
import uuid

# Export the notebook configuration as environment variables
os.environ['GDRIVE_PATH'] = GDRIVE_BASE_PATH
os.environ['EXPERIMENT_NAME'] = EXPERIMENT_NAME
os.environ['NODE_ROLE'] = NODE_ROLE
# Fall back to a random "coord_<8 hex chars>" ID when NODE_ID is empty or None
os.environ['NODE_ID'] = NODE_ID or f"coord_{uuid.uuid4().hex[:8]}"
os.environ['MODEL_NAME'] = MODEL_NAME
os.environ['SEED'] = str(SEED)

# Rollout configuration (non-string values are converted with str())
os.environ['ROLLOUT_PUBLISH_FREQUENCY'] = ROLLOUT_PUBLISH_FREQUENCY
os.environ['ROLLOUT_CLEANUP_ENABLED'] = str(ROLLOUT_CLEANUP_ENABLED)
os.environ['ROLLOUT_KEEP_LAST_N_ROUNDS'] = str(ROLLOUT_KEEP_LAST_N_ROUNDS)
os.environ['ROLLOUT_ARCHIVE_OLD'] = str(ROLLOUT_ARCHIVE_OLD)

# The token is optional; only export it when one was provided
if HUGGINGFACE_TOKEN:
    os.environ['HUGGINGFACE_ACCESS_TOKEN'] = HUGGINGFACE_TOKEN


print("✓ Environment variables set")
print(f"  Node ID: {os.environ['NODE_ID']}")
print(f"  Rollout frequency: {ROLLOUT_PUBLISH_FREQUENCY}")

## 8. Start Training & Coordination

**This cell will run until interrupted or max rounds reached.**

The coordinator will:
- Advance rounds based on configured strategy
- Coordinate with worker nodes via Google Drive
- Train the model and log metrics
- Save checkpoints every 10 rounds

**Monitor logs below. Press the stop button to gracefully shutdown.**

In [None]:
from rgym_exp.utils.experiment_manager import init_experiment

# Initialize experiment structure in Google Drive.
# Keys are dotted config paths; values come from the configuration cell.
config_overrides = {
    'training.max_round': MAX_ROUNDS,
    'training.num_generations': NUM_GENERATIONS,
    'training.num_transplant_trees': NUM_TRANSPLANT_TREES,
    'training.seed': SEED,
    'coordinator_manager.advancement_strategy': ADVANCEMENT_STRATEGY,
    'coordinator_manager.round_duration_minutes': ROUND_DURATION_MINUTES,
    'coordinator_manager.min_submission_percent': MIN_SUBMISSION_PERCENT,
    'coordinator_manager.max_round_duration_minutes': MAX_ROUND_DURATION_MINUTES,
    'communication.rollout_publish_frequency': ROLLOUT_PUBLISH_FREQUENCY,
    'communication.rollout_retention.cleanup_enabled': ROLLOUT_CLEANUP_ENABLED,
    'communication.rollout_retention.keep_last_n_rounds': ROLLOUT_KEEP_LAST_N_ROUNDS,
    'communication.rollout_retention.archive_old_rollouts': ROLLOUT_ARCHIVE_OLD,
}

init_experiment(
    gdrive_base_path=GDRIVE_BASE_PATH,
    experiment_name=EXPERIMENT_NAME,
    config_overrides=config_overrides
)

print(f"✓ Experiment initialized: {EXPERIMENT_NAME}")
print(f"  Path: {GDRIVE_BASE_PATH}/experiments/{EXPERIMENT_NAME}")
print(f"  Rollout frequency: {ROLLOUT_PUBLISH_FREQUENCY}")
print(f"  Cleanup enabled: {ROLLOUT_CLEANUP_ENABLED}")

In [None]:
from rgym_exp.utils.notebook_utils import run_with_live_output
import sys

# Startup banner
print("="*60)
print(f"Starting Coordinator Node: {NODE_ID}")
print(f"Experiment: {EXPERIMENT_NAME}")
print(f"Model: {MODEL_NAME}")
print("="*60)
print()

# Run training with live output
# Output will stream in real-time below
# The launcher module is started with the current interpreter (sys.executable).
exit_code = run_with_live_output([
    sys.executable, '-m', 'rgym_exp.runner.swarm_launcher'
])

# Report the outcome; -1 is reported as a user interrupt, any other
# non-zero code as a failure.
if exit_code == -1:
    print("\n⚠️  Training interrupted by user")
elif exit_code != 0:
    print(f"\n❌ Training exited with code: {exit_code}")
else:
    print("\n✅ Training completed successfully")

## 9. Monitor Progress (Optional)

Run this cell in a separate window to monitor progress while training continues.

In [None]:
from rgym_exp.utils.experiment_manager import get_experiment_status, get_experiment_metrics
import pandas as pd

# Get current status (used below as a dict-like object)
status = get_experiment_status(GDRIVE_BASE_PATH, EXPERIMENT_NAME)

print(f"Experiment: {EXPERIMENT_NAME}")
print(f"Current Round: {status.get('current_round', 0)}")
print(f"Current Stage: {status.get('current_stage', 0)}")
print(f"Active Peers: {status.get('active_peers', 0)}")

# Optional fields are printed only when present in the status
if 'peer_ids' in status:
    print(f"Peer IDs: {', '.join(status['peer_ids'])}")

if 'total_metric_entries' in status:
    print(f"Total Metric Entries: {status['total_metric_entries']}")

if 'latest_checkpoint' in status:
    print(f"Latest Checkpoint: Round {status['latest_checkpoint']}")

if 'error' in status:
    print(f"⚠️  Error: {status['error']}")

print()

# Load and display recent metrics for this node.
# Any failure is reported rather than raised so the status above stays visible.
try:
    df = get_experiment_metrics(GDRIVE_BASE_PATH, EXPERIMENT_NAME)
    if not df.empty:
        print(f"Recent metrics for {NODE_ID} (last 10 rounds):")
        # Keep only this node's rows, then take the last 10
        recent = df[df['node_id'] == NODE_ID].tail(10)
        if not recent.empty:
            print(recent[['round', 'stage', 'my_reward']].to_string(index=False))
        else:
            print(f"No metrics for {NODE_ID} yet")
    else:
        print("No metrics available yet - training may not have started")
except Exception as e:
    print(f"Could not load metrics: {e}")

In [None]:
# === Real-Time Progress Viewer ===
# Prints a per-node progress summary obtained from get_experiment_progress.
# Can be run at any time, e.g. after reconnecting to a disconnected notebook.

import sys
sys.path.append('/content/rl-swarm')

from rgym_exp.utils.progress_tracker import get_experiment_progress

progress = get_experiment_progress(GDRIVE_BASE_PATH, EXPERIMENT_NAME)
rule = "=" * 70

print(rule)
print("📊 REAL-TIME PROGRESS FROM GDRIVE")
print(rule)
print(f"Experiment: {progress.get('experiment')}")
print()

for node_id, node_data in progress.get('nodes', {}).items():
    # Nodes carrying an 'error' entry get a single-line report.
    if 'error' in node_data:
        print(f"  {node_id}: {node_data['error']}")
        continue

    print(f"  {node_id}:")
    print(f"    Latest event: {node_data.get('latest_event')}")
    print(f"    Current round: {node_data.get('latest_round')}")

    # 'elapsed_seconds' is converted to hours for display (missing -> 0).
    elapsed_hours = node_data.get('elapsed_seconds', 0) / 3600
    print(f"    Elapsed time: {elapsed_hours:.1f} hours")
    print()

print(rule)
print("Note: Progress updates every round. Logs flush every 30s to GDrive.")
print("You can access logs directly in Google Drive even while training!")

## 9.5. Check Real-Time Progress from GDrive (Optional)

**Reconnected after disconnect?** Run the **Real-Time Progress Viewer** cell above to check training progress:
- Shows current round for each node
- Displays elapsed time
- Works even if your notebook disconnected

Progress is saved to GDrive every round, logs flush every 30 seconds.

## 10. Resume Training (If Disconnected)

If your Colab session disconnects:
1. Re-run all cells above (keep the same EXPERIMENT_NAME and NODE_ID)
2. The system will automatically resume from the last checkpoint
3. Training continues from the last saved round