# Social RL: Learning Through Interaction

This notebook demonstrates the Social RL architecture with **vLLM + Qwen** - a novel approach where agents learn from social feedback without explicit reward functions.

| Traditional RL | Social RL |
|---------------|----------|
| Environment | Other agents + theoretical constraints |
| State | Round context + concept manifestations |
| Action | Agent utterance/response |
| Reward | Social feedback (engagement, alignment, contribution) |
| Policy | PRAR process schemas |

## 1. Environment Setup

In [None]:
# Clear GPU and prepare environment
import gc
import torch
import os

# Best-effort cleanup so that a re-run of the notebook starts from a clean GPU.
print("Stopping old processes...")
# `pkill -f` matches against the full command line, so this targets any
# process launched with "vllm" anywhere in its arguments.
!pkill -f vllm

print("Freeing GPU memory...")
# Collect unreachable Python objects first, then ask PyTorch to release
# cached CUDA memory held by this kernel process.
gc.collect()
torch.cuda.empty_cache()

# Print the current GPU state for a visual check.
!nvidia-smi

In [None]:
# Install dependencies
# vllm is pinned to an exact version; openai is left unpinned.
# NOTE(review): `!pip` installs into whichever environment the shell's `pip`
# resolves to. The server is later started with the shell's `python`, while the
# `openai` import runs in the kernel - confirm both point at the same environment.
!pip install -q "vllm==0.6.6" openai

## 2. Launch vLLM Server

In [None]:
import os
import time
import subprocess

os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

# Kill any existing vLLM processes
!pkill -f vllm 2>/dev/null || true
time.sleep(2)

print("Starting vLLM server...")
!nohup python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-7B-Instruct \
  --dtype bfloat16 \
  --port 8000 \
  --host 0.0.0.0 \
  > vllm.log 2>&1 &

# Wait and check for server startup with retries
print("Waiting for server to start...")
max_wait = 120  # 2 minutes max
wait_interval = 10
elapsed = 0

while elapsed < max_wait:
    time.sleep(wait_interval)
    elapsed += wait_interval
    print(f"  [{elapsed}s] Checking server status...")
    
    # Check if server is responding
    result = !curl -s http://127.0.0.1:8000/v1/models 2>/dev/null
    if result and 'Qwen' in str(result):
        print(f"✓ Server ready after {elapsed}s!")
        break
    
    # Check logs for errors
    !tail -n 5 vllm.log
else:
    print(f"✗ Server did not start within {max_wait}s")
    print("\nFull log:")
    !cat vllm.log

In [None]:
# Verify server is running: fetch the model list and pretty-print the JSON.
# If the server is not answering, curl prints nothing and json.tool reports a parse error.
!curl -s http://127.0.0.1:8000/v1/models | python -m json.tool

## 3. Clone Repository

In [None]:
import os
import sys

# Handle both Colab and local/VS Code environments
if os.path.exists('/content'):
    # Running on Colab
    if not os.path.exists('/content/Socratic-RCM'):
        !git clone https://github.com/Baglecake/Socratic-RCM.git /content/Socratic-RCM
    else:
        !cd /content/Socratic-RCM && git pull
    %cd /content/Socratic-RCM
    repo_root = '/content/Socratic-RCM'
else:
    # Local or VS Code - assume we're in the repo already
    repo_root = os.getcwd()
    # If we're in notebooks/, go up one level
    if os.path.basename(repo_root) == 'notebooks':
        repo_root = os.path.dirname(repo_root)
        os.chdir(repo_root)

print(f"Working directory: {os.getcwd()}")

# Add paths
sys.path.insert(0, os.path.join(repo_root, 'social_rl'))
sys.path.insert(0, os.path.join(repo_root, 'agents'))
sys.path.insert(0, os.path.join(repo_root, 'local_rcm'))

# Verify social_rl exists
print(f"\nsocial_rl contents:")
!ls -la social_rl/

## 4. Create LLM Client

In [None]:
from openai import OpenAI
import os
import time

class VLLMClient:
    """LLM client for a vLLM server that exposes the OpenAI-compatible API.

    Attributes:
        client: OpenAI SDK client pointed at ``base_url``.
        model: Model name sent with every request.
        call_count: Number of chat completions that returned successfully.
    """

    def __init__(self, base_url="http://127.0.0.1:8000/v1", model="Qwen/Qwen2.5-7B-Instruct"):
        # The api_key value is only a placeholder; the local server is not given a key.
        self.client = OpenAI(api_key="not-needed", base_url=base_url)
        self.model = model
        self.call_count = 0

    def send_message(self, system_prompt: str, user_message: str, temperature: float = 0.7,
                     max_tokens: int = 512) -> str:
        """Send one system + user exchange and return the model's reply.

        An English-only instruction is appended to the system prompt on every call.

        Args:
            system_prompt: System message content.
            user_message: User message content.
            temperature: Sampling temperature passed to the API.
            max_tokens: Upper bound on generated tokens (defaults to 512).

        Returns:
            The text of the first choice, or an empty string if the API
            returned no content.

        Raises:
            Whatever the OpenAI SDK raises when the request fails; in that
            case ``call_count`` is left unchanged.
        """
        system_prompt += "\n\nIMPORTANT: Always respond in English only."
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        # Count only completed calls, so call_count > 0 means the backend has
        # answered at least once rather than merely having been tried.
        self.call_count += 1
        return response.choices[0].message.content or ""

class OpenAIClient:
    """LLM client that talks to the OpenAI API directly.

    Attributes:
        client: OpenAI SDK client (reads OPENAI_API_KEY from the environment).
        model: Model name sent with every request.
        call_count: Number of chat completions that returned successfully.
    """

    def __init__(self, model="gpt-4o-mini"):
        self.client = OpenAI()  # Uses OPENAI_API_KEY env var
        self.model = model
        self.call_count = 0

    def send_message(self, system_prompt: str, user_message: str, temperature: float = 0.7,
                     max_tokens: int = 512) -> str:
        """Send one system + user exchange and return the model's reply.

        Args:
            system_prompt: System message content, sent unchanged.
            user_message: User message content.
            temperature: Sampling temperature passed to the API.
            max_tokens: Upper bound on generated tokens (defaults to 512).

        Returns:
            The text of the first choice, or an empty string if the API
            returned no content.

        Raises:
            Whatever the OpenAI SDK raises when the request fails; in that
            case ``call_count`` is left unchanged.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        # Count only completed calls, so call_count > 0 means the backend has
        # answered at least once rather than merely having been tried.
        self.call_count += 1
        return response.choices[0].message.content or ""

# Try vLLM first, then OpenAI API.
#
# `llm` is bound only after a backend has actually answered a test prompt.
# A client that was constructed but never connected must not count as
# available: otherwise a failed vLLM attempt would skip the OpenAI fallback
# and still be reported as ready.
llm = None

VLLM_CONNECT_ATTEMPTS = 3
VLLM_RETRY_DELAY_S = 2  # brief pause so a still-loading server gets a chance

# Option 1: vLLM (local Colab server)
print("Trying vLLM...")
for attempt in range(1, VLLM_CONNECT_ATTEMPTS + 1):
    try:
        candidate = VLLMClient()
        candidate.send_message("You are helpful.", "Say 'ready'")
    except Exception as e:
        if attempt == VLLM_CONNECT_ATTEMPTS:
            print(f"✗ vLLM failed: {e}")
        else:
            time.sleep(VLLM_RETRY_DELAY_S)
    else:
        llm = candidate
        print("✓ vLLM connected!")
        break

# Option 2: OpenAI API (only if vLLM failed and a key is configured)
if llm is None and os.environ.get("OPENAI_API_KEY"):
    print("\nTrying OpenAI API...")
    try:
        candidate = OpenAIClient()
        candidate.send_message("You are helpful.", "Say 'ready'")
    except Exception as e:
        print(f"✗ OpenAI failed: {e}")
    else:
        llm = candidate
        print("✓ OpenAI API connected!")

# Stop here if nothing answered; later cells need a working `llm`.
if llm is None:
    raise RuntimeError("No LLM available. Start vLLM server or set OPENAI_API_KEY")

print(f"\n✓ LLM ready: {type(llm).__name__}")

## 5. Load Canvas & Create Social RL Runner

In [None]:
import json
import os
from runner import SocialRLRunner, SocialRLConfig

# Find state file - works from any directory
possible_paths = [
    'prar/outputs/2025-11-23_baseline_full_qwen/state.json',  # From repo root
    '/content/Socratic-RCM/prar/outputs/2025-11-23_baseline_full_qwen/state.json',  # Colab
    '../prar/outputs/2025-11-23_baseline_full_qwen/state.json',  # From notebooks dir
]

# First candidate that exists on disk, or None if none of them do
state_path = next((p for p in possible_paths if os.path.exists(p)), None)

if state_path is None:
    raise FileNotFoundError(f"Could not find state.json. Tried: {possible_paths}")

# JSON files are UTF-8; pass the encoding explicitly so that loading does not
# depend on the platform's default locale encoding (e.g. on Windows).
with open(state_path, 'r', encoding='utf-8') as f:
    canvas = json.load(f)['canvas']

# Short summary of what was loaded
print(f"Loaded from: {state_path}")
print(f"Framework: {canvas['project'].get('theoretical_option_label')}")
print(f"Agents: {[a['identifier'] for a in canvas['agents']]}")
print(f"Rounds: {len(canvas['rounds'])}")

In [None]:
# Create Social RL Runner with whichever LLM backend was selected above
config = SocialRLConfig(
    manifestation_mode="progressive",  # Options: static, progressive, reactive, adaptive
    use_prar_cues=True,
    prar_intensity="medium",
    use_coach_validation=False,  # Set True to enable Coach validation
    verbose=True
)

runner = SocialRLRunner(canvas, llm, config)
# Report the actual client class: `llm` may be the OpenAI fallback rather than vLLM.
print(f"\nSocialRLRunner created with {type(llm).__name__} backend!")

## 6. Execute Social RL Round 1

In [None]:
# Execute Round 1 and keep the result for the analysis cells below.
# max_turns=9 is budgeted as 3 agents x 3 turns each.
result1 = runner.execute_round(round_number=1, max_turns=9)  # 3 agents x 3 turns each
print(f"\nRound 1 complete: {len(result1.messages)} messages")

In [None]:
# Print the per-agent feedback signals collected in Round 1
banner = "=" * 60
print(banner, "ROUND 1 FEEDBACK ANALYSIS", banner, sep="\n")

for agent_id, fb in result1.feedback.items():
    # One header line per agent, followed by its feedback scores
    print(f"\n{agent_id}:")
    print(f"  Engagement: {fb.engagement:.2f}")
    print(f"  Theoretical Alignment: {fb.theoretical_alignment:.2f}")
    print(f"  Contribution Value: {fb.contribution_value:.2f}")
    # Duplicates are dropped via a set before printing
    unique_concepts = list(set(fb.concepts_embodied))
    print(f"  Concepts Embodied: {unique_concepts}")

## 7. Execute Round 2 (With Policy Adaptation)

In [None]:
# Run Round 2; per the notebook narrative, policies adapt from Round 1 feedback
divider = "=" * 60
print("\n" + divider)
print("ROUND 2 - With policy adaptation from Round 1 feedback")
print(divider)

result2 = runner.execute_round(round_number=2, max_turns=9)
print(f"\nRound 2 complete: {len(result2.messages)} messages")

# List the adaptations only when the round reported any
adaptations = result2.policy_adaptations
if adaptations:
    print("\nPolicy Adaptations Made:")
    for adapt in adaptations:
        print(f"  {adapt['agent_id']}: {adapt['deltas']}")

In [None]:
# Show how each feedback signal moved between Round 1 and Round 2
rule = "=" * 60
print(rule, "FEEDBACK DELTA (Round 1 -> Round 2)", rule, sep="\n")

comparison = runner.feedback_extractor.compare_rounds(1, 2)
for agent_id, deltas in comparison.items():
    print(f"\n{agent_id}:")
    for signal, delta in deltas.items():
        # Negative numbers already print their sign; only positives get an explicit "+"
        if delta > 0:
            direction = "+"
        else:
            direction = ""
        print(f"  {signal}: {direction}{delta:.3f}")

## 8. Full Simulation Report

In [None]:
# Generate comprehensive report
# print() is used so the report's newlines render as line breaks.
print(runner.generate_report())

In [None]:
# Results are auto-saved! Check the output directory
print(f"Results auto-saved to: {runner.output_dir}")
# IPython substitutes {runner.output_dir} into the shell command before running it.
# NOTE(review): the path is unquoted, so a directory name with spaces would break `ls`.
!ls -la {runner.output_dir}/

## 9. Examine Dynamic Context Injection

In [None]:
# Collect the Round 1 messages of a single agent to trace its context over turns
alice_messages = list(filter(lambda m: m.agent_id == "Worker+Alice", result1.messages))

rule = "=" * 60
print(rule, "ALICE'S CONTEXT EVOLUTION (Round 1)", rule, sep="\n")

for msg in alice_messages:
    print(f"\n--- Turn {msg.turn_number} ---")
    # Long text is cut to a preview; the "..." suffix is always appended
    print(f"Response: {msg.content[:200]}...")
    cue = msg.prar_cue_used
    if cue:
        print(f"\nPRAR Cue: {cue[:150]}...")
    snapshot = msg.feedback_snapshot
    if snapshot:
        print(f"Feedback at turn: {snapshot}")

## Key Insights

### What Makes This "Social RL"?

1. **No Explicit Reward Function**: Learning signals emerge from interaction
   - Being referenced = engagement signal
   - Embodying concepts = alignment signal
   - Inclusion in synthesis = contribution signal

2. **Process Retrieval as Policy**: PRAR guides HOW to reason, not WHAT to say
   - Policies adapt based on feedback deltas
   - No model weight updates needed

3. **Dynamic Context Injection**: Each turn gets fresh manifestations
   - Intensity scales: low → medium → high through the round
   - Context reacts to conversation dynamics

4. **Theoretical Grounding**: Framework prevents drift
   - Concepts constrain the possibility space
   - Analyst coding reinforces alignment