# Semantic Gravity Mapping (SGM)

This notebook implements a system to map LLM semantic structure by generating word association graphs.

**What it does:**
1. **Phase 1 - Seed & Crawl**: Generate association graph via BFS from 100 seed concepts
2. **Phase 2 - Logprob Scoring**: Weight edges using logprob extraction
3. **Phase 3 - Topology Analysis**: Analyze hubs, convergence, islands, and asymmetry

**Expected Runtime:** 45-85 minutes on Colab T4 GPU

---

## Cell 1: Environment Setup

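Detect the environment (Colab vs. local), check the GPU, install dependencies, mount Google Drive, and clone the project repository. Before running on Colab, replace `YOUR_USERNAME` in the clone URL with the actual repository owner.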

In [None]:
import os
import sys
from pathlib import Path

# Detect if running in Colab
IN_COLAB = 'google.colab' in sys.modules

print(f"Running in: {'Google Colab' if IN_COLAB else 'Local Environment'}")

if IN_COLAB:
    print("\n=== Setting up Colab Environment ===")
    
    # Check GPU
    print("\n1. Checking GPU...")
    !nvidia-smi --query-gpu=name,memory.total --format=csv
    
    # Install dependencies
    print("\n2. Installing dependencies (this may take 5-10 minutes)...")
    !pip install -q vllm networkx tqdm matplotlib seaborn
    
    # Mount Google Drive
    print("\n3. Mounting Google Drive...")
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Clone repo if not already present
    if not Path('/content/align_prompts').exists():
        print("\n4. Cloning repository...")
        !git clone https://github.com/YOUR_USERNAME/align_prompts.git /content/align_prompts
    
    # Add to path
    sys.path.insert(0, '/content/align_prompts')
    
    # Create checkpoint and output directories
    !mkdir -p /content/drive/MyDrive/sgm_checkpoints
    !mkdir -p /content/drive/MyDrive/sgm_outputs
    
    print("\n‚úÖ Colab environment ready!")
    
else:
    print("\nLocal environment detected. Make sure you have:")
    print("  - vllm installed: pip install vllm")
    print("  - networkx installed: pip install networkx")
    print("  - tqdm installed: pip install tqdm")
    print("  - vLLM server running on localhost:8000")

## Cell 2: Configuration

Set model, paths, and hyperparameters.

In [None]:
# Configuration
# Every tunable for the run lives in this one mapping. Checkpoints and
# outputs go to Google Drive under Colab and to ./data/sgm locally.
CONFIG = dict(
    # Model settings
    model='google/gemma-3-4b-it',                  # Model to use
    vllm_base_url='http://localhost:8000/v1',      # vLLM server URL

    # Graph generation settings
    max_hops=3,                                    # BFS depth (3 = ~15k edges)
    associations_per_word=5,                       # Associations per word

    # Checkpoint settings
    checkpoint_dir=(
        '/content/drive/MyDrive/sgm_checkpoints' if IN_COLAB
        else './data/sgm/checkpoints'
    ),
    output_dir=(
        '/content/drive/MyDrive/sgm_outputs' if IN_COLAB
        else './data/sgm/graphs'
    ),

    # Optimization settings
    temperature_associations=0.7,                  # Temperature for Phase 1
    temperature_scoring=0.0,                       # Temperature for Phase 2 (deterministic)
    batch_size=32,                                 # Concurrent requests

    # Resume settings
    resume=True,                                   # Resume from checkpoint if available
)

# Echo the effective settings so they are recorded in the notebook output.
print("Configuration:")
for name, setting in CONFIG.items():
    print(f"  {name}: {setting}")

## Cell 3: Start vLLM Server (Colab Only)

Launch the vLLM server in the background on the GPU and wait until its `/health` endpoint responds.

In [None]:
if IN_COLAB:
    # Colab only: launch an OpenAI-compatible vLLM server as a detached
    # background process, then poll its health endpoint until it is ready.
    import subprocess
    import sys
    import time
    from urllib.parse import urlsplit

    import requests

    # Derive the port and health URL from CONFIG so the launcher and the
    # client created later agree on where the server lives.
    server_url = urlsplit(CONFIG['vllm_base_url'])
    server_port = server_url.port or 8000
    health_url = f"{server_url.scheme}://{server_url.netloc}/health"

    log_path = '/tmp/vllm_server.log'
    startup_timeout_s = 600   # model download + load can exceed a few minutes
    poll_interval_s = 2
    report_every_s = 20

    def _server_is_healthy():
        """Return True when the health endpoint answers with HTTP 200."""
        try:
            return requests.get(health_url, timeout=5).status_code == 200
        except requests.exceptions.RequestException:
            # Connection refused / timed out: the server is not up (yet).
            return False

    if _server_is_healthy():
        # Re-running this cell must not start a second server on the same
        # port and GPU.
        print("\n✅ vLLM server is already running - skipping launch.")
    else:
        print("Starting vLLM server in background...")
        print(f"Model: {CONFIG['model']}")
        print("\nThis may take 2-3 minutes to download and load the model.")

        # NOTE(review): on GPUs without bfloat16 support (e.g. a T4) vLLM may
        # require an explicit --dtype flag for this model; check the server
        # log if startup fails - TODO confirm.
        with open(log_path, 'w') as log_file:
            server_process = subprocess.Popen(
                [
                    sys.executable, '-m', 'vllm.entrypoints.openai.api_server',
                    '--model', CONFIG['model'],
                    '--gpu-memory-utilization', '0.9',
                    '--max-model-len', '2048',
                    '--port', str(server_port),
                ],
                stdout=log_file,
                stderr=subprocess.STDOUT,
                start_new_session=True,  # detach from the kernel's process group
            )

        # Wait for server to be ready
        print("\nWaiting for vLLM server to start...")
        wait_started = time.monotonic()
        deadline = wait_started + startup_timeout_s
        next_report_s = 0
        while time.monotonic() < deadline:
            if _server_is_healthy():
                print("\n✅ vLLM server is ready!")
                break
            if server_process.poll() is not None:
                # The process died; waiting for the full timeout is pointless.
                print(f"\n⚠️ vLLM server exited with code {server_process.returncode}. "
                      f"Check logs: !tail {log_path}")
                break
            waited_s = time.monotonic() - wait_started
            if waited_s >= next_report_s:
                print(f"  Still waiting... ({waited_s:.0f}s)")
                next_report_s += report_every_s
            time.sleep(poll_interval_s)
        else:
            print("\n⚠️ Server didn't respond in time. Check logs: !tail /tmp/vllm_server.log")

else:
    print("Local environment - assuming vLLM server is already running.")
    print(f"Make sure vLLM is serving {CONFIG['model']} on {CONFIG['vllm_base_url']}")

## Cell 4: Initialize Components

Create engine, checkpoint manager, and check for existing checkpoints.

In [None]:
from align_test.core.vllm_client import VLLMClient
from align_test.sgm.inference.batch_inference import SGMInferenceEngine
from align_test.sgm.storage.checkpoint_manager import CheckpointManager
from align_test.sgm.models.seed_domains import get_all_seeds, get_domain_names

# Initialize vLLM client
print("Initializing components...\n")

vllm_client = VLLMClient(
    base_url=CONFIG['vllm_base_url'],
    model=CONFIG['model']
)

print(f"✓ VLLMClient: {vllm_client}")

# Initialize inference engine (starts at the Phase 1 sampling temperature)
inference_engine = SGMInferenceEngine(
    client=vllm_client,
    temperature=CONFIG['temperature_associations'],
    batch_size=CONFIG['batch_size']
)

print(f"✓ SGMInferenceEngine: {inference_engine}")

# Initialize checkpoint manager
checkpoint_manager = CheckpointManager(
    checkpoint_dir=CONFIG['checkpoint_dir'],
    config=CONFIG
)

print(f"✓ CheckpointManager: {checkpoint_manager.checkpoint_dir}")

# Check for existing checkpoints
resume_info = checkpoint_manager.get_resume_info()

if resume_info:
    print(f"\n📁 Found checkpoint: Phase {resume_info['phase']}, Iteration {resume_info['iteration']}")
    print(f"   Timestamp: {resume_info['timestamp']}")
    print(f"   Can resume from: {resume_info['checkpoint_file']}")
    # A checkpoint is only used when resuming is switched on in CONFIG.
    print(f"   Resume enabled: {CONFIG['resume']}")
else:
    print("\n📁 No existing checkpoints found - starting fresh")

# Display seed information
seeds = get_all_seeds()
domains = get_domain_names()

print(f"\n🌱 Seeds: {len(seeds)} concepts across {len(domains)} domains")
print(f"   Domains: {', '.join(domains)}")
print(f"   Sample seeds: {', '.join(seeds[:5])}...")

## Cell 5: Phase 1 - Seed & Crawl

Generate association graph via BFS expansion. This will take 30-60 minutes on T4 GPU.

In [None]:
import time

from align_test.sgm.core.graph_builder import GraphBuilder

print("=" * 70)
print("PHASE 1: SEED & CRAWL - BFS Graph Generation")
print("=" * 70)

# Phase 2 lowers the temperature on this same engine object. Set the Phase 1
# value explicitly so re-running this cell after Phase 2 still samples with
# the intended temperature.
inference_engine.temperature = CONFIG['temperature_associations']

# Initialize graph builder
graph_builder = GraphBuilder(
    engine=inference_engine,
    checkpoint_manager=checkpoint_manager,
    checkpoint_interval=500
)

# Build graph (perf_counter is monotonic, so the duration cannot be skewed by
# wall-clock adjustments during a long run).
start_time = time.perf_counter()

raw_graph = graph_builder.build_graph(
    max_hops=CONFIG['max_hops'],
    associations_per_word=CONFIG['associations_per_word'],
    resume=CONFIG['resume']
)

elapsed_time = time.perf_counter() - start_time

print(f"\n⏱️  Phase 1 completed in {elapsed_time/60:.1f} minutes")
print(f"\n📊 Final Graph:")
print(f"   Nodes: {raw_graph.number_of_nodes():,}")
print(f"   Edges: {raw_graph.number_of_edges():,}")

# Get statistics
stats = graph_builder.get_statistics()
print(f"\n📈 Statistics:")
print(f"   Visited nodes: {stats['num_visited']:,}")
print(f"   Avg out-degree: {stats['avg_out_degree']:.2f}")
print(f"   Max out-degree: {stats['max_out_degree']}")

## Cell 6: Phase 1 Results Preview

Visualize sample associations and graph structure.

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx

print("=" * 70)
print("PHASE 1: RESULTS PREVIEW")
print("=" * 70)

# Show sample paths
print("\n🔍 Sample Association Paths:")
graph_builder.preview_sample_paths(n=5)

# Analyze hop distribution: count edges per 'hop' attribute (missing -> 0).
hop_counts = Counter(
    data.get('hop', 0) for _, _, data in raw_graph.edges(data=True)
)
hops = sorted(hop_counts)
total_edges = raw_graph.number_of_edges()  # computed once, not per hop

print("\n📊 Edge Distribution by Hop:")
for hop in hops:
    count = hop_counts[hop]
    print(f"   Hop {hop}: {count:,} edges ({count/total_edges*100:.1f}%)")

# Visualize hop distribution
if hops:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(hops, [hop_counts[hop] for hop in hops])
    ax.set_xticks(hops)  # assumes hop values are integers (the default is 0)
    ax.set_xlabel('Hop')
    ax.set_ylabel('Number of Edges')
    ax.set_title('Edge Distribution by BFS Hop')
    ax.grid(True, alpha=0.3)
    plt.show()
else:
    print("   (graph has no edges - nothing to plot)")

# Show top nodes by degree
degree_dict = dict(raw_graph.in_degree())
top_nodes = sorted(degree_dict.items(), key=lambda x: x[1], reverse=True)[:10]

print("\n🎯 Top 10 Nodes by In-Degree (most associated with):")
for i, (node, degree) in enumerate(top_nodes, 1):
    print(f"   {i:2d}. {node:20s} (degree: {degree})")

## Cell 7: Phase 2 - Logprob Scoring

Score edge weights using logprob extraction. This will take 10-20 minutes.

In [None]:
import time

from align_test.sgm.core.logprob_scorer import LogprobScorer

print("=" * 70)
print("PHASE 2: LOGPROB SCORING - Edge Weight Assignment")
print("=" * 70)

# Scoring runs on the same engine object as Phase 1. Switch it to the
# deterministic scoring temperature only for the duration of this phase and
# restore the previous value afterwards, so the change does not leak into
# other cells that reuse the engine.
previous_temperature = inference_engine.temperature
inference_engine.temperature = CONFIG['temperature_scoring']

try:
    # Initialize scorer
    logprob_scorer = LogprobScorer(
        engine=inference_engine,
        checkpoint_manager=checkpoint_manager,
        checkpoint_interval=2000
    )

    # Score all edges
    start_time = time.perf_counter()

    weighted_graph = logprob_scorer.score_all_edges(
        graph=raw_graph,
        resume=CONFIG['resume'],
        show_progress=True
    )

    elapsed_time = time.perf_counter() - start_time
finally:
    inference_engine.temperature = previous_temperature

print(f"\n⏱️  Phase 2 completed in {elapsed_time/60:.1f} minutes")

# Get weight statistics
weight_stats = logprob_scorer.get_weight_statistics(weighted_graph)

print(f"\n📊 Weight Statistics:")
print(f"   Mean: {weight_stats['mean_weight']:.4f}")
print(f"   Median: {weight_stats['median_weight']:.4f}")
print(f"   Min: {weight_stats['min_weight']:.4f}")
print(f"   Max: {weight_stats['max_weight']:.4f}")
print(f"   Std: {weight_stats['std_weight']:.4f}")
print(f"   Scored edges: {weight_stats['num_scored']:,}")

## Cell 8: Phase 2 Results Preview

Analyze strongest and weakest associations.

In [None]:
def _print_ranked_edges(edges):
    """Print (source, target, weight) triples as a numbered, aligned list."""
    for rank, (source, target, weight) in enumerate(edges, 1):
        print(f"   {rank:2d}. {source:15s} → {target:15s} (weight: {weight:.4f})")


print("=" * 70)
print("PHASE 2: RESULTS PREVIEW")
print("=" * 70)

# Show top edges (strongest associations)
top_edges = logprob_scorer.get_top_edges(weighted_graph, n=10, sort_by='weight')

print("\n💪 Top 10 Strongest Associations (by weight):")
_print_ranked_edges(top_edges)

# Show bottom edges (weakest associations)
bottom_edges = logprob_scorer.get_bottom_edges(weighted_graph, n=10, sort_by='weight')

print("\n🔻 Top 10 Weakest Associations (by weight):")
_print_ranked_edges(bottom_edges)

# Visualize weight distribution
print("\n📊 Weight Distribution:")
logprob_scorer.visualize_weight_distribution(weighted_graph)

## Cell 9: Phase 3 - Topology Analysis

Analyze graph topology to find hubs, convergence patterns, islands, and asymmetries.

In [None]:
import time
from pathlib import Path

from align_test.sgm.core.topology_analyzer import TopologyAnalyzer

print("=" * 70)
print("PHASE 3: TOPOLOGY ANALYSIS")
print("=" * 70)

# Initialize analyzer
analyzer = TopologyAnalyzer(weighted_graph)

# Run all analyses
start_time = time.perf_counter()

results = analyzer.analyze_all()

elapsed_time = time.perf_counter() - start_time

print(f"\n⏱️  Phase 3 completed in {elapsed_time:.1f} seconds")

# Print summary
analyzer.print_summary(results)

# Export results to JSON. The output directory is only pre-created by the
# Colab setup cell, so make sure it exists for local runs as well.
output_dir = Path(CONFIG['output_dir'])
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / 'topology_metrics.json'
analyzer.export_results(results, str(output_path))

# Save final graph
# NOTE(review): checkpoint_manager was constructed with checkpoint_dir only;
# the paths printed below presume save_graph writes into CONFIG['output_dir']
# - confirm against CheckpointManager.save_graph.
print("\n💾 Saving final graph...")
checkpoint_manager.save_graph(
    graph=weighted_graph,
    filename='semantic_graph_final',
    include_metadata=True
)
print(f"   Graph saved to: {CONFIG['output_dir']}/semantic_graph_final.gpickle")
print(f"   Edge list (CSV): {CONFIG['output_dir']}/semantic_graph_final.csv")

## Cell 10: Results Summary & Export

Final summary and download instructions.

In [None]:
from pathlib import Path

print("=" * 70)
print("🎉 SEMANTIC GRAVITY MAPPING - COMPLETE!")
print("=" * 70)

# Summary of findings
print("\n📝 Key Findings:")
print("\n1. Semantic Attractors (Hubs):")
top_hubs = results['hubs'][:5]
if top_hubs:
    for hub in top_hubs:
        print(f"   • {hub['word']} (PageRank: {hub['pagerank']:.4f})")
else:
    print("   • No hubs found")

print("\n2. Convergence Analysis:")
conv = results['convergence']
print(f"   • Overall avg hops to hubs: {conv['overall_avg_hops']:.2f}")
if conv['by_domain']:
    fastest_domain = min(conv['by_domain'].items(), key=lambda x: x[1]['avg_hops'])
    print(f"   • Fastest converging domain: {fastest_domain[0]} ({fastest_domain[1]['avg_hops']:.2f} hops)")
else:
    # min() on an empty mapping would raise ValueError.
    print("   • No per-domain convergence data available")

print("\n3. Isolated Domains (Islands):")
if results['islands']:
    for island in results['islands'][:3]:
        print(f"   • Size {island['size']}: {', '.join(island['words'][:3])}...")
else:
    print("   • No isolated clusters found")

print("\n4. Asymmetric Associations (Narrative Bias):")
if results['asymmetry']:
    for pair in results['asymmetry'][:3]:
        print(f"   • {pair['source']} → {pair['target']}: {pair['asymmetry']:.3f}")
else:
    print("   • No asymmetric pairs found")

# Output files
print("\n\n📦 Output Files:")
print(f"   • Graph (pickle): {CONFIG['output_dir']}/semantic_graph_final.gpickle")
print(f"   • Edge list (CSV): {CONFIG['output_dir']}/semantic_graph_final.csv")
print(f"   • Metrics (JSON): {CONFIG['output_dir']}/topology_metrics.json")
print(f"   • Checkpoints: {CONFIG['checkpoint_dir']}/")

if IN_COLAB:
    print("\n💡 Next Steps:")
    print("   1. Download files from Google Drive: sgm_outputs/")
    print("   2. Import CSV to Gephi for visualization")
    print("   3. Analyze metrics JSON for detailed insights")

    # Offer to download
    from google.colab import files

    download = input("\nDownload results now? (y/n): ")
    if download.lower() == 'y':
        print("Downloading...")
        missing_files = []
        for result_file in (
            f"{CONFIG['output_dir']}/semantic_graph_final.csv",
            f"{CONFIG['output_dir']}/topology_metrics.json",
        ):
            # Skip files that were never written instead of failing mid-way.
            if Path(result_file).exists():
                files.download(result_file)
            else:
                missing_files.append(result_file)
        if missing_files:
            for missing_file in missing_files:
                print(f"⚠️ Not found, skipped: {missing_file}")
        else:
            print("✅ Download complete!")

print("\n" + "=" * 70)
print("Thank you for using Semantic Gravity Mapping!")
print("=" * 70)