# VoidBloom / CrisisCore Enhanced Prototype Notebook

This notebook demonstrates the complete Hidden-Gem Scanner pipeline with:
- **Provenance Tracking**: Full lineage tracking of all artifacts and transformations
- **Glossary Generation**: Automatic documentation of technical terms and metrics
- **Feature Extraction**: Time series analysis and market data processing
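- **GemScore Calculation**: Weighted scoring with confidence metrics
- **Extended Backtest Metrics**: Information Coefficient (IC) analysis and risk-adjusted comparison against baseline strategies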

## 1. Setup and Imports

In [None]:
from datetime import datetime, timedelta, timezone

import pandas as pd

from src.core.features import MarketSnapshot
from src.core.safety import evaluate_contract, liquidity_guardrail
from src.core.provenance_tracking import complete_pipeline_tracked
from src.core.provenance import get_provenance_tracker, reset_provenance_tracker
from src.core.glossary import get_glossary

# Reset the provenance tracker so the demo starts clean.
reset_provenance_tracker()

# Status banner: imports resolved, tracker reset, glossary size reported.
print("✅ Imports complete")
print("📊 Provenance tracker ready")
glossary_size = len(get_glossary().terms)
print(f"📖 Glossary contains {glossary_size} terms")

## 2. Create Synthetic Market Data

In [None]:
# Build a synthetic, steadily rising hourly price series for the demo token.
# `datetime.utcnow()` is deprecated since Python 3.12, so take an aware UTC
# "now" and strip the tzinfo: the result is the same naive-UTC datetime the
# original produced, which keeps the series index and snapshot timestamp naive.
now = datetime.now(timezone.utc).replace(tzinfo=None)
n_points = 48  # one point per hour, oldest first
dates = [now - timedelta(hours=age) for age in range(n_points - 1, -1, -1)]
prices = pd.Series(
    data=[0.03 + 0.0002 * step for step in range(n_points)],
    index=pd.to_datetime(dates),
)

# Market snapshot taken at `now`, priced at the last point of the series.
snapshot = MarketSnapshot(
    symbol="VBLOOM",
    timestamp=now,
    price=float(prices.iloc[-1]),
    volume_24h=250000,
    liquidity_usd=180000,
    holders=4200,
    onchain_metrics={"active_wallets": 950, "net_inflows": 125000, "unlock_pressure": 0.2},
    narratives=["AI", "DeFi", "VoidBloom"],
)

# Contract safety report with every risk flag switched off.
contract_flags = {
    "honeypot": False,
    "owner_can_mint": False,
    "owner_can_withdraw": False,
    "unverified": False,
}
contract_report = evaluate_contract(contract_flags, severity="none")

# Echo the generated inputs.
print(f"📈 Generated {len(prices)} price points for {snapshot.symbol}")
print(f"💰 Current price: ${snapshot.price:.6f}")
print(f"💧 Liquidity: ${snapshot.liquidity_usd:,.0f}")
print(f"👥 Holders: {snapshot.holders:,}")
print(f"🛡️ Contract safety score: {contract_report.score:.2f}")

## 3. Execute Pipeline with Provenance Tracking

In [None]:
# Run the tracked pipeline on the synthetic inputs built in the previous cell.
pipeline_kwargs = dict(
    snapshot=snapshot,
    price_series=prices,
    narrative_embedding_score=0.72,
    contract_report=contract_report,
    data_source="synthetic_demo",
)
results = complete_pipeline_tracked(**pipeline_kwargs)

divider = "=" * 60
print(divider)
print("📊 ANALYSIS RESULTS")
print(divider)

# Headline numbers from the scoring result.
gem_result = results['result']
print(f"\n💎 GemScore: {gem_result.score:.2f}")
print(f"🎯 Confidence: {gem_result.confidence:.2f}%")
print(f"🚩 Flagged for Review: {results['flagged']}")

# Per-feature contributions, largest first.
print("\n📋 Feature Contributions:")
ranked_contributions = sorted(gem_result.contributions.items(), key=lambda pair: -pair[1])
for feature_name, weight in ranked_contributions:
    print(f"  - {feature_name}: {weight:.4f}")

# Raw debug key/value pairs returned by the pipeline.
print("\n🔍 Debug Info:")
for debug_key, debug_value in results['debug'].items():
    print(f"  - {debug_key}: {debug_value}")

## 4. Explore Artifact Provenance

In [None]:
# Fetch the tracker and the ID of the score artifact produced by the pipeline.
tracker = get_provenance_tracker()
score_id = results['provenance']['score_id']

divider = "=" * 60
print(divider)
print("🔍 ARTIFACT PROVENANCE")
print(divider)

# Describe the score artifact; the whole section is skipped when the lookup
# returns a falsy record.
score_record = tracker.get_record(score_id)
if score_record:
    artifact = score_record.artifact
    print(f"\n📦 Artifact: {artifact.name}")
    print(f"🆔 ID: {score_id[:16]}...")
    created_label = artifact.created_at.strftime('%Y-%m-%d %H:%M:%S UTC')
    print(f"📅 Created: {created_label}")
    print(f"🏷️ Tags: {', '.join(artifact.tags)}")

    print("\n🔄 Transformations Applied:")
    for step_no, step in enumerate(score_record.transformations, start=1):
        print(f"  {step_no}. {step.function_name} ({step.transformation_type.value})")
        print(f"     ⏱️ Duration: {step.duration_ms:.2f}ms")

    # Only the first five quality metrics are shown.
    print("\n📊 Quality Metrics:")
    first_metrics = list(score_record.quality_metrics.items())[:5]
    for metric_name, metric_value in first_metrics:
        print(f"  - {metric_name}: {metric_value:.4f}")

    if score_record.annotations:
        print("\n💬 Annotations:")
        for note in score_record.annotations:
            print(f"  - {note}")

# Walk the lineage of the score artifact, printing each artifact that resolves.
lineage = tracker.get_lineage(score_id)
print(f"\n🌳 Lineage Depth: {len(lineage)} artifacts")
print("📈 Artifact chain:")
for position, artifact_id in enumerate(lineage, start=1):
    lineage_record = tracker.get_record(artifact_id)
    if lineage_record:
        print(f"  {position}. {lineage_record.artifact.artifact_type.value}: {lineage_record.artifact.name}")

## 5. Visualize Lineage Graph

In [None]:
divider = "=" * 60

# Export the score artifact's lineage in Mermaid format.
mermaid_diagram = tracker.export_lineage_graph(score_id, format="mermaid")

print(divider)
print("🎨 LINEAGE DIAGRAM (Mermaid)")
print(divider)
print("\nCopy this to https://mermaid.live for visualization:\n")
print(mermaid_diagram)
print("\n" + divider)

# Tracker-wide statistics: headline counters first, then the per-type breakdown.
stats = tracker.get_statistics()
print("\n📊 PROVENANCE STATISTICS")
print(divider)
for label, stat_key in (
    ("Total Artifacts", 'total_artifacts'),
    ("Total Transformations", 'total_transformations'),
    ("Lineage Edges", 'lineage_edges'),
):
    print(f"{label}: {stats[stat_key]}")

print("\nArtifacts by Type:")
for type_name, type_count in stats['artifacts_by_type'].items():
    print(f"  - {type_name}: {type_count}")

## 6. Explore Technical Glossary

In [None]:
glossary = get_glossary()
divider = "=" * 60

print(divider)
print("📖 TECHNICAL GLOSSARY")
print(divider)

# Overall glossary counts, then the per-category breakdown.
glossary_stats = glossary.get_statistics()
print(f"\nTotal Terms: {glossary_stats['total_terms']}")
print(f"Categories: {glossary_stats['categories_count']}")
print("\nTerms by Category:")
for category_name, term_count in glossary_stats['category_breakdown'].items():
    print(f"  - {category_name}: {term_count}")

print("\n" + divider)
print("📚 SAMPLE TERMS")
print(divider)

# Print a card for each sample term; lookups that come back falsy are skipped.
key_terms = ["GemScore", "RSI", "ContractSafety", "Confidence"]
for term_name in key_terms:
    term = glossary.get_term(term_name)
    if not term:
        continue

    print(f"\n{term.term}")
    print("-" * 40)
    print(f"Category: {term.category.value}")
    print(f"Definition: {term.definition}")

    # Optional fields are printed only when populated.
    if term.formula:
        print(f"Formula: {term.formula}")
    if term.range:
        range_low, range_high = term.range[0], term.range[1]
        print(f"Range: [{range_low}, {range_high}]")
    if term.related_terms:
        related_preview = list(term.related_terms)[:3]
        print(f"Related: {', '.join(related_preview)}")

## 7. Search Glossary

In [None]:
divider = "=" * 60

# Text search: whatever the glossary returns for the query "risk".
search_results = glossary.search("risk")

print(divider)
print("🔍 SEARCH RESULTS: 'risk'")
print(divider)

# Each hit shows its name, category, and the first 100 characters of its definition.
for hit in search_results:
    print(f"\n✓ {hit.term} ({hit.category.value})")
    print(f"  {hit.definition[:100]}...")

# Category lookup: list the entries filed under RISK_FACTOR.
from src.core.glossary import TermCategory

risk_factors = glossary.get_by_category(TermCategory.RISK_FACTOR)
print("\n" + divider)
print("⚠️ ALL RISK FACTORS")
print(divider)

for risk_term in risk_factors:
    print(f"\n• {risk_term.term}")
    print(f"  {risk_term.definition}")
    if risk_term.range:
        print(f"  Range: [{risk_term.range[0]}, {risk_term.range[1]}]")

## 8. Export Documentation

In [None]:
from pathlib import Path

divider = "=" * 60

# Output directory for generated docs, relative to the working directory.
docs_dir = Path("../docs")
docs_dir.mkdir(exist_ok=True)

# Markdown export; the returned value is also used below for the size and preview.
glossary_path = docs_dir / "GLOSSARY.md"
glossary_markdown = glossary.export_markdown(
    output_path=glossary_path,
    include_toc=True,
    group_by_category=True,
)

print(divider)
print("📄 EXPORTED DOCUMENTATION")
print(divider)
print(f"\n✅ Glossary exported to: {glossary_path}")
print(f"📏 Document size: {len(glossary_markdown)} characters")

# JSON export for programmatic access.
glossary_json_path = docs_dir / "glossary.json"
glossary_json = glossary.export_json(output_path=glossary_json_path)
print(f"✅ JSON export to: {glossary_json_path}")

# Preview: first 500 characters of the exported markdown.
print("\n" + divider)
print("📄 GLOSSARY PREVIEW (first 500 chars)")
print(divider)
markdown_preview = glossary_markdown[:500]
print(markdown_preview + "...")

## 9. Extended Backtest Metrics (IC & Risk-Adjusted Performance)

In [None]:
import numpy as np
from backtest.extended_metrics import (
    calculate_extended_metrics,
    calculate_ic_metrics,
    format_ic_summary,
)

# Synthetic backtest inputs: 50 tokens, each with a predicted score and a
# realised return. Seeding the global NumPy stream makes the draws repeatable.
np.random.seed(42)

n_tokens = 50
signal_slope, noise_sigma = 0.05, 0.02

# Predicted scores (GemScores), drawn uniformly between 0.3 and 0.9.
predictions = np.random.uniform(0.3, 0.9, n_tokens)

# Realised returns: a linear function of the score plus Gaussian noise, so the
# two arrays are correlated by construction.
return_noise = np.random.normal(0, noise_sigma, n_tokens)
actual_returns = predictions * signal_slope + return_noise

# Create mock snapshots
class MockSnapshot:
    """Minimal stand-in for a token snapshot in the synthetic backtest.

    Attributes:
        token: Identifier supplied by the caller.
        features: Feature mapping supplied by the caller.
        future_return_7d: The ``future_return`` argument, stored under this name.
    """

    def __init__(self, token, features, future_return):
        # `future_return` is exposed as `future_return_7d` on the instance.
        self.token, self.features, self.future_return_7d = token, features, future_return

# One mock snapshot per synthetic return; features start out empty.
snapshots = [
    MockSnapshot(f"TOKEN{idx:02d}", {}, actual_returns[idx])
    for idx in range(50)
]

divider = "=" * 70
print(divider)
print("📊 SYNTHETIC BACKTEST DATA")
print(divider)

# Summary of the generated predictions and returns.
pred_low, pred_high = predictions.min(), predictions.max()
ret_low, ret_high = actual_returns.min(), actual_returns.max()
print(f"Total Snapshots: {len(snapshots)}")
print(f"Prediction Range: [{pred_low:.4f}, {pred_high:.4f}]")
print(f"Return Range: [{ret_low:.4f}, {ret_high:.4f}]")
print(f"Mean Return: {actual_returns.mean():.4f}")
print(f"Return Std Dev: {actual_returns.std():.4f}")

In [None]:
divider = "=" * 70

# Information Coefficient metrics for the predicted scores vs. realised returns.
ic_metrics = calculate_ic_metrics(predictions, actual_returns)

print(divider)
print("📈 INFORMATION COEFFICIENT ANALYSIS")
print(divider)
print(format_ic_summary(ic_metrics))

# Static reading guide printed beneath the computed numbers.
ic_guide = """
The Information Coefficient measures the correlation between predicted
scores and actual returns. Key insights:

1. **Pearson IC**: Measures linear correlation
   - IC > 0.05: Strong predictive power
   - IC > 0.02: Moderate predictive power
   - IC < 0.02: Weak predictive power

2. **Spearman IC**: Measures rank correlation (robust to outliers)
   - Useful when returns are skewed or have outliers

3. **Kendall's Tau**: Alternative rank correlation
   - More conservative than Spearman

4. **Hit Rate**: Percentage of correct direction predictions
   - > 55%: Better than random for direction
   - > 60%: Strong directional signal

5. **IC IR (Information Ratio)**: IC_mean / IC_std
   - Measures consistency of IC across periods
   - Higher is better (more stable predictions)
"""

print("\n" + divider)
print("🔍 IC INTERPRETATION")
print(divider)
print(ic_guide)

In [None]:
# Evaluation settings: judge the top 10 predictions, zero risk-free rate,
# 52 periods per year (weekly returns).
metric_settings = {
    "top_k": 10,
    "risk_free_rate": 0.0,
    "periods_per_year": 52,
}
extended_metrics = calculate_extended_metrics(
    snapshots=snapshots,
    predictions=predictions,
    **metric_settings,
)

divider = "=" * 70
print(divider)
print("💰 COMPREHENSIVE BACKTEST METRICS")
print(divider)
print(extended_metrics.summary_string())

In [None]:
# Compare with baseline strategies
from backtest.baseline_strategies import (
    RandomStrategy,
    CapWeightedStrategy,
    SimpleMomentumStrategy,
)
from backtest.extended_metrics import compare_extended_metrics

# Add market cap and momentum features to the snapshots for the baseline
# comparison. A dedicated seeded generator keeps this cell idempotent:
# re-running it reproduces the same features instead of pulling fresh values
# from the global NumPy stream (which also left that stream in a different
# state after every run).
feature_rng = np.random.default_rng(42)
for snap in snapshots:
    snap.features = {
        'MarketCap': feature_rng.uniform(100000, 10000000),
        'PriceChange7d': feature_rng.uniform(-0.1, 0.2),
    }

# Shared by selection and evaluation so the two cannot drift apart.
baseline_top_k = 10

baseline_strategies = [
    RandomStrategy(),
    CapWeightedStrategy(),
    SimpleMomentumStrategy(),
]

# Evaluate every baseline with the same settings used for the GemScore metrics.
baseline_metrics = {}
for strategy in baseline_strategies:
    # Assets picked by this baseline.
    selected = strategy.select_assets(snapshots, top_k=baseline_top_k, seed=42)

    # Baselines have no graded score: encode the pick as 1.0 (selected) / 0.0 (not).
    baseline_predictions = np.array([1.0 if snap in selected else 0.0 for snap in snapshots])

    baseline_metrics[strategy.get_name()] = calculate_extended_metrics(
        snapshots=snapshots,
        predictions=baseline_predictions,
        top_k=baseline_top_k,
        risk_free_rate=0.0,
        periods_per_year=52,
    )

# Compare GemScore to the baselines.
comparisons = compare_extended_metrics(extended_metrics, baseline_metrics)

print("=" * 70)
print("🎯 BASELINE COMPARISONS")
print("=" * 70)
for baseline_name, comparison in comparisons.items():
    print(f"\n{baseline_name.replace('_', ' ').title()}:")
    print(f"  IC Improvement:     {comparison['ic_improvement']:>8.4f}  {'✅' if comparison['ic_better'] else '❌'}")
    print(f"  Sharpe Improvement: {comparison['sharpe_improvement']:>8.4f}  {'✅' if comparison['sharpe_better'] else '❌'}")
    print(f"  Sortino Improvement:{comparison['sortino_improvement']:>8.4f}")
    print(f"  Return Improvement: {comparison['return_improvement']:>8.4f}")
    print(f"  Risk-Adjusted Better: {'✅ YES' if comparison['risk_adjusted_better'] else '❌ NO'}")

In [None]:
# Visualize IC distribution over multiple periods
import pandas as pd
import matplotlib.pyplot as plt

# Imported once, outside the loop, and by name: `from scipy import stats`
# inside the loop re-ran the import every iteration and rebound the notebook's
# `stats` variable (the tracker statistics dict from the lineage cell).
from scipy.stats import pearsonr

# Simulate an Information Coefficient for each of several independent periods.
np.random.seed(42)
n_periods = 20
assets_per_period = 30
period_ics = []

for period in range(n_periods):
    # Fresh synthetic predictions and correlated noisy returns for this period.
    period_preds = np.random.uniform(0.3, 0.9, assets_per_period)
    period_actuals = period_preds * 0.05 + np.random.normal(0, 0.025, assets_per_period)

    # Pearson correlation between predictions and returns is this period's IC.
    ic, _ = pearsonr(period_preds, period_actuals)
    period_ics.append(ic)

# Four-panel figure: IC over time, IC distribution, predictions vs. returns,
# and cumulative returns of the top-ranked assets.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax_time, ax_hist, ax_scatter, ax_cum = axes.ravel()

# Panel 1: IC per period, with reference lines at 0, 0.02 and 0.05.
ax_time.plot(range(1, n_periods + 1), period_ics, marker='o', linewidth=2)
ax_time.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax_time.axhline(y=0.02, color='orange', linestyle='--', alpha=0.5, label='Moderate IC')
ax_time.axhline(y=0.05, color='green', linestyle='--', alpha=0.5, label='Strong IC')
ax_time.set_xlabel('Period')
ax_time.set_ylabel('Information Coefficient')
ax_time.set_title('IC Over Time')
ax_time.legend()
ax_time.grid(alpha=0.3)

# Panel 2: histogram of the per-period ICs with the mean marked.
mean_ic = np.mean(period_ics)
ax_hist.hist(period_ics, bins=15, edgecolor='black', alpha=0.7)
ax_hist.axvline(x=mean_ic, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_ic:.4f}')
ax_hist.set_xlabel('Information Coefficient')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('IC Distribution')
ax_hist.legend()
ax_hist.grid(alpha=0.3)

# Panel 3: scatter of the first 30 synthetic tokens against the noise-free
# relationship used to generate the returns (return = 0.05 * score).
ax_scatter.scatter(predictions[:30], actual_returns[:30], alpha=0.6)
ax_scatter.plot([predictions.min(), predictions.max()],
                [predictions.min() * 0.05, predictions.max() * 0.05],
                'r--', label='Expected Relationship')
ax_scatter.set_xlabel('Predicted Score')
ax_scatter.set_ylabel('Actual Return')
ax_scatter.set_title('Predictions vs Actual Returns')
ax_scatter.legend()
ax_scatter.grid(alpha=0.3)

# Panel 4: cumulative return of the 30 highest-scored assets, best first.
# The original accumulated `actual_returns[:30]` in generation order, which
# contradicted the "sorted by GemScore" / "Top 30" labels on this panel.
top_n = 30
ranked_idx = np.argsort(predictions)[::-1][:top_n]
cumulative_returns = np.cumsum(actual_returns[ranked_idx])
rank_axis = range(1, len(cumulative_returns) + 1)
ax_cum.plot(rank_axis, cumulative_returns, linewidth=2)
ax_cum.fill_between(rank_axis, 0, cumulative_returns, alpha=0.3)
ax_cum.set_xlabel('Asset Index (sorted by GemScore)')
ax_cum.set_ylabel('Cumulative Return')
ax_cum.set_title('Cumulative Returns (Top 30)')
ax_cum.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../docs/ic_analysis.png', dpi=150, bbox_inches='tight')
print("📊 Visualization saved to: ../docs/ic_analysis.png")
plt.show()

# Summary statistics over the simulated per-period ICs.
ic_mean = np.mean(period_ics)
ic_std = np.std(period_ics)
positive_count = sum(ic > 0 for ic in period_ics)

print("\n" + "=" * 70)
print("📊 IC STATISTICS (Multi-Period)")
print("=" * 70)
print(f"Mean IC:        {ic_mean:>8.4f}")
print(f"Median IC:      {np.median(period_ics):>8.4f}")
print(f"Std Dev IC:     {ic_std:>8.4f}")
# IC IR is the mean IC divided by its standard deviation.
print(f"IC IR:          {ic_mean / ic_std:>8.4f}")
print(f"Min IC:         {np.min(period_ics):>8.4f}")
print(f"Max IC:         {np.max(period_ics):>8.4f}")
print(f"Positive ICs:   {positive_count}/{n_periods} ({100 * positive_count / n_periods:.1f}%)")