# Coverage Dashboard

Visualize documentation coverage and identify gaps in the knowledge base.

In [None]:
from _paths import *
import json
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML

## 1. Tier Distribution

How many events are at each verification tier?

In [None]:
# Load the discovered event schemas; the field-coverage cell iterates
# over `schemas` again.
schemas = load_discovered_schemas()

# Tally events per verification tier. The four known tiers are pre-seeded so
# each one gets a bar even when it has no events; a schema without a 'tier'
# key is counted as OBSERVED, and any unexpected tier gets its own entry.
tier_counts = dict.fromkeys(['THEORETICAL', 'OBSERVED', 'VERIFIED', 'CANONICAL'], 0)
for schema in schemas.values():
    tier = schema.get('tier', 'OBSERVED')
    tier_counts.setdefault(tier, 0)
    tier_counts[tier] += 1

# One bar per tier, in dict insertion order.
tiers, counts = list(tier_counts), list(tier_counts.values())
colors = ['#ff6b6b', '#ffd93d', '#6bcf7a', '#4dabf7']

fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.bar(tiers, counts, color=colors)
ax.set_title('Event Distribution by Verification Tier')
ax.set_ylabel('Number of Events')

# Write each bar's count just above its top edge, centred horizontally.
for bar, count in zip(bars, counts):
    x_mid = bar.get_x() + bar.get_width() / 2
    y_above = bar.get_height() + 0.5
    ax.text(x_mid, y_above, str(count),
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Headline numbers: total events and the share that reached CANONICAL.
# The guard avoids dividing by zero when no schemas were loaded.
total = sum(counts)
if total > 0:
    canonical_pct = tier_counts['CANONICAL'] / total * 100
else:
    canonical_pct = 0
print(f"\nTotal Events: {total}")
print(f"CANONICAL Coverage: {canonical_pct:.1f}%")

## 2. Field Coverage by Event

Which events have the most/least documented fields?

In [None]:
# Create dataframe of events
event_data = []
for event_name, schema in schemas.items():
    fields = schema.get('fields', {})
    documented = sum(1 for f in fields.values() if f.get('description'))
    event_data.append({
        'event': event_name,
        'tier': schema.get('tier', 'OBSERVED'),
        'total_fields': len(fields),
        'documented_fields': documented,
        'samples': schema.get('sample_count', 0)
    })

df = pd.DataFrame(event_data)
if not df.empty:
    df['coverage_pct'] = (df['documented_fields'] / df['total_fields'] * 100).fillna(0)
    df = df.sort_values('total_fields', ascending=False)
    
    print("Top 15 Events by Field Count:")
    display(df.head(15)[['event', 'tier', 'total_fields', 'documented_fields', 'coverage_pct', 'samples']])
else:
    print("No event data available.")

## 3. Sample Distribution

Which events have the most observations?

In [None]:
if not df.empty:
    # The 15 events with the highest sample counts, largest first.
    top_sampled = df.nlargest(15, 'samples')

    fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.barh(
        y=top_sampled['event'],
        width=top_sampled['samples'],
        color='steelblue',
    )
    ax.set_title('Top 15 Events by Sample Count')
    ax.set_xlabel('Sample Count')
    # Flip the y-axis so the most-sampled event is drawn at the top.
    ax.invert_yaxis()

    plt.tight_layout()
    plt.show()

    # Aggregate statistics over every event, not just the plotted top 15.
    sample_counts = df['samples']
    print(f"\nTotal samples across all events: {sample_counts.sum():,}")
    print(f"Median samples per event: {sample_counts.median():.0f}")

## 4. ChromaDB Collection Stats

In [None]:
# Only the ChromaDB calls live inside the `try`, so the "not available" hint
# is shown for connection/query failures only. Printing and plotting happen in
# the `else` branch; an error there surfaces as itself instead of being
# misreported as a ChromaDB problem.
try:
    client = get_chromadb_client()
    collections = client.list_collections()
    # One row per collection: its name and its document count.
    collection_data = [
        {'collection': coll.name, 'documents': coll.count()}
        for coll in collections
    ]
except Exception as e:
    # Broad on purpose: the exception types raised by the project helper and
    # the client are not visible from this notebook.
    print(f"ChromaDB not available: {e}")
    print("Run: cd rag-pipeline && python -m ingestion.ingest")
else:
    print("ChromaDB Collections:")
    print("=" * 40)
    for row in collection_data:
        print(f"  {row['collection']}: {row['documents']:,} documents")

    # Skip the chart entirely when there are no collections to show.
    if collection_data:
        fig, ax = plt.subplots(figsize=(8, 4))
        coll_df = pd.DataFrame(collection_data)
        ax.bar(coll_df['collection'], coll_df['documents'], color='coral')
        ax.set_ylabel('Documents')
        ax.set_title('Documents per Collection')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

## 5. Documentation Gaps

Events with high sample counts but not yet CANONICAL:

In [None]:
# Sample-count threshold for promotion candidates. It drives both the filter
# and the printed label so the two cannot drift apart: "100+" means 100 or
# more, so the comparison is inclusive.
MIN_SAMPLES_FOR_PROMOTION = 100

if not df.empty:
    # Well-sampled events that have not reached the CANONICAL tier yet,
    # most-sampled first.
    well_sampled = df['samples'] >= MIN_SAMPLES_FOR_PROMOTION
    not_canonical = df['tier'] != 'CANONICAL'
    gaps = df[well_sampled & not_canonical]
    gaps = gaps.sort_values('samples', ascending=False)

    print(f"Events with {MIN_SAMPLES_FOR_PROMOTION}+ samples not yet CANONICAL: {len(gaps)}\n")

    if not gaps.empty:
        print("Priority candidates for promotion:")
        display(gaps.head(10)[['event', 'tier', 'samples', 'total_fields']])
    else:
        print("All high-sample events are CANONICAL!")

## 6. Field Type Distribution

In [None]:
# Load the discovered fields; the Coverage Summary cell reads `fields` and
# `type_counts` again.
fields = load_discovered_fields()

# Tally how many fields report each type; a field with no 'type' key is
# counted under 'unknown'.
type_counts = {}
for field_info in fields.values():
    field_type = field_info.get('type', 'unknown')
    type_counts.setdefault(field_type, 0)
    type_counts[field_type] += 1

if not type_counts:
    print("No field data available.")
else:
    # Most common types first. The sort is stable, so ties keep the order in
    # which the types were first seen.
    sorted_types = sorted(type_counts.items(), key=lambda kv: kv[1], reverse=True)
    top_types, remaining_types = sorted_types[:8], sorted_types[8:]
    labels = [name for name, _ in top_types]
    sizes = [n for _, n in top_types]

    # Everything beyond the top 8 is folded into a single 'other' slice.
    other_count = sum(n for _, n in remaining_types)
    if other_count > 0:
        labels.append('other')
        sizes.append(other_count)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    ax.set_title('Field Type Distribution')
    plt.show()

    print(f"\nTotal field paths: {len(fields)}")
    print(f"Unique types: {len(type_counts)}")

## 7. Coverage Summary

In [None]:
# Imported before first use: the "Generated:" line below calls datetime.now(),
# which would raise NameError if this import only ran at the end of the cell.
from datetime import datetime

# Text summary of the numbers computed by the earlier cells. It relies on
# `tier_counts`, `fields`, `type_counts` and `df` already being defined.
print("KNOWLEDGE BASE COVERAGE REPORT")
print("=" * 50)
print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
print()

print("Events:")
# Computed once; the guard avoids dividing by zero when there are no events.
total_events = sum(tier_counts.values())
for tier, count in tier_counts.items():
    pct = count / total_events * 100 if total_events > 0 else 0
    print(f"  {tier}: {count} ({pct:.1f}%)")

print("\nFields:")
print(f"  Total paths: {len(fields)}")
print(f"  Unique types: {len(type_counts)}")

if not df.empty:
    print("\nSamples:")
    print(f"  Total: {df['samples'].sum():,}")
    print(f"  Median per event: {df['samples'].median():.0f}")

print("\nPromotion Queue:")
verified_count = tier_counts.get('VERIFIED', 0)
print(f"  VERIFIED (pending human approval): {verified_count}")

## Next Steps

- **02_canonical_review.ipynb** - Approve pending promotions
- **04_rl_bot_analysis.ipynb** - Analyze RL model performance