In [2]:
# Install required packages
%pip install yfiles_jupyter_graphs --quiet
import json
import os
import math
import random
import re
from typing import Dict, Any
from yfiles_jupyter_graphs import GraphWidget

# Function to find the most recent knowledge graph file
def find_latest_kg_file():
    """Return the path of the most recently modified knowledge graph file.

    Looks in ``cia_ufo_output/`` (relative to the current working directory)
    for files matching ``ufo_*kg_*.json``.

    Returns:
        The matching path with the newest modification time, or ``None``
        (after printing a notice) when nothing matches.
    """
    import glob

    # Look for knowledge graph files
    kg_files = glob.glob("cia_ufo_output/ufo_*kg_*.json")
    if not kg_files:
        print("No knowledge graph files found!")
        return None

    # Only the newest file is needed, so take the maximum by modification
    # time instead of sorting the whole list.
    return max(kg_files, key=os.path.getmtime)

# Load the knowledge graph
def load_knowledge_graph(filepath=None):
    """Read a knowledge graph JSON file and return the parsed data.

    When ``filepath`` is falsy, the file reported by
    ``find_latest_kg_file()`` is used; ``None`` is returned if that lookup
    yields nothing. The path and the entity/relationship counts are printed.
    """
    path = filepath or find_latest_kg_file()
    if not path:
        return None

    print(f"Loading knowledge graph from {path}...")
    with open(path, 'r', encoding='utf-8') as handle:
        graph = json.load(handle)

    entity_count = len(graph.get('entities', []))
    relationship_count = len(graph.get('relationships', []))
    print(f"Loaded knowledge graph with {entity_count} entities and {relationship_count} relationships")
    return graph

# Clean and normalize entity names
def normalize_entity_name(name):
    """Clean and normalize an entity name.

    Collapses every run of whitespace (including line breaks) into a single
    space, then removes punctuation and spaces from both ends.

    Args:
        name: The raw entity name. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned name.
    """
    if not name:
        return name

    # Replace linebreaks and multiple spaces with a single space
    name = re.sub(r'\s+', ' ', name)

    # Strip spaces together with the punctuation. Doing it in two separate
    # passes leaves punctuation behind whenever whitespace sits outside it
    # (e.g. " .Roswell. " would keep both periods).
    return name.strip(' .,;:()[]{}"\'-')

# Get entity color based on type
def get_entity_color(entity_type):
    """Return the hex color used for an entity type.

    Args:
        entity_type: Entity type name, matched case-insensitively. Values
            that are not strings (for example ``None`` from a JSON null)
            are treated as unknown instead of raising.

    Returns:
        A ``#rrggbb`` color string; gray (``#cccccc``) for unknown types.
    """
    default_color = "#cccccc"  # Gray
    color_map = {
        "document": "#1f78b4",    # Blue
        "person": "#33a02c",      # Green
        "location": "#e31a1c",    # Red
        "time": "#ff7f00",        # Orange
        "organization": "#6a3d9a", # Purple
        "concept": "#fdbf6f",     # Light orange
        "collection": "#a6cee3",  # Light blue
        "file": "#cab2d6",        # Light purple
        "unknown": default_color,
    }
    if not isinstance(entity_type, str):
        return default_color
    return color_map.get(entity_type.lower(), default_color)

# Clean and deduplicate the knowledge graph
def clean_knowledge_graph(kg_data):
    """Clean and deduplicate entities and relationships in a knowledge graph.

    Entity names are normalized with ``normalize_entity_name`` (the entity
    dicts inside ``kg_data`` are updated in place). Entities that share the
    same case-insensitive name and type are merged into the first occurrence.
    Relationships are re-pointed at the surviving entity IDs, dropped when an
    endpoint does not exist, and collapsed when the same
    (source, target, type) combination repeats.

    Args:
        kg_data: Dict with an ``"entities"`` list and, optionally, a
            ``"relationships"`` list.

    Returns:
        ``kg_data`` itself when it is falsy or has no entities; otherwise a
        new dict holding only the ``"entities"`` and ``"relationships"`` keys.
    """
    if not kg_data or not kg_data.get("entities"):
        return kg_data

    # Create a mapping of normalized names to original entities
    normalized_entities = {}
    entity_id_mapping = {}  # Maps old IDs to new IDs

    # First pass: normalize entity names and find duplicates
    for entity in kg_data["entities"]:
        entity_id = entity.get("id", "unknown")

        # An explicit null name/type would otherwise crash on .lower() below,
        # so treat it the same way as a missing key.
        entity_name = entity.get("name", entity_id)
        if entity_name is None:
            entity_name = entity_id
        entity_type = entity.get("type", "unknown")
        if entity_type is None:
            entity_type = "unknown"

        # Normalize the name
        normalized_name = normalize_entity_name(entity_name)
        entity["name"] = normalized_name  # Update the entity with the normalized name

        # Create a key combining name and type to handle same name, different type
        key = f"{(normalized_name or '').lower()}||{str(entity_type).lower()}"

        if key in normalized_entities:
            # This is a duplicate, record the mapping
            entity_id_mapping[entity_id] = normalized_entities[key]["id"]
        else:
            # This is a new entity
            normalized_entities[key] = entity

    # Create a new deduplicated entities list
    new_entities = list(normalized_entities.values())

    # Build the set of surviving IDs once so each endpoint check is a
    # constant-time lookup instead of a scan over every entity. Entities
    # without an "id" key cannot be referenced and are left out of the set.
    surviving_ids = {e["id"] for e in new_entities if "id" in e}

    # Re-point relationships at the surviving IDs and drop repeats
    unique_relationships = []
    relationship_keys = set()
    for relationship in kg_data.get("relationships", []):
        # Map source and target to new IDs if they were duplicates
        source = relationship.get("source")
        target = relationship.get("target")
        new_source = entity_id_mapping.get(source, source)
        new_target = entity_id_mapping.get(target, target)

        # Only include relationships where both source and target still exist
        if new_source not in surviving_ids or new_target not in surviving_ids:
            continue

        rel_key = f"{new_source}||{new_target}||{relationship.get('type', '')}"
        if rel_key in relationship_keys:
            continue
        relationship_keys.add(rel_key)

        # Create a new relationship with the updated IDs
        new_rel = relationship.copy()
        new_rel["source"] = new_source
        new_rel["target"] = new_target
        unique_relationships.append(new_rel)

    # Create the new cleaned knowledge graph
    cleaned_kg = {
        "entities": new_entities,
        "relationships": unique_relationships
    }

    print(f"Cleaned knowledge graph: {len(new_entities)} entities and {len(unique_relationships)} relationships")
    print(f"Removed {len(kg_data.get('entities', [])) - len(new_entities)} duplicate entities")
    print(f"Removed {len(kg_data.get('relationships', [])) - len(unique_relationships)} duplicate or invalid relationships")

    return cleaned_kg

# Convert knowledge graph to yFiles format
def prepare_visualization_data(kg_data):
    """Convert knowledge graph data into node and edge lists for yFiles.

    The graph is first passed through ``clean_knowledge_graph``. Every
    entity becomes a node dict (``id`` and ``properties``) and every
    relationship becomes an edge dict (``id``, ``start``, ``end`` and
    ``properties``). Relationships with a missing endpoint and
    self-references are skipped.

    Args:
        kg_data: Knowledge graph dict with ``"entities"`` and
            ``"relationships"`` lists.

    Returns:
        A ``(nodes, edges)`` tuple of lists; both are empty when there is
        no graph data.
    """
    # Clean the knowledge graph; fall back to an empty graph when cleaning
    # hands back None so the .get() calls below stay valid.
    cleaned_kg = clean_knowledge_graph(kg_data) or {}

    # Prepare nodes
    nodes = []
    for entity in cleaned_kg.get("entities", []):
        entity_id = entity.get("id", "unknown")
        entity_name = entity.get("name", entity_id)
        entity_type = entity.get("type", "unknown")

        # "attributes" may be absent or null; either way treat it as empty.
        properties = {
            "label": entity_name,
            "type": entity_type,
            **(entity.get("attributes") or {}),
        }
        # The mapping functions read "label" and "type" from the properties,
        # so an attribute that happens to share one of those names must not
        # replace the entity's own values.
        properties["label"] = entity_name
        properties["type"] = entity_type

        # Add node
        nodes.append({"id": entity_id, "properties": properties})

    # Prepare edges
    edges = []
    for i, rel in enumerate(cleaned_kg.get("relationships", [])):
        source = rel.get("source")
        target = rel.get("target")
        rel_type = rel.get("type", "related_to")

        # Skip relationships with missing source or target
        if not source or not target:
            continue

        # Skip self-references
        if source == target:
            continue

        properties = {
            "label": rel_type,
            **(rel.get("attributes") or {}),
        }
        # Keep the relationship type as the edge label even when an
        # attribute is also named "label".
        properties["label"] = rel_type

        # Add edge
        edges.append({
            "id": f"rel_{i}",
            "start": source,
            "end": target,
            "properties": properties,
        })

    return nodes, edges

# Node mapping functions
def node_color_mapping(node):
    """Choose a node's color from the entity type stored in its properties."""
    properties = node.get("properties", {})
    return get_entity_color(properties.get("type", "unknown"))

def node_label_mapping(node):
    """Return the node's ``label`` property, or its id as a string if unset."""
    fallback_label = str(node.get("id", ""))
    properties = node.get("properties", {})
    return properties.get("label", fallback_label)

def edge_label_mapping(edge):
    """Return the edge's ``label`` property, or an empty string if unset."""
    properties = edge.get("properties", {})
    return properties.get("label", "")

def node_type_mapping(node):
    """Return the node's ``type`` property, or ``"unknown"`` if unset."""
    properties = node.get("properties", {})
    return properties.get("type", "unknown")

def node_size_mapping(node):
    """Return the ``(width, height)`` used to draw a node, based on its type.

    The type is read from the node's ``properties`` and matched
    case-insensitively. Unlisted types, and type values that are not
    strings (for example ``None``), get the default size instead of raising.
    """
    default_size = (55, 45)
    size_by_type = {
        "document": (80, 60),
        "person": (70, 70),
        "organization": (85, 65),
        "concept": (75, 55),
        "location": (65, 50),
        "time": (60, 45),
    }

    entity_type = node.get("properties", {}).get("type", "unknown")
    if not isinstance(entity_type, str):
        return default_size
    return size_by_type.get(entity_type.lower(), default_size)

# Create and display the visualization
def visualize_knowledge_graph(kg_data=None):
    """Build a yFiles ``GraphWidget`` for the knowledge graph and return it.

    If ``kg_data`` is falsy the graph is obtained from
    ``load_knowledge_graph()``. When no data is available a message is
    printed and ``None`` is returned. Otherwise the returned widget has its
    nodes, edges, mapping functions, size and organic layout configured.
    """
    # Fall back to loading from disk when the caller supplied nothing
    graph_data = kg_data or load_knowledge_graph()
    if not graph_data:
        print("No knowledge graph data available.")
        return None

    # Translate the graph into the node/edge lists the widget consumes
    node_list, edge_list = prepare_visualization_data(graph_data)

    # Populate the widget
    graph_widget = GraphWidget()
    graph_widget.nodes = node_list
    graph_widget.edges = edge_list
    graph_widget.directed = True

    # Register the custom mapping functions
    graph_widget.set_node_color_mapping(node_color_mapping)
    graph_widget.set_node_label_mapping(node_label_mapping)
    graph_widget.set_edge_label_mapping(edge_label_mapping)
    graph_widget.set_node_type_mapping(node_type_mapping)
    graph_widget.set_node_size_mapping(node_size_mapping)

    # Size of the rendered widget
    graph_widget.layout.height = '800px'
    graph_widget.layout.width = '100%'

    # Select the organic layout
    print("Applying organic layout...")
    graph_widget.organic_layout()

    # Hand the configured widget back for display
    print("Visualization ready!")
    return graph_widget

# Execute the visualization
widget = visualize_knowledge_graph()
# visualize_knowledge_graph() returns None (after printing the reason) when
# no graph data could be loaded; skip display() in that case.
if widget is not None:
    display(widget)

Note: you may need to restart the kernel to use updated packages.
Loading knowledge graph from cia_ufo_output\ufo_ocr_kg_20250517_125107.json...
Loaded knowledge graph with 1193 entities and 1250 relationships
Cleaned knowledge graph: 1129 entities and 1187 relationships
Removed 64 duplicate entities
Removed 63 duplicate or invalid relationships
Applying organic layout...
Visualization ready!


GraphWidget(layout=Layout(height='800px', width='100%'))