# Knowledge Graph Wiki System - Demonstration Notebook

This notebook demonstrates the capabilities of the Knowledge Graph Wiki API and provides interactive exploration of the Wikipedia knowledge graph.

## Table of Contents
1. [Setup and Connection](#setup)
2. [Database Overview](#overview)
3. [Graph Exploration](#exploration)
4. [API Demonstrations](#api)
5. [Advanced Queries](#advanced)
6. [Visualizations](#visualizations)

## 1. Setup and Connection <a name="setup"></a>

In [None]:
# Import required libraries
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from neo4j import GraphDatabase
import json

# Notebook-wide plot defaults: seaborn's "whitegrid" theme and a 12x8 inch
# default figure size for every chart that does not pass its own figsize.
sns.set_style(style="whitegrid")
plt.rc('figure', figsize=(12, 8))

print("Libraries imported successfully!")

In [None]:
# API Configuration
API_BASE_URL = "http://localhost:8000"

# Neo4j Configuration
# NOTE: these are local demo credentials; change them for any other deployment.
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# Test API connection
# requests waits indefinitely by default, so an unreachable API would hang
# this cell forever; the timeout (seconds) makes the failure visible instead.
response = requests.get(f"{API_BASE_URL}/health", timeout=10)
print(f"API Health Status: {response.json()}")

In [None]:
# Connect to Neo4j
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Constructing the driver does not by itself prove the server is reachable or
# that the credentials are accepted. Check explicitly so the success message
# below is only printed when the database can actually be used.
driver.verify_connectivity()


def run_query(query, parameters=None):
    """Execute a Cypher query and return its records as a list of dicts.

    Args:
        query: Cypher statement to run.
        parameters: Optional mapping of query parameters; an empty mapping
            is sent when omitted.

    Returns:
        A list with one ``dict`` per returned record, keyed by the names
        used in the query's RETURN clause. Empty when no rows match.
    """
    # The session is opened per call and closed on exit, so the records are
    # materialised into plain dicts before the session goes away.
    with driver.session() as session:
        result = session.run(query, parameters or {})
        return [dict(record) for record in result]


print("Neo4j connection established!")

## 2. Database Overview <a name="overview"></a>

In [None]:
# Get overall statistics
# Each step uses OPTIONAL MATCH: with a plain MATCH, a label (or the
# relationship set) with zero members produces no rows, the grouped count()
# then yields no rows either, and the whole query returns nothing. OPTIONAL
# MATCH keeps the row alive with a null, which count() reports as 0.
stats_query = """
OPTIONAL MATCH (a:Article)
WITH count(a) as articles
OPTIONAL MATCH (t:Topic)
WITH articles, count(t) as topics
OPTIONAL MATCH (au:Author)
WITH articles, topics, count(au) as authors
OPTIONAL MATCH ()-[r]->()
RETURN articles, topics, authors, count(r) as relationships
"""

# The query always returns exactly one row, so indexing [0] is safe.
stats = run_query(stats_query)[0]
print("\n=== Knowledge Graph Statistics ===")
print(f"Total Articles: {stats['articles']:,}")
print(f"Total Topics: {stats['topics']:,}")
print(f"Total Authors: {stats['authors']:,}")
print(f"Total Relationships: {stats['relationships']:,}")

In [None]:
# Bar chart of how many nodes exist per label, using the counts gathered in
# `stats`.
node_types = ['Articles', 'Topics', 'Authors']
node_counts = [stats[key] for key in ('articles', 'topics', 'authors')]

# Vertical gap between the top of a bar and its numeric label: 2% of the
# tallest bar.
label_offset = max(node_counts)*0.02

plt.figure(figsize=(10, 6))
plt.bar(node_types, node_counts, color=['#3498db', '#e74c3c', '#2ecc71'])
plt.title('Knowledge Graph Node Distribution', fontsize=16, fontweight='bold')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Node Type', fontsize=12)
for position, count in enumerate(node_counts):
    # Write the thousands-separated count just above each bar.
    plt.text(position, count + label_offset, f'{count:,}', ha='center', fontsize=10)
plt.tight_layout()
plt.show()

## 3. Graph Exploration <a name="exploration"></a>

In [None]:
# Find most connected articles
# Degree counts relationships in either direction; OPTIONAL MATCH keeps
# articles that have no relationships at all (degree 0).
top_articles_query = """
MATCH (a:Article)
OPTIONAL MATCH (a)-[r]-()
WITH a, count(r) as degree
ORDER BY degree DESC
LIMIT 10
RETURN a.article_id as id, a.article_title as title, degree
"""

top_articles = pd.DataFrame(run_query(top_articles_query))
print("\n=== Top 10 Most Connected Articles ===")
# Report an empty graph explicitly, matching the other query cells in this
# notebook, instead of printing pandas' "Empty DataFrame" placeholder.
if not top_articles.empty:
    print(top_articles.to_string(index=False))
else:
    print("No article data available")

In [None]:
# Visualize top articles
# A query with no rows yields a DataFrame without columns, so selecting
# 'title' / 'degree' would raise KeyError; only draw when there is data.
if not top_articles.empty:
    plt.figure(figsize=(12, 6))
    plt.barh(top_articles['title'], top_articles['degree'], color='#3498db')
    plt.xlabel('Number of Connections', fontsize=12)
    plt.title('Top 10 Most Connected Articles', fontsize=16, fontweight='bold')
    # barh draws the first row at the bottom; flip so the top article is on top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
else:
    print("No article data available to plot")

In [None]:
# Explore topic distribution
# OPTIONAL MATCH keeps topics that no article points to (article_count 0).
topic_query = """
MATCH (t:Topic)
OPTIONAL MATCH (a:Article)-[:HAS_TOPIC]->(t)
WITH t, count(a) as article_count
ORDER BY article_count DESC
LIMIT 15
RETURN t.topic_name as topic, article_count
"""

topics = pd.DataFrame(run_query(topic_query))
print("\n=== Top 15 Topics by Article Count ===")
# Report an empty result explicitly, matching the other query cells in this
# notebook, instead of printing pandas' "Empty DataFrame" placeholder.
if not topics.empty:
    print(topics.to_string(index=False))
else:
    print("No topic data available")

In [None]:
# Visualize topic distribution
# A query with no rows yields a DataFrame without columns, so selecting
# 'topic' / 'article_count' would raise KeyError; only draw when there is data.
if not topics.empty:
    plt.figure(figsize=(12, 8))
    plt.barh(topics['topic'], topics['article_count'], color='#e74c3c')
    plt.xlabel('Number of Articles', fontsize=12)
    plt.title('Top 15 Topics by Article Count', fontsize=16, fontweight='bold')
    # barh draws the first row at the bottom; flip so the top topic is on top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
else:
    print("No topic data available to plot")

## 4. API Demonstrations <a name="api"></a>

In [None]:
# Test search endpoint
# The timeout (seconds) stops the cell from blocking forever if the API does
# not answer; requests has no default timeout.
search_response = requests.post(
    f"{API_BASE_URL}/api/v1/search",
    json={"search_term": "organization", "limit": 5},
    timeout=10,
)

print("\n=== Search Results for 'organization' ===")
if search_response.status_code == 200:
    # Presumably the endpoint returns a JSON list of node objects with
    # 'id', 'labels' and 'properties' keys; missing keys fall back to
    # placeholders rather than raising.
    results = search_response.json()
    for i, result in enumerate(results, 1):
        print(f"\n{i}. {result.get('properties', {}).get('article_title', 'N/A')}")
        print(f"   ID: {result.get('id', 'N/A')}")
        print(f"   Labels: {', '.join(result.get('labels', []))}")
else:
    print(f"Error: {search_response.status_code}")

In [None]:
# Test analytics endpoint
# Query parameters go through `params` so requests handles the encoding, and
# the timeout (seconds) stops the cell from blocking forever if the API does
# not answer; requests has no default timeout.
analytics_response = requests.get(
    f"{API_BASE_URL}/api/v1/advanced/analytics",
    params={"top_n": 5},
    timeout=10,
)

print("\n=== Knowledge Graph Analytics ===")
if analytics_response.status_code == 200:
    analytics = analytics_response.json()
    # Missing summary fields are shown as 0 rather than raising.
    print(f"Total Articles: {analytics.get('total_articles', 0):,}")
    print(f"Total Communities: {analytics.get('total_communities', 0):,}")
    print(f"Total Edges: {analytics.get('total_edges', 0):,}")
    print(f"Average Degree: {analytics.get('avg_degree', 0):.2f}")

    print("\nTop Communities:")
    for comm in analytics.get('top_communities', []):
        print(f"  - Community {comm.get('community_id')}: {comm.get('size')} articles")
else:
    print(f"Error: {analytics_response.status_code}")

## 5. Advanced Queries <a name="advanced"></a>

In [None]:
# Articles linked to more than one topic, ordered by how many topics they have
multi_topic_query = """
MATCH (a:Article)-[:HAS_TOPIC]->(t:Topic)
WITH a, collect(t.topic_name) as topics
WHERE size(topics) > 1
RETURN a.article_title as article, size(topics) as topic_count, topics
ORDER BY topic_count DESC
LIMIT 10
"""

multi_topic_articles = pd.DataFrame(run_query(multi_topic_query))
print("\n=== Articles with Multiple Topics ===")
if multi_topic_articles.empty:
    print("No articles found with multiple topics")
else:
    for idx, row in multi_topic_articles.iterrows():
        topic_names = row['topics']
        print(f"\n{idx + 1}. {row['article']}")
        print(f"   Topic Count: {row['topic_count']}")
        if len(topic_names) > 5:
            # Long lists are cut to the first five names, with "..." marking
            # the truncation.
            print(f"   Topics: {', '.join(topic_names[:5])}...")
        else:
            print(f"   Topics: {', '.join(topic_names)}")

In [None]:
# Rank authors by the number of articles they are linked to via AUTHORED
author_query = """
MATCH (au:Author)-[:AUTHORED]->(a:Article)
WITH au, count(a) as articles
ORDER BY articles DESC
LIMIT 10
RETURN au.author_name as author, articles
"""

authors = pd.DataFrame(run_query(author_query))
print("\n=== Top 10 Most Prolific Authors ===")
if authors.empty:
    # The query returned no rows, so there is nothing to tabulate.
    print("No author data available")
else:
    print(authors.to_string(index=False))

## 6. Visualizations <a name="visualizations"></a>

In [None]:
# Plot how many articles have each degree (relationship count, any direction)
degree_query = """
MATCH (a:Article)
OPTIONAL MATCH (a)-[r]-()
WITH a, count(r) as degree
RETURN degree, count(a) as count
ORDER BY degree
"""

degree_dist = pd.DataFrame(run_query(degree_query))

if not degree_dist.empty:
    # Same chart as the pyplot state-machine calls would give, built through
    # explicit figure/axes handles.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(degree_dist['degree'], degree_dist['count'], marker='o', linewidth=2)
    ax.set_xlabel('Degree (Number of Connections)', fontsize=12)
    ax.set_ylabel('Number of Articles', fontsize=12)
    ax.set_title('Article Degree Distribution', fontsize=16, fontweight='bold')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
# Draw a sample (at most 50 edges) of the article-to-article reference network
subgraph_query = """
MATCH (a:Article)-[r:REFERS_TO]->(b:Article)
RETURN a.article_id as source, b.article_id as target, a.article_title as source_title, b.article_title as target_title
LIMIT 50
"""

edges = run_query(subgraph_query)

if edges:
    # Directed graph keyed by article id; nodes are created implicitly as
    # their edges are added.
    G = nx.DiGraph()
    G.add_edges_from((edge['source'], edge['target']) for edge in edges)

    # Force-directed layout, then nodes and arrows drawn as separate layers.
    plt.figure(figsize=(14, 10))
    pos = nx.spring_layout(G, k=0.5, iterations=50)

    nx.draw_networkx_nodes(G, pos, node_size=300, node_color='#3498db', alpha=0.7)
    nx.draw_networkx_edges(G, pos, edge_color='gray', alpha=0.5, arrows=True, arrowsize=10)

    plt.title('Sample Article Reference Network', fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    # Summary figures for the sampled subgraph.
    node_total = G.number_of_nodes()
    edge_total = G.number_of_edges()
    print("\nNetwork Statistics:")
    print(f"Nodes: {node_total}")
    print(f"Edges: {edge_total}")
    print(f"Density: {nx.density(G):.4f}")

In [None]:
# Cleanup: close the Neo4j driver created earlier in the notebook. Cells that
# call run_query will not work after this until the driver is re-created.
driver.close()
print("\nDemo complete! Neo4j connection closed.")