## Introduction

This notebook demonstrates how to convert extracted entities and relationships into a populated Neo4j knowledge graph. You'll learn to:
- Convert extracted data to graph format
- Implement entity linking and deduplication
- Populate Neo4j database with nodes and relationships
- Validate graph construction quality
- Handle large-scale graph construction

## Setup

In [None]:
# Install and import all packages in one cell
!pip install -q neo4j pandas numpy matplotlib seaborn networkx plotly json5

# Import packages immediately after installation
import hashlib
import json
import os
import re
from collections import defaultdict, Counter
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Set

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# Neo4j driver import with a graceful fallback.
# When the import fails, GraphDatabase is bound to None so the name always
# exists; code that references it later then fails inside its own error
# handling instead of raising NameError.
try:
    from neo4j import GraphDatabase
except Exception as e:  # deliberately broad: any import-time failure means "simulate"
    GraphDatabase = None
    print(f"‚ö†Ô∏è Neo4j driver issue: {e}")
    print("Neo4j operations will be simulated")
else:
    print("‚úÖ Neo4j driver loaded successfully")

print("üöÄ Setup complete! Ready for graph construction.")

## Part 1: Load Extracted Knowledge

In [None]:
### Load Data from Previous Notebook

def load_extracted_knowledge():
    """Return the knowledge saved by the previous notebook, or sample data.

    Reads ``processed_knowledge_for_graph.json`` from the current working
    directory. When that file does not exist, the result of
    ``create_sample_data()`` is returned instead.
    """
    try:
        saved_file = open('processed_knowledge_for_graph.json', 'r')
    except FileNotFoundError:
        # Nothing was saved by the previous notebook: fall back to demo data
        print("‚ö†Ô∏è Previous notebook data not found. Creating sample data...")
        return create_sample_data()

    with saved_file:
        loaded = json.load(saved_file)
    print("‚úÖ Loaded extracted knowledge from previous notebook")
    return loaded

def create_sample_data():
    """Build a small hard-coded extraction result for demonstration.

    The returned dictionary has two keys: ``extracted_knowledge`` (a list of
    two document records, each with metadata, entities and relationships) and
    ``extraction_metadata`` (summary counts for those records).
    """

    def entity(text, entity_type, context):
        # One extracted entity mention
        return {'text': text, 'type': entity_type, 'context': context}

    def relation(source, target, relationship, confidence, evidence):
        # One extracted relationship between two mentions
        return {
            'source': source,
            'target': target,
            'relationship': relationship,
            'confidence': confidence,
            'evidence': evidence,
        }

    transformer_paper = {
        'document_id': 'paper_1',
        'document_title': 'Attention Is All You Need',
        'document_metadata': {
            'authors': ['Ashish Vaswani', 'Noam Shazeer', 'Niki Parmar'],
            'year': 2017,
            'venue': 'NIPS',
        },
        'entities': [
            entity('Transformer', 'CONCEPT', 'network architecture'),
            entity('attention mechanisms', 'CONCEPT', 'based solely on'),
            entity('machine translation', 'CONCEPT', 'tasks'),
            entity('BLEU', 'METRIC', 'score'),
            entity('WMT 2014', 'DATASET', 'English-to-German'),
            entity('Ashish Vaswani', 'PERSON', 'author'),
            entity('P100 GPUs', 'TECHNOLOGY', 'eight P100 GPUs'),
        ],
        'relationships': [
            relation('Transformer', 'attention mechanisms', 'BASED_ON', 0.9,
                     'based solely on attention mechanisms'),
            relation('Transformer', 'machine translation', 'EVALUATES_ON', 0.8,
                     'Experiments on two machine translation tasks'),
            relation('Transformer', 'BLEU', 'ACHIEVES', 0.9,
                     'achieves 28.4 BLEU'),
            relation('Ashish Vaswani', 'Attention Is All You Need', 'AUTHORED', 1.0,
                     'author of the paper'),
        ],
    }

    bert_paper = {
        'document_id': 'paper_2',
        'document_title': 'BERT: Pre-training of Deep Bidirectional Transformers',
        'document_metadata': {
            'authors': ['Jacob Devlin', 'Ming-Wei Chang'],
            'year': 2018,
            'venue': 'NAACL',
        },
        'entities': [
            entity('BERT', 'CONCEPT', 'language representation model'),
            entity('Bidirectional Encoder Representations', 'CONCEPT', 'full name'),
            entity('Transformers', 'CONCEPT', 'from Transformers'),
            entity('pre-training', 'CONCEPT', 'designed to pre-train'),
            entity('question answering', 'CONCEPT', 'tasks such as'),
            entity('Jacob Devlin', 'PERSON', 'author'),
        ],
        'relationships': [
            relation('BERT', 'Transformers', 'BASED_ON', 0.9,
                     'Bidirectional Encoder Representations from Transformers'),
            relation('BERT', 'question answering', 'EVALUATES_ON', 0.8,
                     'tasks such as question answering'),
            relation('Jacob Devlin', 'BERT: Pre-training of Deep Bidirectional Transformers',
                     'AUTHORED', 1.0, 'author of the paper'),
        ],
    }

    return {
        'extracted_knowledge': [transformer_paper, bert_paper],
        'extraction_metadata': {
            'total_entities': 13,
            'total_relationships': 7,
            'source_documents': 2,
        },
    }

# Load the extraction results and keep the per-document list at hand
knowledge_data = load_extracted_knowledge()
extracted_knowledge = knowledge_data['extracted_knowledge']

# Summarise what was loaded: document count plus entity/relationship totals
print("\n".join([
    f"üìä Loaded Knowledge Summary:",
    f"   Documents: {len(extracted_knowledge)}",
    f"   Total entities: {sum(len(doc.get('entities', [])) for doc in extracted_knowledge)}",
    f"   Total relationships: {sum(len(doc.get('relationships', [])) for doc in extracted_knowledge)}",
]))

## Part 2: Entity Linking and Deduplication

In [None]:
### Entity Normalization and Linking

class EntityLinker:
    """Normalize entity mentions and merge near-duplicates into canonical entities.

    State is kept in plain dictionaries:

    * ``entity_mapping`` maps each surface text seen so far to the id of its
      canonical entity (keyed by text only, regardless of entity type).
    * ``canonical_entities`` maps a canonical id to a record with the keys
      ``id``, ``text``, ``type``, ``variants`` (set of surface texts) and
      ``contexts`` (list of context strings).
    * ``similarity_threshold`` is the minimum :meth:`calculate_similarity`
      score at which two mentions of the same type are merged.
    """

    # Connector words that contribute no letter when an acronym is derived
    # from its long form, e.g. the "from" in
    # "Bidirectional Encoder Representations from Transformers" -> "BERT".
    _ACRONYM_SKIP_WORDS = frozenset({
        'a', 'an', 'and', 'for', 'from', 'in', 'of', 'on', 'the', 'to', 'with',
    })

    def __init__(self):
        self.entity_mapping = {}  # Maps variations to canonical form
        self.canonical_entities = {}  # Stores canonical entity information
        self.similarity_threshold = 0.8

    def normalize_entity_text(self, text: str) -> str:
        """Return a lower-cased, article-free, whitespace-collapsed form of ``text``.

        The articles "the", "a" and "an" are removed as whole words. Whitespace
        is collapsed *after* that removal so no double spaces are left where
        an article used to be. The result may be an empty string (for example
        when ``text`` is just "the").
        """
        without_articles = re.sub(r'\b(the|a|an)\b', '', text.lower())
        return re.sub(r'\s+', ' ', without_articles).strip()

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Score how likely two entity texts refer to the same thing.

        Returns:
            1.0 for identical normalized forms, 0.9 when one normalized form
            contains the other, 0.95 for an acronym/long-form pair, otherwise
            the Jaccard similarity of the two normalized word sets.
        """
        norm1 = self.normalize_entity_text(text1)
        norm2 = self.normalize_entity_text(text2)

        # A text made only of articles normalizes to ''. The empty string is a
        # substring of everything, so without this guard it would score 0.9
        # against any other entity. Compare the raw texts instead.
        if not norm1 or not norm2:
            return 1.0 if text1.strip().lower() == text2.strip().lower() else 0.0

        # Exact match
        if norm1 == norm2:
            return 1.0

        # One normalized form contained in the other (character-level substring)
        if norm1 in norm2 or norm2 in norm1:
            return 0.9

        # Acronym vs. long form; uses the raw texts because case matters here
        if self._is_acronym_match(text1, text2):
            return 0.95

        # Word-level Jaccard similarity. Both sets are non-empty because the
        # normalized strings are non-empty and stripped.
        tokens1 = set(norm1.split())
        tokens2 = set(norm2.split())
        return len(tokens1 & tokens2) / len(tokens1 | tokens2)

    def _is_acronym_match(self, text1: str, text2: str) -> bool:
        """Return True when the shorter text is an acronym of the longer one.

        The shorter text must be upper-case and at most 10 characters, the
        longer one more than 10 characters with at least as many words as the
        acronym has letters. The acronym is compared against the initials of
        all words and, failing that, against the initials of the words left
        after dropping connector words such as "of" or "from".
        """
        short, long = (text1, text2) if len(text1) < len(text2) else (text2, text1)

        if not (len(short) <= 10 and short.isupper() and len(long) > 10):
            return False

        words = long.split()
        if len(words) < len(short):
            return False

        if short == ''.join(word[0].upper() for word in words):
            return True

        content_words = [word for word in words if word.lower() not in self._ACRONYM_SKIP_WORDS]
        return short == ''.join(word[0].upper() for word in content_words)

    def find_canonical_entity(self, entity: Dict[str, Any]) -> str:
        """Return the canonical id for ``entity``, creating a new one if needed.

        Args:
            entity: Mapping with the keys ``text`` and ``type``; ``context`` is
                optional and defaults to an empty string.

        Returns:
            The id of the canonical entity the mention was mapped to.

        Raises:
            KeyError: If ``entity`` has no ``text`` or ``type`` key.
        """
        entity_text = entity['text']
        entity_type = entity['type']

        # An already-seen surface text keeps its earlier mapping
        if entity_text in self.entity_mapping:
            return self.entity_mapping[entity_text]

        # Look for the most similar canonical entity of the same type
        best_match = None
        best_similarity = 0.0

        for canonical_id, canonical_info in self.canonical_entities.items():
            if canonical_info['type'] == entity_type:
                similarity = self.calculate_similarity(entity_text, canonical_info['text'])
                if similarity > best_similarity and similarity >= self.similarity_threshold:
                    best_similarity = similarity
                    best_match = canonical_id

        if best_match is not None:
            # Map this mention to the existing entity and remember the new surface form
            self.entity_mapping[entity_text] = best_match
            self.canonical_entities[best_match]['variants'].add(entity_text)
            return best_match

        # No close match: this mention becomes a new canonical entity.
        # Entities are never removed, so the running count yields a unique suffix.
        canonical_id = f"{entity_type.lower()}_{len(self.canonical_entities)}"
        self.canonical_entities[canonical_id] = {
            'id': canonical_id,
            'text': entity_text,
            'type': entity_type,
            'variants': {entity_text},
            'contexts': [entity.get('context', '')]
        }
        self.entity_mapping[entity_text] = canonical_id
        return canonical_id

    def link_entities(self, extracted_knowledge: List[Dict]) -> Dict[str, Any]:
        """Link every entity of every document and report deduplication statistics.

        Args:
            extracted_knowledge: Document records; each needs a
                ``document_title`` and may have an ``entities`` list.

        Returns:
            A dictionary with ``canonical_entities``, ``entity_mapping`` and
            ``linking_stats`` (total/unique entity counts, deduplication ratio
            and number of mappings).
        """

        print("üîó Starting entity linking process...")

        for doc in extracted_knowledge:
            for entity in doc.get('entities', []):
                canonical_id = self.find_canonical_entity(entity)
                # Record which document the mention came from
                if canonical_id in self.canonical_entities:
                    self.canonical_entities[canonical_id]['contexts'].append(
                        f"From {doc['document_title']}: {entity.get('context', '')}"
                    )

        # Generate linking statistics
        total_entities = sum(len(doc.get('entities', [])) for doc in extracted_knowledge)
        unique_entities = len(self.canonical_entities)

        linking_stats = {
            'total_entities': total_entities,
            'unique_entities': unique_entities,
            'deduplication_ratio': (total_entities - unique_entities) / total_entities if total_entities > 0 else 0,
            'entity_mappings': len(self.entity_mapping)
        }

        print(f"‚úÖ Entity linking complete:")
        print(f"   Original entities: {total_entities}")
        print(f"   Unique entities: {unique_entities}")
        print(f"   Deduplication ratio: {linking_stats['deduplication_ratio']:.2%}")

        return {
            'canonical_entities': self.canonical_entities,
            'entity_mapping': self.entity_mapping,
            'linking_stats': linking_stats
        }

# Initialize entity linker and process entities
entity_linker = EntityLinker()
linking_results = entity_linker.link_entities(extracted_knowledge)

# Display the first few canonical entities as a sanity check
print("\nüîç Sample Canonical Entities:")
for i, (canonical_id, entity_info) in enumerate(list(linking_results['canonical_entities'].items())[:5]):
    print(f"   {i+1}. {canonical_id}: {entity_info['text']} ({entity_info['type']})")
    if len(entity_info['variants']) > 1:
        # Variants are stored in a set; sort them so the output is stable between runs
        print(f"      Variants: {', '.join(sorted(entity_info['variants']))}")

### Relationship Normalization

In [None]:
class RelationshipProcessor:
    """Rewrite extracted relationships onto canonical entity ids and deduplicate them.

    Attributes:
        entity_mapping: Surface text -> canonical entity id.
        processed_relationships: Result of the most recent
            :meth:`process_relationships` call.
        relationship_stats: Relationship type -> number of normalized
            relationships seen (counted before deduplication, accumulated
            across calls).
    """

    def __init__(self, entity_mapping: Dict[str, str]):
        self.entity_mapping = entity_mapping
        self.processed_relationships = []
        self.relationship_stats = defaultdict(int)

    def normalize_relationship(self, relationship: Dict[str, Any], doc_id: str) -> Optional[Dict[str, Any]]:
        """Map a relationship's endpoints to canonical ids.

        Endpoints without an entry in ``entity_mapping`` keep their original
        text. Missing fields default to ``'RELATED'`` (type), ``0.5``
        (confidence) and ``''`` (source, target, evidence).

        Returns:
            The normalized relationship, or ``None`` when both endpoints
            resolve to the same canonical value (a self-relationship).
        """

        source_text = relationship.get('source', '')
        target_text = relationship.get('target', '')

        # Map to canonical entities
        source_canonical = self.entity_mapping.get(source_text, source_text)
        target_canonical = self.entity_mapping.get(target_text, target_text)

        # Skip self-relationships
        if source_canonical == target_canonical:
            return None

        return {
            'source': source_canonical,
            'target': target_canonical,
            'relationship': relationship.get('relationship', 'RELATED'),
            'confidence': relationship.get('confidence', 0.5),
            'evidence': relationship.get('evidence', ''),
            'source_document': doc_id,
            'original_source': source_text,
            'original_target': target_text
        }

    def process_relationships(self, extracted_knowledge: List[Dict]) -> List[Dict]:
        """Normalize and deduplicate the relationships of all documents.

        Relationships with the same (source, target, type) are collapsed into
        the first occurrence, which keeps its own evidence and source document
        but takes the highest confidence seen among the duplicates.

        Args:
            extracted_knowledge: Document records; each needs a
                ``document_id`` and may have a ``relationships`` list.

        Returns:
            The unique relationships in first-seen order.
        """

        print("üîÑ Processing relationships...")

        total_normalized = 0
        # Keyed lookup keeps deduplication linear; dicts preserve insertion
        # order, so the first-seen order of the result is retained.
        unique_by_key: Dict[Tuple[str, str, str], Dict[str, Any]] = {}

        for doc in extracted_knowledge:
            doc_id = doc['document_id']

            for relationship in doc.get('relationships', []):
                normalized = self.normalize_relationship(relationship, doc_id)
                if not normalized:
                    continue

                total_normalized += 1
                self.relationship_stats[normalized['relationship']] += 1

                rel_key = (normalized['source'], normalized['target'], normalized['relationship'])
                existing_rel = unique_by_key.get(rel_key)
                if existing_rel is None:
                    unique_by_key[rel_key] = normalized
                else:
                    # Duplicate: keep the first record, merge in the higher confidence
                    existing_rel['confidence'] = max(existing_rel['confidence'], normalized['confidence'])

        unique_relationships = list(unique_by_key.values())
        self.processed_relationships = unique_relationships

        print(f"‚úÖ Relationship processing complete:")
        print(f"   Total relationships: {total_normalized}")
        print(f"   Unique relationships: {len(unique_relationships)}")
        print(f"   Relationship types: {len(self.relationship_stats)}")

        return unique_relationships

# Normalize every extracted relationship onto the canonical entity ids
relationship_processor = RelationshipProcessor(linking_results['entity_mapping'])
processed_relationships = relationship_processor.process_relationships(extracted_knowledge)

# Show how often each relationship type occurred, most frequent first
print(f"\nüìä Relationship Type Distribution:")
relationship_counter = Counter(relationship_processor.relationship_stats)

for rel_type, count in relationship_counter.most_common():
    print(f"   {rel_type}: {count}")

## Part 3: Neo4j Database Population

In [None]:
### Neo4j Connection Management

class Neo4jGraphConstructor:
    """Run Cypher against Neo4j, or simulate it when no database is available.

    When all three credentials are given and a test query succeeds, queries
    are sent to the database (``use_real_neo4j`` is True). In every other case
    the instance is in simulation mode: queries are printed, not executed.
    """

    def __init__(self, uri: Optional[str] = None, user: Optional[str] = None, password: Optional[str] = None):
        """Store the credentials and try to connect if all of them are provided.

        Connection failures are reported on stdout and leave the instance in
        simulation mode; they are never raised to the caller.
        """
        self.uri = uri
        self.user = user
        self.password = password
        self.driver = None
        self.use_real_neo4j = False

        # Try to connect to Neo4j
        if uri and user and password:
            try:
                self.driver = GraphDatabase.driver(uri, auth=(user, password))
                # Test connection
                with self.driver.session() as session:
                    session.run("RETURN 1")
                self.use_real_neo4j = True
                print("‚úÖ Connected to Neo4j database")
            except Exception as e:
                # Creating the driver can succeed while the test query fails;
                # release that driver so it neither leaks nor stays referenced.
                self._discard_driver()
                print(f"‚ö†Ô∏è Neo4j connection failed: {e}")
                print("Will simulate Neo4j operations")
        else:
            print("üìù No Neo4j credentials provided. Simulating operations.")

    def _discard_driver(self):
        """Close and forget a driver whose connection could not be verified."""
        driver, self.driver = self.driver, None
        if driver is None:
            return
        try:
            driver.close()
        except Exception:
            # Best-effort cleanup: the connection failure is already being
            # reported by the caller, a close error adds nothing to it.
            pass

    def execute_query(self, query: str, parameters: Optional[Dict] = None) -> List[Dict]:
        """Execute a Cypher query.

        Args:
            query: Cypher text.
            parameters: Query parameters; ``None`` is sent as an empty mapping.

        Returns:
            With a live connection, one dictionary per result record, or an
            empty list when execution raised (the error is printed). In
            simulation mode, always ``[{"status": "simulated"}]``.
        """
        if self.use_real_neo4j and self.driver:
            try:
                with self.driver.session() as session:
                    result = session.run(query, parameters or {})
                    return [record.data() for record in result]
            except Exception as e:
                print(f"‚ùå Query execution error: {e}")
                return []
        else:
            # Simulate query execution: show the start of the query and the
            # names (not the values) of the parameters
            print(f"üîç Simulated Query: {query[:100]}...")
            if parameters:
                print(f"   Parameters: {list(parameters.keys())}")
            return [{"status": "simulated"}]

    def clear_database(self):
        """Delete every node and relationship; returns the query result."""
        query = "MATCH (n) DETACH DELETE n"
        result = self.execute_query(query)
        print("üóëÔ∏è Database cleared")
        return result

    def create_indexes(self):
        """Create the Entity id/type and Document id indexes if they do not exist."""
        indexes = [
            "CREATE INDEX entity_id_index IF NOT EXISTS FOR (e:Entity) ON (e.id)",
            "CREATE INDEX document_id_index IF NOT EXISTS FOR (d:Document) ON (d.id)",
            "CREATE INDEX entity_type_index IF NOT EXISTS FOR (e:Entity) ON (e.type)"
        ]

        for index_query in indexes:
            self.execute_query(index_query)

        print("üìä Created database indexes")

    def close(self):
        """Close the database connection, if one was opened."""
        if self.driver:
            self.driver.close()
            print("Connection closed")

# Initialize Neo4j constructor
# Credentials are read from the environment so real secrets never need to be
# written into the notebook. The placeholders below are used for any variable
# that is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://your-instance.databases.neo4j.io")  # Your Neo4j URI
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")                              # Usually 'neo4j'
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "your-password-here")                 # Your password

# For demonstration, we'll use simulation mode
neo4j_constructor = Neo4jGraphConstructor()  # No credentials = simulation mode

# Uncomment to use a real Neo4j instance with the credentials above:
# neo4j_constructor = Neo4jGraphConstructor(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

# Clear database and create indexes.
# NOTE: with a real connection, clear_database() deletes every node and relationship.
neo4j_constructor.clear_database()
neo4j_constructor.create_indexes()

### Document Node Creation

In [None]:
def create_document_nodes(extracted_knowledge: List[Dict], neo4j_constructor: Neo4jGraphConstructor):
    """Create one ``Document`` node per extracted document.

    Each node gets the document id and title plus authors, year and venue from
    ``document_metadata`` (defaulting to ``[]``, ``0`` and ``''``). All
    documents are sent in a single query; its result is returned.
    """

    print("üìÑ Creating document nodes...")

    def as_node_properties(doc):
        # Flatten one document record into the properties the query expects
        metadata = doc.get('document_metadata', {})
        return {
            'id': doc['document_id'],
            'title': doc['document_title'],
            'authors': metadata.get('authors', []),
            'year': metadata.get('year', 0),
            'venue': metadata.get('venue', '')
        }

    documents = [as_node_properties(doc) for doc in extracted_knowledge]

    document_creation_query = """
    UNWIND $documents as doc
    CREATE (d:Document {
        id: doc.id,
        title: doc.title,
        authors: doc.authors,
        year: doc.year,
        venue: doc.venue,
        created_at: datetime()
    })
    """

    outcome = neo4j_constructor.execute_query(document_creation_query, {'documents': documents})

    print(f"‚úÖ Created {len(documents)} document nodes")
    return outcome

# Build the Document nodes from the loaded extraction results
doc_creation_result = create_document_nodes(
    extracted_knowledge=extracted_knowledge,
    neo4j_constructor=neo4j_constructor,
)

### Entity Node Creation

In [None]:
def create_entity_nodes(canonical_entities: Dict[str, Dict], neo4j_constructor: Neo4jGraphConstructor):
    """Create one ``Entity`` node per canonical entity, 100 entities per query.

    Variants are sent as a list and at most the first five contexts of each
    entity are kept. Returns the number of entities submitted.
    """

    print("üè∑Ô∏è Creating entity nodes...")

    entity_creation_query = """
    UNWIND $entities as entity
    CREATE (e:Entity {
        id: entity.id,
        text: entity.text,
        type: entity.type,
        variants: entity.variants,
        contexts: entity.contexts,
        created_at: datetime()
    })
    """

    # Convert each canonical record into query-ready properties
    entities = [
        {
            'id': canonical_id,
            'text': entity_info['text'],
            'type': entity_info['type'],
            'variants': list(entity_info['variants']),
            'contexts': entity_info['contexts'][:5]  # Limit contexts to avoid too much data
        }
        for canonical_id, entity_info in canonical_entities.items()
    ]

    # Submit in fixed-size batches to avoid memory issues
    batch_size = 100
    total_created = 0

    for batch_number, start in enumerate(range(0, len(entities), batch_size), start=1):
        batch = entities[start:start + batch_size]
        neo4j_constructor.execute_query(entity_creation_query, {'entities': batch})
        total_created += len(batch)
        print(f"   Created batch {batch_number}: {len(batch)} entities")

    print(f"‚úÖ Created {total_created} entity nodes")
    return total_created

# Build the Entity nodes from the canonical entities found during linking
entity_creation_result = create_entity_nodes(
    canonical_entities=linking_results['canonical_entities'],
    neo4j_constructor=neo4j_constructor,
)

### Relationship Creation

In [None]:
def create_relationships(processed_relationships: List[Dict], neo4j_constructor: Neo4jGraphConstructor):
    """Create the processed relationships between ``Entity`` nodes.

    Relationships are grouped by type (one query text per type) and submitted
    50 at a time. Both endpoints are looked up with ``MATCH`` on the entity
    id, so a row whose source or target id has no ``Entity`` node produces no
    relationship in the database.

    Returns:
        The number of relationships submitted. This counts rows handed to
        ``execute_query``, not rows confirmed as created.
    """

    print("üîó Creating relationships...")

    # Group relationships by type for efficient creation
    relationships_by_type = defaultdict(list)
    for rel in processed_relationships:
        relationships_by_type[rel['relationship']].append(rel)

    total_created = 0

    for rel_type, relationships in relationships_by_type.items():
        print(f"   Creating {len(relationships)} {rel_type} relationships...")

        # A relationship type cannot be passed as a query parameter, so it has
        # to be spliced into the query text. The value comes from extracted
        # data: quote it with backticks (doubling any backtick inside) so that
        # spaces or special characters neither break nor alter the query.
        escaped_type = "`" + str(rel_type).replace("`", "``") + "`"

        # Create relationship query
        relationship_query = f"""
        UNWIND $relationships as rel
        MATCH (source:Entity {{id: rel.source}})
        MATCH (target:Entity {{id: rel.target}})
        CREATE (source)-[r:{escaped_type} {{
            confidence: rel.confidence,
            evidence: rel.evidence,
            source_document: rel.source_document,
            created_at: datetime()
        }}]->(target)
        """

        # Execute in batches
        batch_size = 50
        for i in range(0, len(relationships), batch_size):
            batch = relationships[i:i + batch_size]
            try:
                neo4j_constructor.execute_query(relationship_query, {'relationships': batch})
                total_created += len(batch)
            except Exception as e:
                print(f"   ‚ö†Ô∏è Error creating {rel_type} relationships: {e}")

    print(f"‚úÖ Created {total_created} relationships")
    return total_created

# Write the deduplicated relationships between the entity nodes
relationship_creation_result = create_relationships(
    processed_relationships=processed_relationships,
    neo4j_constructor=neo4j_constructor,
)

### Document-Entity Connections

In [None]:
def create_document_entity_connections(
    extracted_knowledge: List[Dict],
    entity_mapping: Dict[str, str],
    neo4j_constructor: Neo4jGraphConstructor,
):
    """Link each document to the entities it mentions with ``CONTAINS``.

    One connection is prepared per entity mention; the entity id is the
    canonical id from ``entity_mapping``, or the mention text itself when the
    text has no mapping. Connections are submitted 100 at a time.

    Returns:
        The number of connections submitted.
    """

    print("üìã Creating document-entity connections...")

    connection_query = """
    UNWIND $connections as conn
    MATCH (doc:Document {id: conn.document_id})
    MATCH (entity:Entity {id: conn.entity_id})
    CREATE (doc)-[:CONTAINS {
        context: conn.context,
        created_at: datetime()
    }]->(entity)
    """

    # One row per (document, entity mention) pair
    connections = [
        {
            'document_id': doc['document_id'],
            'entity_id': entity_mapping.get(entity['text'], entity['text']),
            'context': entity.get('context', '')
        }
        for doc in extracted_knowledge
        for entity in doc.get('entities', [])
    ]

    # Submit in fixed-size batches
    batch_size = 100
    total_created = 0

    for start in range(0, len(connections), batch_size):
        chunk = connections[start:start + batch_size]
        neo4j_constructor.execute_query(connection_query, {'connections': chunk})
        total_created += len(chunk)

    print(f"‚úÖ Created {total_created} document-entity connections")
    return total_created

# Connect every document to the canonical entities it mentions
connection_result = create_document_entity_connections(
    extracted_knowledge=extracted_knowledge,
    entity_mapping=linking_results['entity_mapping'],
    neo4j_constructor=neo4j_constructor,
)

## Part 4: Graph Validation and Quality Assessment

In [None]:
### Graph Statistics and Validation

class GraphValidator:
    """Collect statistics about the constructed graph and score its quality.

    All data is obtained through the ``execute_query`` method of the
    constructor passed in; result rows that lack an expected field are
    counted as zero or shown as "Unknown".
    """

    def __init__(self, neo4j_constructor: Neo4jGraphConstructor):
        self.neo4j = neo4j_constructor

    @staticmethod
    def _total_count(rows: List[Dict]) -> int:
        """Sum the ``count`` field over result rows; rows without it add 0."""
        return sum(item.get('count', 0) for item in rows)

    def get_graph_statistics(self) -> Dict[str, Any]:
        """Run the statistics queries and return their raw results.

        Returns:
            A dictionary with the keys ``node_counts``, ``relationship_counts``,
            ``entity_types``, ``connectivity``, ``top_entities`` and
            ``isolated_nodes``; each value is the row list returned by
            ``execute_query`` for the corresponding query.
        """

        print("üìä Collecting graph statistics...")

        # Basic node and relationship counts
        node_count_query = "MATCH (n) RETURN labels(n)[0] as label, count(n) as count"
        node_counts = self.neo4j.execute_query(node_count_query)

        rel_count_query = "MATCH ()-[r]->() RETURN type(r) as type, count(r) as count"
        rel_counts = self.neo4j.execute_query(rel_count_query)

        # Entity type distribution
        entity_type_query = "MATCH (e:Entity) RETURN e.type as type, count(e) as count"
        entity_types = self.neo4j.execute_query(entity_type_query)

        # Connectivity statistics
        connectivity_query = """
        MATCH (n)
        OPTIONAL MATCH (n)-[r]-()
        RETURN labels(n)[0] as node_type,
               count(DISTINCT n) as nodes,
               count(r) as total_relationships,
               count(r) * 1.0 / count(DISTINCT n) as avg_degree
        """
        connectivity_stats = self.neo4j.execute_query(connectivity_query)

        # Most connected entities
        top_entities_query = """
        MATCH (e:Entity)
        OPTIONAL MATCH (e)-[r]-()
        RETURN e.text as entity, e.type as type, count(r) as degree
        ORDER BY degree DESC
        LIMIT 10
        """
        top_entities = self.neo4j.execute_query(top_entities_query)

        # Isolated nodes (nodes with no relationships)
        isolated_nodes_query = """
        MATCH (n)
        WHERE NOT (n)-[]-()
        RETURN labels(n)[0] as type, count(n) as count
        """
        isolated_nodes = self.neo4j.execute_query(isolated_nodes_query)

        return {
            'node_counts': node_counts,
            'relationship_counts': rel_counts,
            'entity_types': entity_types,
            'connectivity': connectivity_stats,
            'top_entities': top_entities,
            'isolated_nodes': isolated_nodes
        }

    def validate_graph_quality(self, stats: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Check the graph for structural problems and compute a quality score.

        Args:
            stats: Statistics as returned by :meth:`get_graph_statistics`.
                When omitted they are collected here; pass them in to avoid
                running the statistics queries a second time.

        Returns:
            A dictionary with ``issues`` and ``warnings`` (lists of messages)
            and ``quality_score``, the sum of four weighted factors:
            share of non-isolated nodes (0.3), relationship type diversity
            capped at 5 types (0.3), entity type diversity capped at 6 types
            (0.2) and node count capped at 100 nodes (0.2).
        """

        print("üîç Validating graph quality...")

        validation_results = {
            'issues': [],
            'warnings': [],
            'quality_score': 0.0
        }

        if stats is None:
            stats = self.get_graph_statistics()

        # Check for isolated nodes
        isolated_count = self._total_count(stats.get('isolated_nodes', []))
        total_nodes = self._total_count(stats.get('node_counts', []))

        if isolated_count > 0:
            isolation_ratio = isolated_count / total_nodes if total_nodes > 0 else 0
            if isolation_ratio > 0.3:
                validation_results['issues'].append(f"High isolation ratio: {isolation_ratio:.2%} of nodes are isolated")
            elif isolation_ratio > 0.1:
                validation_results['warnings'].append(f"Moderate isolation ratio: {isolation_ratio:.2%} of nodes are isolated")

        # Check relationship distribution
        rel_counts = {item.get('type', 'UNKNOWN'): item.get('count', 0) for item in stats.get('relationship_counts', [])}
        total_rels = sum(rel_counts.values())

        if total_rels == 0:
            validation_results['issues'].append("No relationships found in graph")
        else:
            # Check for relationship diversity
            rel_types = len(rel_counts)
            if rel_types < 3:
                validation_results['warnings'].append(f"Low relationship diversity: only {rel_types} relationship types")

        # Check entity type distribution
        entity_types = {item.get('type', 'UNKNOWN'): item.get('count', 0) for item in stats.get('entity_types', [])}
        if len(entity_types) < 3:
            validation_results['warnings'].append(f"Low entity type diversity: only {len(entity_types)} entity types")

        # Calculate quality score
        quality_factors = []

        # Factor 1: Connection density (lower isolation = higher quality)
        if total_nodes > 0:
            connection_factor = 1.0 - (isolated_count / total_nodes)
            quality_factors.append(connection_factor * 0.3)

        # Factor 2: Relationship diversity
        if total_rels > 0:
            diversity_factor = min(len(rel_counts) / 5.0, 1.0)  # Normalize to max 5 types
            quality_factors.append(diversity_factor * 0.3)

        # Factor 3: Entity type diversity
        entity_diversity_factor = min(len(entity_types) / 6.0, 1.0)  # Normalize to max 6 types
        quality_factors.append(entity_diversity_factor * 0.2)

        # Factor 4: Graph size (more nodes = potentially higher quality, up to a point)
        size_factor = min(total_nodes / 100.0, 1.0)  # Normalize to 100 nodes
        quality_factors.append(size_factor * 0.2)

        validation_results['quality_score'] = sum(quality_factors)

        return validation_results

    def print_validation_report(self, stats: Dict, validation: Dict):
        """Print statistics, the quality score and any issues or warnings.

        Args:
            stats: Statistics as returned by :meth:`get_graph_statistics`.
            validation: Result of :meth:`validate_graph_quality`.
        """

        print("\n" + "="*60)
        print("üìã GRAPH CONSTRUCTION VALIDATION REPORT")
        print("="*60)

        # Basic statistics
        print("\nüìä Graph Statistics:")
        total_nodes = self._total_count(stats.get('node_counts', []))
        total_rels = self._total_count(stats.get('relationship_counts', []))

        print(f"   Total Nodes: {total_nodes}")
        print(f"   Total Relationships: {total_rels}")

        print("\n   Node Distribution:")
        for item in stats.get('node_counts', []):
            print(f"      {item.get('label', 'Unknown')}: {item.get('count', 0)}")

        print("\n   Relationship Distribution:")
        for item in stats.get('relationship_counts', []):
            print(f"      {item.get('type', 'Unknown')}: {item.get('count', 0)}")

        print("\n   Entity Type Distribution:")
        for item in stats.get('entity_types', []):
            print(f"      {item.get('type', 'Unknown')}: {item.get('count', 0)}")

        # Top connected entities (only the first five are shown)
        print("\nüîó Most Connected Entities:")
        for item in stats.get('top_entities', [])[:5]:
            print(f"      {item.get('entity', 'Unknown')} ({item.get('type', 'Unknown')}): {item.get('degree', 0)} connections")

        # Quality assessment
        print(f"\nüéØ Quality Assessment:")
        print(f"   Overall Quality Score: {validation['quality_score']:.2f}/1.0")

        if validation['issues']:
            print(f"\n‚ùå Issues Found:")
            for issue in validation['issues']:
                print(f"      ‚Ä¢ {issue}")

        if validation['warnings']:
            print(f"\n‚ö†Ô∏è Warnings:")
            for warning in validation['warnings']:
                print(f"      ‚Ä¢ {warning}")

        if not validation['issues'] and not validation['warnings']:
            print(f"   ‚úÖ No issues detected!")

        print("\n" + "="*60)

# Gather statistics and a quality assessment for the graph that was just built
validator = GraphValidator(neo4j_constructor=neo4j_constructor)
graph_stats = validator.get_graph_statistics()
validation_results = validator.validate_graph_quality()

# Show both in a single report
validator.print_validation_report(stats=graph_stats, validation=validation_results)

## Part 5: Graph Visualization and Analysis

In [None]:
### NetworkX Graph Creation for Visualization

def create_networkx_graph(canonical_entities: Dict, processed_relationships: List[Dict]) -> nx.Graph:
    """Build an undirected NetworkX graph from linked entities and relationships.

    Args:
        canonical_entities: Mapping of entity id to an info dict; the keys
            ``'text'``, ``'type'`` and ``'variants'`` are read from each value.
        processed_relationships: Relationship dicts; the keys ``'source'``,
            ``'target'``, ``'relationship'`` and ``'confidence'`` are read.

    Returns:
        An ``nx.Graph`` with one node per canonical entity (attributes
        ``text``, ``type`` and ``variants`` = number of variants) and one edge
        per relationship whose endpoints are both known entities.
    """

    print("üé® Creating NetworkX graph for visualization...")

    graph = nx.Graph()

    # One node per canonical entity, carrying its display attributes.
    graph.add_nodes_from(
        (
            entity_id,
            {
                'text': info['text'],
                'type': info['type'],
                'variants': len(info['variants']),
            },
        )
        for entity_id, info in canonical_entities.items()
    )

    # Only connect endpoints that exist as entity nodes; a relationship that
    # points at an unknown entity is skipped instead of creating a bare node.
    known_nodes = graph.nodes
    for relation in processed_relationships:
        if relation['source'] not in known_nodes or relation['target'] not in known_nodes:
            continue
        graph.add_edge(
            relation['source'],
            relation['target'],
            relationship=relation['relationship'],
            confidence=relation['confidence'],
        )

    print(f"‚úÖ Created NetworkX graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")
    return graph

def visualize_graph_structure(G: nx.Graph):
    """Draw ``G`` with matplotlib, colouring each node by its ``type`` attribute.

    Nodes are positioned with a spring layout, labelled with the first 15
    characters of their ``text`` attribute (the node id when ``text`` is
    absent), and a legend maps each entity type to its colour. The figure is
    shown immediately; nothing is returned.

    Args:
        G: Graph whose nodes may carry ``type`` and ``text`` attributes.
    """
    from matplotlib.patches import Patch

    plt.figure(figsize=(15, 10))

    # Node positions
    layout = nx.spring_layout(G, k=2, iterations=50)

    # One colour per distinct entity type, sampled evenly from the Set3 colormap.
    distinct_types = list(set(nx.get_node_attributes(G, 'type').values()))
    palette = plt.cm.Set3(np.linspace(0, 1, len(distinct_types)))
    type_color_map = {kind: shade for kind, shade in zip(distinct_types, palette)}

    # Nodes without a (known) type fall back to gray.
    node_colors = []
    for node in G.nodes():
        node_type = G.nodes[node].get('type', 'UNKNOWN')
        node_colors.append(type_color_map.get(node_type, 'gray'))

    # Draw nodes and edges; labels are drawn separately below so they can be
    # truncated and use a smaller font.
    nx.draw(
        G,
        layout,
        node_color=node_colors,
        node_size=300,
        font_size=8,
        font_weight='bold',
        edge_color='gray',
        alpha=0.7,
        with_labels=False,
    )

    short_labels = {}
    for node in G.nodes():
        short_labels[node] = G.nodes[node].get('text', node)[:15]
    nx.draw_networkx_labels(G, layout, short_labels, font_size=6)

    # Legend: one patch per entity type, placed just outside the axes.
    legend_patches = []
    for kind in distinct_types:
        legend_patches.append(Patch(facecolor=type_color_map[kind], label=kind))
    plt.legend(handles=legend_patches, loc='upper right', bbox_to_anchor=(1.15, 1))

    plt.title("Knowledge Graph Structure", size=16, weight='bold')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

def analyze_graph_metrics(G: nx.Graph):
    """Print size, connectivity and centrality metrics for ``G``.

    Always prints node count, edge count and density. For a non-empty graph it
    additionally prints either diameter and average shortest path length (when
    the graph is connected) or the number of connected components and the size
    of the largest one, followed by the top five nodes by degree centrality
    and by betweenness centrality.

    Args:
        G: Graph to analyse; node ``text`` attributes are used as display
            names, falling back to the node id.
    """

    print("üìà GRAPH ANALYSIS METRICS")
    print("="*40)

    # Basic metrics
    print(f"Nodes: {G.number_of_nodes()}")
    print(f"Edges: {G.number_of_edges()}")
    print(f"Density: {nx.density(G):.4f}")

    # Everything below is only reported for a graph with at least one node.
    if not G.number_of_nodes() > 0:
        return

    # Connectivity
    if nx.is_connected(G):
        print("Graph is connected")
        print(f"Diameter: {nx.diameter(G)}")
        print(f"Average shortest path length: {nx.average_shortest_path_length(G):.2f}")
    else:
        parts = list(nx.connected_components(G))
        print(f"Graph has {len(parts)} connected components")
        biggest = max(map(len, parts))
        print(f"Largest component size: {biggest}")

    # Centrality measures: both are computed up front, then reported in turn.
    by_degree = nx.degree_centrality(G)
    by_betweenness = nx.betweenness_centrality(G)

    reports = (
        ("\nTop 5 nodes by degree centrality:", by_degree),
        ("\nTop 5 nodes by betweenness centrality:", by_betweenness),
    )
    for heading, scores in reports:
        print(heading)
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        for node, score in ranked[:5]:
            label = G.nodes[node].get('text', node)
            print(f"   {label}: {score:.3f}")

# Create and analyze NetworkX graph.
# linking_results and processed_relationships come from earlier cells;
# nx_graph is reused by the interactive visualization cell below.
nx_graph = create_networkx_graph(linking_results['canonical_entities'], processed_relationships)
visualize_graph_structure(nx_graph)
analyze_graph_metrics(nx_graph)

**### Interactive Graph Visualization**

In [None]:
def create_interactive_graph_visualization(G: nx.Graph):
    """Create an interactive graph visualization using Plotly.

    All edges are drawn as a single gray line trace; nodes are drawn as one
    marker trace per entity type so that each type gets its own legend entry
    and colour. Hovering a node shows its text, type, variant count and
    degree. The figure is displayed with ``fig.show()``; nothing is returned.

    Args:
        G: Graph whose nodes may carry ``text``, ``type`` and ``variants``
            attributes. Nodes without a ``type`` are grouped under
            ``'UNKNOWN'`` so they are still plotted.
    """

    print("üñ•Ô∏è Creating interactive graph visualization...")

    # Create layout
    pos = nx.spring_layout(G, k=2, iterations=50)

    # Edge coordinates: each segment is (start, end, None); the None breaks
    # the line so all edges can share a single Scatter trace.
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(x=edge_x, y=edge_y,
                            line=dict(width=1, color='gray'),
                            hoverinfo='none',
                            mode='lines')

    # Group nodes by entity type in a single pass over the graph.
    nodes_by_type = {}
    for node in G.nodes():
        node_type = G.nodes[node].get('type', 'UNKNOWN')
        nodes_by_type.setdefault(node_type, []).append(node)

    # Sorted (by string form, so mixed value types cannot break the sort) to
    # keep the type -> colour assignment stable between runs.
    entity_types = sorted(nodes_by_type, key=str)

    # Cycle through the palette so more types than palette entries cannot
    # leave a type without a colour.
    palette = px.colors.qualitative.Set3
    type_color_map = {
        entity_type: palette[index % len(palette)]
        for index, entity_type in enumerate(entity_types)
    }

    node_traces = []

    for entity_type in entity_types:
        type_nodes = nodes_by_type[entity_type]

        node_x = [pos[node][0] for node in type_nodes]
        node_y = [pos[node][1] for node in type_nodes]

        # Create hover text and the short on-canvas label for each node.
        hover_text = []
        short_labels = []
        for node in type_nodes:
            node_data = G.nodes[node]
            # str() so a non-string node id used as fallback can be sliced.
            text = str(node_data.get('text', node))
            variants = node_data.get('variants', 1)
            degree = G.degree[node]

            hover_info = f"<b>{text}</b><br>"
            hover_info += f"Type: {entity_type}<br>"
            hover_info += f"Variants: {variants}<br>"
            hover_info += f"Connections: {degree}"
            hover_text.append(hover_info)
            short_labels.append(text[:10])

        # Create trace for this entity type
        node_trace = go.Scatter(
            x=node_x, y=node_y,
            mode='markers+text',
            hoverinfo='text',
            text=short_labels,
            textposition="middle center",
            hovertext=hover_text,
            marker=dict(
                size=10,
                color=type_color_map[entity_type],
                line=dict(width=2, color='white')
            ),
            name=str(entity_type)
        )
        node_traces.append(node_trace)

    # Create the figure. The title font is set through layout.title.font;
    # the older flat `titlefont` attribute is not accepted by current Plotly.
    fig = go.Figure(data=[edge_trace] + node_traces,
                   layout=go.Layout(
                        title=dict(text='Interactive Knowledge Graph',
                                   font=dict(size=16)),
                        showlegend=True,
                        hovermode='closest',
                        margin=dict(b=20,l=5,r=5,t=40),
                        annotations=[ dict(
                            text="Hover over nodes for details",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002,
                            xanchor="left", yanchor="bottom",
                            font=dict(color="gray", size=12)
                        )],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                   )

    fig.show()
    print("‚úÖ Interactive visualization created!")

# Create interactive visualization.
# An empty graph is skipped with a warning message instead of being plotted.
if nx_graph.number_of_nodes() > 0:
    create_interactive_graph_visualization(nx_graph)
else:
    print("‚ö†Ô∏è Graph is empty, skipping interactive visualization")

**## Part 6: Large-Scale Graph Construction Considerations**

In [None]:
### Batch Processing for Large Datasets

class LargeScaleGraphConstructor:
    """Handle large-scale graph construction with batching and optimization.

    Entities are linked, nodes created, relationships normalized and
    relationships created in slices of ``batch_size`` items. Progress counters
    are kept in ``construction_stats`` and accumulate across calls on the same
    instance.
    """

    def __init__(self, neo4j_constructor: Neo4jGraphConstructor, batch_size: int = 1000):
        """Store the Neo4j constructor and batch size and reset the counters.

        Args:
            neo4j_constructor: Passed through to ``create_entity_nodes`` and
                ``create_relationships`` for every batch.
            batch_size: Maximum number of items handled per batch.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. Zero would make
                ``range()`` fail later and a negative value would silently
                process nothing.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        self.neo4j = neo4j_constructor
        self.batch_size = batch_size
        self.construction_stats = {
            'entities_processed': 0,
            'relationships_processed': 0,
            'batches_completed': 0,  # incremented for entity-linking batches only
            'errors': []
        }

    def _iter_batches(self, items: List):
        """Yield ``(batch_number, batch)`` pairs; numbers start at 1."""
        for start in range(0, len(items), self.batch_size):
            yield start // self.batch_size + 1, items[start:start + self.batch_size]

    def construct_graph_in_batches(self, extracted_knowledge: List[Dict],
                                  entity_linker: EntityLinker) -> Dict[str, Any]:
        """Construct graph in batches for large datasets.

        Args:
            extracted_knowledge: Documents; ``'document_id'`` is read from
                every document that has entities or relationships, and the
                optional ``'entities'`` / ``'relationships'`` lists are
                processed.
            entity_linker: Object whose ``link_entities(docs)`` returns a dict
                with ``'entity_mapping'`` and ``'canonical_entities'``.

        Returns:
            Dict with the merged ``entity_mapping`` and ``canonical_entities``,
            the list of ``processed_relationships`` and the instance's
            ``construction_stats`` dict (the same object, not a copy).
        """

        print(f"üèóÔ∏è Starting large-scale graph construction (batch size: {self.batch_size})")

        # Step 1: Flatten entities, remembering which document each came from.
        all_entities = [
            (entity, doc['document_id'])
            for doc in extracted_knowledge
            for entity in doc.get('entities', [])
        ]

        print(f"üìä Total entities to process: {len(all_entities)}")

        # Process entities in batches
        entity_mapping = {}
        canonical_entities = {}

        for batch_number, batch in self._iter_batches(all_entities):
            print(f"   Processing entity batch {batch_number}")

            # Regroup the batch per source document so the linker input keeps
            # the same document shape as the unbatched input.
            # NOTE(review): EntityLinker is defined outside this view; confirm
            # that link_entities only needs 'document_id' and 'entities'.
            entities_by_doc = {}
            for entity, doc_id in batch:
                entities_by_doc.setdefault(doc_id, []).append(entity)
            batch_docs = [
                {'document_id': doc_id, 'entities': entities}
                for doc_id, entities in entities_by_doc.items()
            ]
            batch_linking = entity_linker.link_entities(batch_docs)

            # Merge results; on a key collision the later batch's entry wins.
            entity_mapping.update(batch_linking['entity_mapping'])
            canonical_entities.update(batch_linking['canonical_entities'])

            self.construction_stats['entities_processed'] += len(batch)
            self.construction_stats['batches_completed'] += 1

        # Step 2: Create nodes in batches
        self._create_nodes_in_batches(canonical_entities)

        # Step 3: Flatten relationships, again keeping the source document id.
        all_relationships = [
            (rel, doc['document_id'])
            for doc in extracted_knowledge
            for rel in doc.get('relationships', [])
        ]

        print(f"üìä Total relationships to process: {len(all_relationships)}")

        # Process relationships in batches
        relationship_processor = RelationshipProcessor(entity_mapping)
        processed_rels = []

        for batch_number, batch in self._iter_batches(all_relationships):
            print(f"   Processing relationship batch {batch_number}")

            for rel, doc_id in batch:
                normalized = relationship_processor.normalize_relationship(rel, doc_id)
                # A falsy result means the relationship is dropped.
                if normalized:
                    processed_rels.append(normalized)

            self.construction_stats['relationships_processed'] += len(batch)

        # Step 4: Create relationships in batches
        self._create_relationships_in_batches(processed_rels)

        print("‚úÖ Large-scale construction complete!")
        print(f"   Entities processed: {self.construction_stats['entities_processed']}")
        print(f"   Relationships processed: {self.construction_stats['relationships_processed']}")
        print(f"   Batches completed: {self.construction_stats['batches_completed']}")

        return {
            'entity_mapping': entity_mapping,
            'canonical_entities': canonical_entities,
            'processed_relationships': processed_rels,
            'construction_stats': self.construction_stats
        }

    def _create_nodes_in_batches(self, canonical_entities: Dict):
        """Create entity nodes in batches via ``create_entity_nodes``."""
        for _, batch in self._iter_batches(list(canonical_entities.items())):
            create_entity_nodes(dict(batch), self.neo4j)

    def _create_relationships_in_batches(self, relationships: List[Dict]):
        """Create relationships in batches via ``create_relationships``."""
        for _, batch in self._iter_batches(relationships):
            create_relationships(batch, self.neo4j)

# Demonstrate large-scale construction (on our sample data).
# Only the constructor is instantiated here; construct_graph_in_batches() is
# not called in this cell.
large_scale_constructor = LargeScaleGraphConstructor(neo4j_constructor, batch_size=10)

# Note: For demonstration with small sample data
print("üìù Large-scale construction demo (with small sample data):")
print("   In production, this would handle thousands of entities efficiently")
print("   Key benefits: Memory management, progress tracking, error recovery")

**## Part 7: Export and Integration**

In [None]:
### Export Functions for Other Tools

class GraphExporter:
    """Export constructed knowledge graph to various formats.

    All data is read through ``neo4j_constructor.execute_query``. A result
    that is empty, ``None``, or whose first row has ``status == 'simulated'``
    is treated as "no real data" by every export method.
    """

    def __init__(self, neo4j_constructor: Neo4jGraphConstructor):
        """Keep a reference to the object used to run Cypher queries."""
        self.neo4j = neo4j_constructor

    @staticmethod
    def _real_rows(rows) -> List[Dict]:
        """Return ``rows`` unchanged, or ``[]`` when there is no real data."""
        if not rows or rows[0].get('status') == 'simulated':
            return []
        return rows

    def _export_query_to_csv(self, query: str, path: str, label: str) -> None:
        """Run ``query`` and write its rows to ``path``; skip when no real data."""
        rows = self._real_rows(self.neo4j.execute_query(query))
        if not rows:
            return
        frame = pd.DataFrame(rows)
        frame.to_csv(path, index=False)
        print(f"   ‚úÖ Exported {len(frame)} {label} to {path}")

    def export_to_csv(self, output_prefix: str = "knowledge_graph"):
        """Export graph data to CSV files.

        Writes ``<output_prefix>_entities.csv``,
        ``<output_prefix>_relationships.csv`` and
        ``<output_prefix>_documents.csv``. A file is only written when its
        query returned real rows.

        Args:
            output_prefix: Path prefix shared by the three output files.
        """

        print("üì§ Exporting graph to CSV files...")

        # Export entities
        entities_query = """
        MATCH (e:Entity)
        RETURN e.id as id, e.text as text, e.type as type,
               e.variants as variants, size(e.contexts) as context_count
        """
        self._export_query_to_csv(entities_query, f"{output_prefix}_entities.csv", "entities")

        # Export relationships
        relationships_query = """
        MATCH (s)-[r]->(t)
        RETURN s.text as source, t.text as target, type(r) as relationship,
               r.confidence as confidence, r.source_document as source_document
        """
        self._export_query_to_csv(relationships_query, f"{output_prefix}_relationships.csv", "relationships")

        # Export documents
        documents_query = """
        MATCH (d:Document)
        RETURN d.id as id, d.title as title, d.authors as authors,
               d.year as year, d.venue as venue
        """
        self._export_query_to_csv(documents_query, f"{output_prefix}_documents.csv", "documents")

    def export_for_graph_rag(self, filename: str = "graph_rag_data.json"):
        """Export graph data specifically formatted for Graph RAG systems.

        Args:
            filename: Path of the JSON file to write.

        Returns:
            The exported dict with keys ``'entities'``, ``'reasoning_paths'``
            and ``'export_metadata'``. The two data lists are empty when the
            corresponding query returned no real rows.
        """

        print("üéØ Exporting data for Graph RAG system...")

        # Get all entities with their connections
        entities_with_connections_query = """
        MATCH (e:Entity)
        OPTIONAL MATCH (e)-[r]-(connected)
        RETURN e.id as id, e.text as text, e.type as type,
               count(r) as connection_count,
               collect(DISTINCT type(r)) as relationship_types
        """
        entities_data = self.neo4j.execute_query(entities_with_connections_query)

        # Get relationship paths for multi-hop reasoning (1 to 3 hops, capped
        # at 100 rows by the query itself)
        paths_query = """
        MATCH path = (s:Entity)-[*1..3]-(t:Entity)
        WHERE s <> t
        RETURN s.text as start, t.text as end,
               [rel in relationships(path) | type(rel)] as path_types,
               length(path) as path_length
        LIMIT 100
        """
        paths_data = self.neo4j.execute_query(paths_query)

        # _real_rows guards against empty/None results, which would otherwise
        # fail when looking at the first row.
        graph_rag_export = {
            'entities': self._real_rows(entities_data),
            'reasoning_paths': self._real_rows(paths_data),
            'export_metadata': {
                'export_timestamp': datetime.now().isoformat(),
                'graph_type': 'knowledge_graph',
                'intended_use': 'graph_rag_retrieval'
            }
        }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(graph_rag_export, f, indent=2)

        print(f"‚úÖ Graph RAG data exported to {filename}")
        return graph_rag_export

# Export the constructed graph.
# CSV files are written with the "constructed_knowledge_graph" prefix; the
# Graph RAG export uses its default filename and also returns the exported dict.
exporter = GraphExporter(neo4j_constructor)
exporter.export_to_csv("constructed_knowledge_graph")
graph_rag_data = exporter.export_for_graph_rag()

**## Part 8: Summary and Next Steps**

In [None]:
### Construction Pipeline Summary

def summarize_graph_construction():
    """Print a recap of the pipeline steps, results and follow-up work.

    Reads the module-level ``linking_results``, ``processed_relationships``
    and ``validation_results`` produced by earlier cells; each result section
    is printed only when the corresponding value is truthy. Returns nothing.
    """

    def show_items(items):
        # Every list in the summary is printed with the same three-space indent.
        for item in items:
            print(f"   {item}")

    print("üìã GRAPH CONSTRUCTION PIPELINE SUMMARY")
    print("="*60)

    print("üîÑ Pipeline Steps:")
    show_items([
        "1. Load extracted knowledge from previous notebook",
        "2. Entity linking and deduplication",
        "3. Relationship normalization and processing",
        "4. Neo4j database population (documents, entities, relationships)",
        "5. Graph validation and quality assessment",
        "6. Visualization and analysis",
        "7. Export for Graph RAG integration",
    ])

    print("\nüìä Construction Results:")
    if linking_results:
        print(f"   Unique Entities: {len(linking_results['canonical_entities'])}")
        print(f"   Entity Deduplication Ratio: {linking_results['linking_stats']['deduplication_ratio']:.2%}")

    if processed_relationships:
        print(f"   Processed Relationships: {len(processed_relationships)}")

    if validation_results:
        print(f"   Graph Quality Score: {validation_results['quality_score']:.2f}/1.0")

    print("\nüéØ Key Achievements:")
    show_items([
        "‚úÖ Automated entity linking and deduplication",
        "‚úÖ Normalized relationships for consistent graph structure",
        "‚úÖ Populated Neo4j knowledge graph with validation",
        "‚úÖ Quality assessment and graph analytics",
        "‚úÖ Multiple export formats for downstream use",
        "‚úÖ Scalable pipeline for large document collections",
    ])

    print("\nüöÄ Ready for Next Steps:")
    show_items([
        "‚Ä¢ Graph-Enhanced Retrieval (Notebook 12.4)",
        "‚Ä¢ Multi-hop reasoning implementation",
        "‚Ä¢ Graph traversal for question answering",
        "‚Ä¢ End-to-End Graph RAG System (Notebook 12.5)",
    ])

    print("\nüí° Production Considerations:")
    show_items([
        "‚Ä¢ Implement incremental graph updates for new documents",
        "‚Ä¢ Add graph versioning for reproducibility",
        "‚Ä¢ Scale Neo4j infrastructure for large datasets",
        "‚Ä¢ Implement graph backup and recovery procedures",
        "‚Ä¢ Monitor graph quality metrics over time",
    ])

# Run pipeline summary (reads linking_results, processed_relationships and
# validation_results from earlier cells)
summarize_graph_construction()

# Clean up connections.
# Done last: the validator and exporter above were built around this same
# neo4j_constructor object.
neo4j_constructor.close()

print("\nüéâ Graph Construction Complete!")
print("The knowledge graph is now ready for Graph RAG retrieval in the next notebook.")

## Key Takeaways

- ✅ **Entity Deduplication**: Automated linking reduces redundancy and improves graph quality
- ✅ **Scalable Construction**: Batch processing handles large document collections efficiently
- ✅ **Quality Validation**: Systematic assessment ensures reliable knowledge graphs
- ✅ **Multiple Exports**: Flexible output formats for various downstream applications
- ✅ **Production Ready**: Designed for real-world deployment with proper error handling

Continue to Notebook 12.4 to learn how to implement graph-enhanced retrieval using this constructed knowledge graph!