**# SETUP AND DEPENDENCIES**

In [None]:
# Install required packages
!pip install -q neo4j pandas numpy matplotlib networkx sentence-transformers scikit-learn faiss-cpu

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from typing import List, Dict, Any, Tuple
from collections import defaultdict, deque
import warnings
warnings.filterwarnings('ignore')

try:
    from sentence_transformers import SentenceTransformer
    import faiss
    from sklearn.metrics.pairwise import cosine_similarity
    print("‚úÖ All packages loaded successfully")
except ImportError as e:
    print(f"‚ö†Ô∏è Some packages may not be available: {e}")

print("üöÄ Setup complete! Ready for graph-enhanced retrieval.")

**# LOAD KNOWLEDGE GRAPH DATA**

In [None]:
def load_sample_knowledge_graph():
    """Load the exported knowledge graph, or fall back to built-in sample data.

    Reads ``processed_knowledge_for_graph.json`` from the current working
    directory. When that file does not exist, the dictionary produced by
    ``create_sample_graph_data()`` is returned instead.

    Returns:
        The parsed JSON content of the file, or the sample graph dictionary.
    """
    try:
        # JSON is UTF-8 by specification; pin the encoding so entity text with
        # non-ASCII characters loads the same way on every platform locale.
        with open('processed_knowledge_for_graph.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print("‚ö†Ô∏è Creating sample data for demonstration...")
        return create_sample_graph_data()

    print("‚úÖ Loaded knowledge graph from previous notebook")
    return data

def create_sample_graph_data():
    """Build the small demo graph used when no exported graph file is present.

    Returns:
        A dictionary with three sections: ``'entities'`` (id -> entity dict),
        ``'relationships'`` (list of edge dicts) and ``'documents'``
        (id -> document dict listing the entity ids it mentions).
    """
    # (id, text, type) rows, expanded into entity dicts below
    entity_rows = [
        ('concept_0', 'Transformer', 'CONCEPT'),
        ('concept_1', 'attention mechanisms', 'CONCEPT'),
        ('concept_2', 'BERT', 'CONCEPT'),
        ('concept_3', 'machine translation', 'CONCEPT'),
        ('person_0', 'Ashish Vaswani', 'PERSON'),
        ('person_1', 'Jacob Devlin', 'PERSON'),
        ('metric_0', 'BLEU', 'METRIC'),
        ('dataset_0', 'WMT 2014', 'DATASET'),
    ]
    entities = {
        entity_id: {'id': entity_id, 'text': label, 'type': kind}
        for entity_id, label, kind in entity_rows
    }

    # (source, target, type, confidence) rows, expanded into edge dicts below
    edge_rows = [
        ('concept_0', 'concept_1', 'BASED_ON', 0.9),
        ('concept_2', 'concept_0', 'BASED_ON', 0.85),
        ('concept_0', 'concept_3', 'EVALUATES_ON', 0.8),
        ('concept_0', 'metric_0', 'ACHIEVES', 0.9),
        ('person_0', 'concept_0', 'INTRODUCED', 1.0),
        ('person_1', 'concept_2', 'INTRODUCED', 1.0),
    ]
    relationships = [
        {'source': origin, 'target': destination, 'type': label, 'confidence': weight}
        for origin, destination, label, weight in edge_rows
    ]

    documents = {}
    documents['paper_1'] = {
        'id': 'paper_1',
        'title': 'Attention Is All You Need',
        'content': 'We propose a new simple network architecture, the Transformer, based solely on attention mechanisms...',
        'entities': ['concept_0', 'concept_1', 'person_0', 'metric_0'],
    }
    documents['paper_2'] = {
        'id': 'paper_2',
        'title': 'BERT: Pre-training of Deep Bidirectional Transformers',
        'content': 'We introduce BERT, which stands for Bidirectional Encoder Representations from Transformers...',
        'entities': ['concept_2', 'concept_0', 'person_1'],
    }

    return {
        'entities': entities,
        'relationships': relationships,
        'documents': documents,
    }

# Load the knowledge graph data: the exported JSON file when it exists,
# otherwise the built-in sample graph.
kg_data = load_sample_knowledge_graph()
# .get() with empty defaults keeps this summary line working even when a
# section is missing from the loaded data.
print(f"üìä Knowledge Graph: {len(kg_data.get('entities', {}))} entities, {len(kg_data.get('relationships', []))} relationships")


**# PART 1: GRAPH TRAVERSAL RETRIEVER**

In [None]:
class GraphTraversalRetriever:
    """Traverse a knowledge graph that is held as plain dictionaries.

    ``kg_data`` carries an ``'entities'`` mapping (entity id -> dict with at
    least ``'text'`` and ``'type'``) and a ``'relationships'`` list (dicts with
    ``'source'``, ``'target'``, ``'type'`` and an optional ``'confidence'``).
    Forward and reverse adjacency lists are built once at construction time.
    """

    def __init__(self, kg_data: Dict):
        """Store the graph data and build the adjacency structures.

        Args:
            kg_data: Graph dictionary; missing ``'entities'`` or
                ``'relationships'`` sections are treated as empty.
        """
        self.kg_data = kg_data
        self.entities = kg_data.get('entities', {})
        self.relationships = kg_data.get('relationships', [])
        self.build_graph_structure()

    def build_graph_structure(self):
        """(Re)build forward and reverse adjacency lists from ``self.relationships``.

        A relationship without a ``'confidence'`` value is assigned 0.5.
        """
        self.adjacency_list = defaultdict(list)
        self.reverse_adjacency = defaultdict(list)

        for rel in self.relationships:
            source = rel['source']
            target = rel['target']
            rel_type = rel['type']
            confidence = rel.get('confidence', 0.5)

            # Forward edges
            self.adjacency_list[source].append({
                'target': target,
                'relationship': rel_type,
                'confidence': confidence
            })

            # Reverse edges for bidirectional traversal
            self.reverse_adjacency[target].append({
                'source': source,
                'relationship': rel_type,
                'confidence': confidence
            })

    def find_entity_by_text(self, text: str, threshold: float = 0.8) -> List[str]:
        """Return ids of entities whose text matches ``text``, best match first.

        Matching is case-insensitive and scored in three tiers: 1.0 for an
        exact match, 0.9 when one text contains the other, and 0.7 when any
        whitespace-separated query word occurs in the entity text. Only
        matches scoring at least ``threshold`` are returned, so the default
        of 0.8 excludes the word-level tier.

        Args:
            text: Query text; surrounding whitespace is ignored.
            threshold: Minimum score a match needs to be returned.

        Returns:
            Entity ids ordered by descending score. Empty for a blank query.
        """
        query = text.strip().lower() if text else ""
        if not query:
            # The empty string is a substring of every entity text, so without
            # this guard a blank query would "match" the whole graph.
            return []

        scored = []
        for entity_id, entity_data in self.entities.items():
            entity_text = entity_data['text'].lower()
            if not entity_text:
                # Same reasoning in the other direction: '' is in every query.
                continue

            if query == entity_text:
                score = 1.0
            elif query in entity_text or entity_text in query:
                score = 0.9
            elif any(word in entity_text for word in query.split()):
                score = 0.7
            else:
                continue

            if score >= threshold:
                scored.append((entity_id, score))

        # list.sort is stable, so equally scored entities keep graph order.
        scored.sort(key=lambda item: item[1], reverse=True)
        return [entity_id for entity_id, _ in scored]

    def get_direct_neighbors(self, entity_id: str, max_neighbors: int = 10) -> Dict[str, Any]:
        """List the entities one edge away from ``entity_id``.

        Outgoing edges are listed first, followed by incoming edges.
        Neighbours whose id is not present in ``self.entities`` are skipped.

        Args:
            entity_id: Id of the entity to inspect.
            max_neighbors: Maximum number of neighbour records to return.

        Returns:
            A dict with ``'entity_id'``, ``'entity_text'`` (``'Unknown'`` when
            the id is not a known entity) and ``'neighbors'``, a list of
            records carrying ``neighbor_id``, ``neighbor_text``,
            ``neighbor_type``, ``relationship`` and ``confidence``.
        """
        incident = [
            (edge['target'], edge) for edge in self.adjacency_list.get(entity_id, [])
        ]
        incident += [
            (edge['source'], edge) for edge in self.reverse_adjacency.get(entity_id, [])
        ]

        results = []
        for neighbor_id, edge in incident:
            if neighbor_id not in self.entities:
                continue
            neighbor_data = self.entities[neighbor_id]
            results.append({
                'neighbor_id': neighbor_id,
                'neighbor_text': neighbor_data['text'],
                'neighbor_type': neighbor_data['type'],
                'relationship': edge['relationship'],
                'confidence': edge['confidence']
            })

        return {
            'entity_id': entity_id,
            'entity_text': self.entities.get(entity_id, {}).get('text', 'Unknown'),
            'neighbors': results[:max_neighbors]
        }

    def find_multi_hop_paths(self, start_entity: str, end_entity: str, max_hops: int = 3,
                             max_paths: int = 10) -> List[Dict]:
        """Find directed paths from ``start_entity`` to ``end_entity`` via BFS.

        Only forward (source -> target) edges are followed. A path never
        visits the same node twice. Path confidence is the product of the
        confidences of its edges.

        Args:
            start_entity: Id of the entity to start from.
            end_entity: Id of the entity to reach.
            max_hops: Maximum number of edges a path may contain.
            max_paths: Stop searching once this many paths were found
                (defaults to 10, the previously hard-coded limit).

        Returns:
            Path dicts with ``node_path`` (entity texts), ``rel_path``
            (relationship types), ``path_confidence`` and ``path_length``,
            sorted by descending confidence and then ascending length.
            Empty when either endpoint is not a known entity.
        """
        if start_entity not in self.entities or end_entity not in self.entities:
            # Unknown endpoints cannot be rendered as text; report "no path"
            # instead of failing with a KeyError.
            return []

        if start_entity == end_entity:
            return [{
                'node_path': [self.entities[start_entity]['text']],
                'rel_path': [],
                'path_confidence': 1.0,
                'path_length': 0
            }]

        # Each queue item is (current node, node ids so far, relations so far,
        # accumulated confidence). Every queued path is distinct by
        # construction, so no separate visited set is needed; cycles are
        # prevented by the "neighbor in path" check below.
        queue = deque([(start_entity, [start_entity], [], 1.0)])
        paths = []

        while queue and len(paths) < max_paths:
            current, path, relations, confidence = queue.popleft()

            if current == end_entity:
                paths.append({
                    'node_path': [self.entities[node_id]['text'] for node_id in path],
                    'rel_path': relations,
                    'path_confidence': confidence,
                    'path_length': len(path) - 1
                })
                continue

            if len(path) > max_hops:
                # One more edge would exceed the hop budget; do not enqueue
                # extensions that would only be discarded later.
                continue

            for neighbor_info in self.adjacency_list.get(current, []):
                neighbor = neighbor_info['target']
                # Skip cycles and edges pointing at ids without entity data
                # (consistent with get_direct_neighbors).
                if neighbor in path or neighbor not in self.entities:
                    continue
                queue.append((
                    neighbor,
                    path + [neighbor],
                    relations + [neighbor_info['relationship']],
                    confidence * neighbor_info['confidence'],
                ))

        paths.sort(key=lambda x: (-x['path_confidence'], x['path_length']))
        return paths

    def get_entity_subgraph(self, entity_ids: List[str], max_depth: int = 2) -> Dict[str, Any]:
        """Extract the neighbourhood of the given entities up to ``max_depth`` hops.

        Starting from the seed entities, the graph is expanded level by level
        along both outgoing and incoming edges. Every edge incident to an
        expanded entity is reported once, in its original direction. Ids that
        have no entry in ``self.entities`` are ignored.

        Args:
            entity_ids: Seed entity ids.
            max_depth: Number of expansion levels.

        Returns:
            A dict with ``'entities'`` (id -> entity dict), ``'relationships'``
            (dicts with ``source``, ``target``, ``relationship``,
            ``confidence``) and ``'stats'`` (``entity_count``,
            ``relationship_count``, ``max_depth``).
        """
        # dicts serve as insertion-ordered sets so the output is deterministic.
        members = dict.fromkeys(eid for eid in entity_ids if eid in self.entities)
        frontier = list(members)
        relationships = []
        seen_edges = set()

        for _ in range(max_depth):
            if not frontier:
                break

            discovered = {}
            for entity_id in frontier:
                # (source, target, other endpoint, edge info) for every edge
                # touching this entity, outgoing first.
                incident = [
                    (entity_id, edge['target'], edge['target'], edge)
                    for edge in self.adjacency_list.get(entity_id, [])
                ]
                incident += [
                    (edge['source'], entity_id, edge['source'], edge)
                    for edge in self.reverse_adjacency.get(entity_id, [])
                ]

                for source, target, other, edge in incident:
                    if other not in self.entities:
                        continue

                    # An edge between two expanded entities is seen from both
                    # ends; the key makes sure it is recorded only once.
                    edge_key = (source, target, edge['relationship'], edge['confidence'])
                    if edge_key not in seen_edges:
                        seen_edges.add(edge_key)
                        relationships.append({
                            'source': source,
                            'target': target,
                            'relationship': edge['relationship'],
                            'confidence': edge['confidence']
                        })

                    if other not in members:
                        discovered[other] = None

            # Only entities first reached on this level are expanded next.
            members.update(discovered)
            frontier = list(discovered)

        entities = {eid: self.entities[eid] for eid in members}
        return {
            'entities': entities,
            'relationships': relationships,
            'stats': {
                'entity_count': len(entities),
                'relationship_count': len(relationships),
                'max_depth': max_depth
            }
        }

# Build the traversal helper on top of the loaded knowledge graph
traversal_retriever = GraphTraversalRetriever(kg_data=kg_data)
print("‚úÖ Graph traversal retriever initialized")

**# PART 2: ENTITY EMBEDDINGS**

In [None]:
class GraphEntityEmbedder:
    """Create, index and search embedding vectors for graph entities.

    When the sentence-transformer model cannot be loaded, deterministic mock
    vectors are used instead so the rest of the pipeline can still be
    demonstrated. Similarity search uses a FAISS inner-product index when the
    ``faiss`` module is available in this module's namespace and a brute-force
    NumPy search otherwise; both operate on L2-normalised vectors, so scores
    are cosine similarities.
    """

    # Dimension of the mock vectors (the value the mock path has always used).
    MOCK_DIMENSION = 384

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Try to load the embedding model; fall back to mock mode on failure.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.embedding_model = None
        try:
            self.embedding_model = SentenceTransformer(model_name)
            self.model_loaded = True
            print(f"‚úÖ Embedding model loaded: {model_name}")
        except Exception as e:
            # Deliberately broad: this also covers the package not having been
            # imported at all (NameError), which the notebook setup tolerates.
            print(f"‚ö†Ô∏è Could not load embedding model: {e}")
            print("Will use mock embeddings for demonstration")
            self.model_loaded = False

        self.entity_embeddings = {}
        self.embedding_index = None          # FAISS index, when faiss is usable
        self.entity_id_to_index = {}
        self.index_to_entity_id = {}
        self._search_matrix = None           # normalised float32 matrix, one row per entity
        self._using_mock_embeddings = not self.model_loaded

    def create_entity_embeddings(self, entities: Dict[str, Dict]) -> Dict[str, np.ndarray]:
        """Create one embedding per entity and store them on the instance.

        Each entity is embedded as ``"<type>: <text>"``. If the model is not
        loaded, or encoding fails, mock embeddings are produced instead.
        Any previously built search index is discarded.

        Args:
            entities: Mapping of entity id to a dict with ``'text'`` and ``'type'``.

        Returns:
            Mapping of entity id to its embedding vector.
        """
        if not self.model_loaded:
            return self._create_mock_embeddings(entities)

        print(f"üîÑ Creating embeddings for {len(entities)} entities...")

        entity_ids = list(entities)
        texts_to_embed = [
            f"{entities[entity_id]['type']}: {entities[entity_id]['text']}"
            for entity_id in entity_ids
        ]

        try:
            embedding_vectors = self.embedding_model.encode(texts_to_embed, convert_to_numpy=True)
            embeddings = {
                entity_id: embedding_vectors[i] for i, entity_id in enumerate(entity_ids)
            }
            print(f"‚úÖ Created embeddings with dimension {embedding_vectors.shape[1]}")
        except Exception as e:
            print(f"‚ùå Error creating embeddings: {e}")
            return self._create_mock_embeddings(entities)

        self._store_embeddings(embeddings, mock=False)
        return embeddings

    def _store_embeddings(self, embeddings: Dict[str, np.ndarray], mock: bool) -> None:
        """Replace the stored embeddings and invalidate any index built from old ones."""
        self.entity_embeddings = embeddings
        self._using_mock_embeddings = mock
        self.embedding_index = None
        self._search_matrix = None
        self.entity_id_to_index = {}
        self.index_to_entity_id = {}

    def _mock_vector(self, text: str) -> np.ndarray:
        """Return a unit-length float32 pseudo-random vector derived from ``text``.

        The seed comes from a CRC32 of the UTF-8 text rather than ``hash()``,
        because string hashes are salted per interpreter process and would
        make the mock vectors differ from run to run. A private RandomState is
        used so the global NumPy random state is left untouched.
        """
        import zlib  # local import: only needed on the mock path

        seed = zlib.crc32(text.encode('utf-8'))
        rng = np.random.RandomState(seed)
        vector = rng.normal(0, 1, self.MOCK_DIMENSION).astype(np.float32)
        return vector / np.linalg.norm(vector)

    def _create_mock_embeddings(self, entities: Dict[str, Dict]) -> Dict[str, np.ndarray]:
        """Create deterministic mock embeddings (one per entity) for demonstration.

        The vector depends only on the entity's ``'text'``, so a query with
        exactly the same text maps to the same vector in mock mode.
        """
        dimension = self.MOCK_DIMENSION
        embeddings = {
            entity_id: self._mock_vector(entity_data['text'])
            for entity_id, entity_data in entities.items()
        }

        self._store_embeddings(embeddings, mock=True)
        print(f"‚úÖ Created {len(embeddings)} mock embeddings with dimension {dimension}")
        return embeddings

    def build_faiss_index(self) -> bool:
        """Build the similarity index over the stored entity embeddings.

        Vectors are copied into a contiguous float32 matrix (the dtype FAISS
        works with) and L2-normalised. A ``faiss.IndexFlatIP`` is created when
        ``faiss`` is available; otherwise the normalised matrix alone is kept
        and searched with NumPy.

        Returns:
            True when search is ready, False when there are no embeddings or
            building failed.
        """
        if not self.entity_embeddings:
            print("‚ùå No embeddings available to index")
            return False

        try:
            entity_ids = list(self.entity_embeddings)
            # np.vstack copies, so normalising in place leaves the stored
            # embeddings untouched.
            matrix = np.ascontiguousarray(
                np.vstack([self.entity_embeddings[eid] for eid in entity_ids]),
                dtype=np.float32,
            )
            norms = np.linalg.norm(matrix, axis=1, keepdims=True)
            norms[norms == 0] = 1.0  # leave all-zero rows as they are
            matrix /= norms

            # faiss is an optional import in this file; look it up defensively.
            faiss_module = globals().get('faiss')
            index = None
            if faiss_module is not None:
                index = faiss_module.IndexFlatIP(matrix.shape[1])
                index.add(matrix)
        except Exception as e:
            print(f"‚ùå Error building FAISS index: {e}")
            return False

        self.embedding_index = index
        self._search_matrix = matrix
        self.entity_id_to_index = {entity_id: i for i, entity_id in enumerate(entity_ids)}
        self.index_to_entity_id = {i: entity_id for i, entity_id in enumerate(entity_ids)}

        if index is not None:
            print(f"‚úÖ FAISS index built with {len(entity_ids)} entities")
        else:
            print(f"Similarity index built with {len(entity_ids)} entities (NumPy fallback, faiss unavailable)")
        return True

    def _embed_query(self, query_text: str) -> np.ndarray:
        """Return the query as a normalised float32 array of shape (1, dim).

        The real model is used only when the stored entity embeddings also
        came from it; otherwise the query is mapped with the mock scheme so
        both sides live in the same vector space.
        """
        if self.model_loaded and not self._using_mock_embeddings:
            vector = self.embedding_model.encode([query_text], convert_to_numpy=True)
        else:
            vector = self._mock_vector(query_text)

        query = np.ascontiguousarray(vector, dtype=np.float32).reshape(1, -1)
        norm = np.linalg.norm(query)
        if norm > 0:
            query = query / norm
        return query

    def semantic_search(self, query_text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return the entities most similar to ``query_text``.

        The index is built on demand when it does not exist yet.

        Args:
            query_text: Free-text query.
            top_k: Maximum number of results.

        Returns:
            Dicts with ``entity_id``, ``similarity_score`` and 1-based
            ``rank``, best match first. Empty when ``top_k`` is not positive,
            when no index can be built, or when the search fails.
        """
        if top_k <= 0:
            return []

        if self._search_matrix is None:
            if not self.build_faiss_index():
                return []

        try:
            query_embedding = self._embed_query(query_text)

            if self.embedding_index is not None:
                scores, indices = self.embedding_index.search(query_embedding, top_k)
                ranked = list(zip(scores[0], indices[0]))
            else:
                similarities = self._search_matrix @ query_embedding[0]
                order = np.argsort(-similarities, kind='stable')[:top_k]
                ranked = [(similarities[i], i) for i in order]

            results = []
            for position, (score, idx) in enumerate(ranked, 1):
                # FAISS pads with index -1 when fewer than top_k vectors exist.
                entity_id = self.index_to_entity_id.get(int(idx))
                if entity_id is None:
                    continue
                results.append({
                    'entity_id': entity_id,
                    'similarity_score': float(score),
                    'rank': position
                })

            return results
        except Exception as e:
            print(f"‚ùå Error in semantic search: {e}")
            return []

# Embed every entity of the graph, then index the vectors for similarity search
entity_embedder = GraphEntityEmbedder()
embeddings = entity_embedder.create_entity_embeddings(entities=kg_data['entities'])
entity_embedder.build_faiss_index()
print("‚úÖ Entity embeddings ready")

**# PART 3: HYBRID GRAPH-VECTOR RETRIEVAL**

In [None]:
class HybridGraphVectorRetriever:
    """Combine semantic entity search with graph structure.

    Semantic hits are expanded with their direct graph neighbours, every
    candidate is scored by a weighted mix of semantic similarity and a simple
    degree-based centrality, and the best candidates can be enriched with
    connecting paths, a surrounding subgraph and the documents mentioning them.
    """

    # Factor applied (together with the edge confidence) when a semantic score
    # is propagated from a hit to one of its neighbours.
    NEIGHBOR_DECAY = 0.7

    # Annotations are quoted so the class does not require the two collaborator
    # classes to be defined at class-creation time.
    def __init__(self, graph_retriever: "GraphTraversalRetriever", embedder: "GraphEntityEmbedder"):
        """Keep references to the graph retriever and the embedder.

        Args:
            graph_retriever: Provides ``entities``, ``kg_data``,
                ``get_direct_neighbors``, ``find_multi_hop_paths`` and
                ``get_entity_subgraph``.
            embedder: Provides ``semantic_search``.
        """
        self.graph_retriever = graph_retriever
        self.embedder = embedder
        self.entities = graph_retriever.entities
        self.documents = graph_retriever.kg_data.get('documents', {})

    @staticmethod
    def _keep_best(scores: Dict[str, float], entity_id: str, candidate: float) -> None:
        """Record ``candidate`` for ``entity_id`` unless a higher score is already stored."""
        if entity_id not in scores or scores[entity_id] < candidate:
            scores[entity_id] = candidate

    def hybrid_search(self, query: str, top_k: int = 10, graph_weight: float = 0.5) -> List[Dict[str, Any]]:
        """Rank entities by a mix of semantic similarity and graph centrality.

        Args:
            query: Free-text query.
            top_k: Maximum number of results; twice as many semantic hits are
                requested before graph expansion.
            graph_weight: Weight of the centrality score; the semantic score
                gets ``1 - graph_weight``.

        Returns:
            Dicts with ``entity_id``, ``entity_text``, ``entity_type``,
            ``semantic_score``, ``centrality_score`` and ``hybrid_score``,
            sorted by descending hybrid score.
        """
        # Step 1: Semantic search
        semantic_results = self.embedder.semantic_search(query, top_k=top_k * 2)

        # Neighbour lists are needed for both expansion and centrality; fetch
        # each one only once.
        neighbor_cache = {}

        def neighbors_of(entity_id):
            if entity_id not in neighbor_cache:
                neighbor_cache[entity_id] = self.graph_retriever.get_direct_neighbors(entity_id)['neighbors']
            return neighbor_cache[entity_id]

        # Step 2: Expand with graph neighbors.
        # The dict doubles as an insertion-ordered candidate set, which keeps
        # the order of equally scored results deterministic. Every entity ends
        # up with the best of its own score and any score propagated to it,
        # independent of the order in which hits are processed.
        semantic_scores = {}
        for result in semantic_results:
            entity_id = result['entity_id']
            direct_score = result['similarity_score']
            self._keep_best(semantic_scores, entity_id, direct_score)

            for neighbor in neighbors_of(entity_id):
                propagated = direct_score * neighbor['confidence'] * self.NEIGHBOR_DECAY
                self._keep_best(semantic_scores, neighbor['neighbor_id'], propagated)

        # Step 3: Calculate combined scores
        hybrid_results = []
        for entity_id, semantic_score in semantic_scores.items():
            entity_data = self.entities.get(entity_id, {})

            # Degree-based centrality, saturating at 10 neighbours
            centrality_score = min(len(neighbors_of(entity_id)) / 10.0, 1.0)

            hybrid_score = (graph_weight * centrality_score) + ((1 - graph_weight) * semantic_score)

            hybrid_results.append({
                'entity_id': entity_id,
                'entity_text': entity_data.get('text', 'Unknown'),
                'entity_type': entity_data.get('type', 'Unknown'),
                'semantic_score': semantic_score,
                'centrality_score': centrality_score,
                'hybrid_score': hybrid_score
            })

        hybrid_results.sort(key=lambda x: x['hybrid_score'], reverse=True)
        return hybrid_results[:top_k]

    def _collect_documents(self, top_entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Return each document mentioning a top entity once, best-ranked match first.

        ``matching_entity`` is the text of the highest-ranked matching entity;
        ``matching_entities`` lists all matching entity texts in rank order.
        Content is cut to 200 characters, with ``"..."`` appended only when
        something was actually cut.
        """
        collected = {}
        for entity in top_entities:
            entity_id = entity['entity_id']
            for doc_id, doc_data in self.documents.items():
                if entity_id not in doc_data.get('entities', []):
                    continue

                entry = collected.get(doc_id)
                if entry is None:
                    content = doc_data['content']
                    snippet = content[:200] + "..." if len(content) > 200 else content
                    collected[doc_id] = {
                        'document_id': doc_id,
                        'title': doc_data['title'],
                        'content': snippet,
                        'matching_entity': entity['entity_text'],
                        'matching_entities': [entity['entity_text']]
                    }
                elif entity['entity_text'] not in entry['matching_entities']:
                    entry['matching_entities'].append(entity['entity_text'])

        return list(collected.values())

    def retrieve_context_with_paths(self, query: str, max_entities: int = 5) -> Dict[str, Any]:
        """Retrieve top entities plus paths, subgraph and documents around them.

        Paths are searched between each pair of the three best entities. The
        path finder is called in rank order first and in the opposite
        direction when that finds nothing, so a connection is reported
        regardless of which end happens to rank higher.

        Args:
            query: Free-text query.
            max_entities: Number of entities to retrieve via ``hybrid_search``.

        Returns:
            A dict with ``'entities'``, ``'reasoning_paths'``, ``'documents'``
            and ``'subgraph'``. When no entity is found, the lists are empty
            and ``'subgraph'`` is an empty dict.
        """
        # Get top entities from hybrid search
        top_entities = self.hybrid_search(query, top_k=max_entities)

        if not top_entities:
            return {
                'entities': top_entities,
                'reasoning_paths': [],
                'documents': [],
                'subgraph': {}
            }

        # Find reasoning paths between top entities (needs at least two;
        # with a single entity the pair loop simply has nothing to do).
        reasoning_paths = []
        entity_ids = [e['entity_id'] for e in top_entities[:3]]

        for i, start_id in enumerate(entity_ids):
            for end_id in entity_ids[i + 1:]:
                paths = (
                    self.graph_retriever.find_multi_hop_paths(start_id, end_id, max_hops=2)
                    or self.graph_retriever.find_multi_hop_paths(end_id, start_id, max_hops=2)
                )
                reasoning_paths.extend(paths[:2])

        # Extract relevant subgraph
        subgraph = self.graph_retriever.get_entity_subgraph(entity_ids, max_depth=1)

        return {
            'entities': top_entities,
            'reasoning_paths': reasoning_paths,
            'documents': self._collect_documents(top_entities),
            'subgraph': subgraph
        }

# Wire the traversal helper and the embedder into the hybrid retriever
hybrid_retriever = HybridGraphVectorRetriever(
    graph_retriever=traversal_retriever,
    embedder=entity_embedder,
)
print("‚úÖ Hybrid graph-vector retriever initialized")

**# PART 4: QUERY PROCESSING PIPELINE**

In [None]:
class GraphRAGQueryProcessor:
    """Turn a natural-language query into structured context for an LLM.

    Retrieval is delegated to the hybrid retriever; this class summarises the
    result and renders it as plain text.
    """

    # Annotation is quoted so the class does not require the retriever class
    # to be defined at class-creation time.
    def __init__(self, hybrid_retriever: "HybridGraphVectorRetriever"):
        """Keep a reference to the retriever used by ``process_query``."""
        self.hybrid_retriever = hybrid_retriever

    @staticmethod
    def _unique_documents(documents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Drop repeated entries for the same document, keeping the first one.

        The retriever may report a document once per matching entity; counting
        or printing those repeats would overstate how many documents were
        found. Documents are identified by ``'document_id'``, falling back to
        ``'title'`` when no id is present.
        """
        unique = {}
        for doc in documents:
            unique.setdefault(doc.get('document_id', doc.get('title')), doc)
        return list(unique.values())

    def process_query(self, query: str, max_context_entities: int = 5) -> Dict[str, Any]:
        """Retrieve context for ``query`` and package it for downstream use.

        Args:
            query: Natural-language query.
            max_context_entities: Number of entities requested from the retriever.

        Returns:
            A dict with ``original_query``, ``retrieved_entities``,
            ``reasoning_paths``, ``relevant_documents`` (one entry per
            document), ``context_summary`` and ``formatted_context``.
        """
        print(f"üîç Processing query: '{query}'")

        # Retrieve relevant context
        retrieved = self.hybrid_retriever.retrieve_context_with_paths(query, max_context_entities)

        # Work on a shallow copy so the retriever's result is not modified.
        context = dict(retrieved, documents=self._unique_documents(retrieved['documents']))

        # Build context summary
        context_summary = self._build_context_summary(context)

        # Prepare for LLM generation
        formatted_context = self._format_context_for_llm(context, query)

        return {
            'original_query': query,
            'retrieved_entities': context['entities'],
            'reasoning_paths': context['reasoning_paths'],
            'relevant_documents': context['documents'],
            'context_summary': context_summary,
            'formatted_context': formatted_context
        }

    def _build_context_summary(self, context: Dict) -> str:
        """Build a one-line, human-readable summary of the retrieved context.

        Mentions up to three entity texts, the number of reasoning paths and
        the number of distinct documents; sections without content are left out.
        """
        summary_parts = []

        if context['entities']:
            entity_texts = [e['entity_text'] for e in context['entities'][:3]]
            summary_parts.append(f"Key entities: {', '.join(entity_texts)}")

        if context['reasoning_paths']:
            path_count = len(context['reasoning_paths'])
            summary_parts.append(f"Found {path_count} reasoning paths connecting entities")

        documents = self._unique_documents(context['documents'])
        if documents:
            doc_count = len(documents)
            summary_parts.append(f"Referenced in {doc_count} documents")

        return ". ".join(summary_parts) + "." if summary_parts else "No relevant context found."

    def _format_context_for_llm(self, context: Dict, query: str) -> str:
        """Render the retrieved context as plain text sections.

        Emits up to five entities, three reasoning paths and three distinct
        documents. ``query`` is accepted for interface compatibility but is
        not part of the rendered text.
        """
        formatted_parts = []

        if context['entities']:
            formatted_parts.append("=== RELEVANT ENTITIES ===")
            for entity in context['entities'][:5]:
                formatted_parts.append(f"- {entity['entity_text']} ({entity['entity_type']})")
                formatted_parts.append(f"  Relevance score: {entity['hybrid_score']:.3f}")

        if context['reasoning_paths']:
            formatted_parts.append("\n=== REASONING PATHS ===")
            for i, path in enumerate(context['reasoning_paths'][:3], 1):
                formatted_parts.append(f"{i}. {' -> '.join(path['node_path'])}")
                formatted_parts.append(f"   Relationships: {' -> '.join(path['rel_path'])}")
                formatted_parts.append(f"   Confidence: {path['path_confidence']:.3f}")

        documents = self._unique_documents(context['documents'])
        if documents:
            formatted_parts.append("\n=== RELEVANT DOCUMENTS ===")
            for doc in documents[:3]:
                formatted_parts.append(f"Title: {doc['title']}")
                formatted_parts.append(f"Content: {doc['content']}")
                formatted_parts.append(f"Relevant entity: {doc['matching_entity']}")
                formatted_parts.append("")

        return "\n".join(formatted_parts)

# Put the query-processing pipeline on top of the hybrid retriever
query_processor = GraphRAGQueryProcessor(hybrid_retriever=hybrid_retriever)
print("‚úÖ Graph RAG query processor initialized")

**# DEMONSTRATIONS**

In [7]:
def demonstrate_graph_traversal():
    """Print a walkthrough of entity lookup, neighbour listing and path search.

    Uses the module-level ``traversal_retriever``; produces console output only.
    """
    print("\n=== GRAPH TRAVERSAL DEMONSTRATIONS ===")

    # Part 1: look up an entity by text and list what it is connected to
    print("\n1. Finding entities and their direct neighbors:")
    for term in ("Transformer", "attention", "BERT"):
        print(f"\n   Searching for: '{term}'")
        matches = traversal_retriever.find_entity_by_text(term)

        if not matches:
            print(f"   No entities found for '{term}'")
            continue

        best_id = matches[0]
        neighborhood = traversal_retriever.get_direct_neighbors(best_id, max_neighbors=5)

        print(f"   Found entity: {neighborhood['entity_text']} (ID: {best_id})")
        print("   Direct neighbors:")
        for item in neighborhood['neighbors']:
            print(f"     ‚Ä¢ {item['neighbor_text']} via {item['relationship']} (conf: {item['confidence']:.2f})")

    # Part 2: search for paths between fixed pairs of entity ids
    print("\n\n2. Multi-hop reasoning paths:")
    path_queries = (
        ("concept_2", "concept_1", "BERT to attention mechanisms"),
        ("person_0", "metric_0", "Ashish Vaswani to BLEU metric"),
    )

    for origin, destination, label in path_queries:
        print(f"\n   Path query: {label}")
        found = traversal_retriever.find_multi_hop_paths(origin, destination, max_hops=3)

        if not found:
            print("     No paths found")
            continue

        # Show at most the two best paths
        for number, route in enumerate(found[:2], start=1):
            print(f"     Path {number}: {' -> '.join(route['node_path'])}")
            print(f"       Relations: {' -> '.join(route['rel_path'])}")
            print(f"       Confidence: {route['path_confidence']:.3f}, Length: {route['path_length']}")

def demonstrate_hybrid_retrieval():
    """Print hybrid search results, paths and documents for a few sample queries.

    Uses the module-level ``hybrid_retriever``; produces console output only.
    """
    print("\n=== HYBRID GRAPH-VECTOR RETRIEVAL DEMONSTRATIONS ===")

    divider = "-" * 50
    sample_queries = (
        "neural network architecture for language understanding",
        "attention mechanism evaluation metrics",
        "transformer model performance",
    )

    for query in sample_queries:
        print(f"\nQuery: '{query}'")
        print(divider)

        # Ranked entities from the combined semantic + graph score
        ranked = hybrid_retriever.hybrid_search(query, top_k=5)

        print("Top entities (hybrid search):")
        for position, hit in enumerate(ranked, start=1):
            print(f"  {position}. {hit['entity_text']} ({hit['entity_type']})")
            print(f"     Semantic: {hit['semantic_score']:.3f}, Centrality: {hit['centrality_score']:.3f}, Hybrid: {hit['hybrid_score']:.3f}")

        # Same query again, this time with paths and documents attached
        context = hybrid_retriever.retrieve_context_with_paths(query, max_entities=3)

        found_paths = context['reasoning_paths']
        if found_paths:
            print("\nReasoning paths:")
            for number, route in enumerate(found_paths[:2], start=1):
                print(f"  Path {number}: {' -> '.join(route['node_path'])}")
                print(f"    Relations: {' -> '.join(route['rel_path'])}")

        found_docs = context['documents']
        if found_docs:
            print("\nRelevant documents:")
            for entry in found_docs[:2]:
                print(f"  ‚Ä¢ {entry['title']} (via {entry['matching_entity']})")

def demonstrate_query_processing():
    """Run a few sample questions through the full pipeline and print the results.

    Uses the module-level ``query_processor``; produces console output only.
    """
    print("\n=== COMPLETE QUERY PROCESSING DEMONSTRATIONS ===")

    banner = "=" * 60
    sample_questions = (
        "How does the Transformer architecture relate to BERT?",
        "What evaluation metrics are used for machine translation models?",
        "Who introduced the attention mechanism and how is it used?",
    )

    for question in sample_questions:
        print(f"\nProcessing: '{question}'")
        print(banner)

        outcome = query_processor.process_query(question, max_context_entities=4)

        print(f"Context Summary: {outcome['context_summary']}")
        print()

        print("Retrieved Entities:")
        for hit in outcome['retrieved_entities'][:3]:
            print(f"  ‚Ä¢ {hit['entity_text']} ({hit['entity_type']}) - Score: {hit['hybrid_score']:.3f}")
        print()

        routes = outcome['reasoning_paths']
        if routes:
            print("Key Reasoning Paths:")
            for number, route in enumerate(routes[:2], start=1):
                print(f"  {number}. {' ‚Üí '.join(route['node_path'])}")
                print(f"     Via: {' ‚Üí '.join(route['rel_path'])}")
        # The blank line is printed whether or not any paths were listed
        print()

        mentions = outcome['relevant_documents']
        if mentions:
            print("Relevant Documents:")
            for entry in mentions[:2]:
                print(f"  ‚Ä¢ {entry['title']} (mentions {entry['matching_entity']})")

**# PERFORMANCE ANALYSIS**

In [8]:
def analyze_retrieval_performance():
    """Compare how many entities each retrieval approach returns per query.

    For every sample query this counts the entities from semantic search
    alone, from the top semantic hit plus its direct neighbours, and from
    hybrid search, then prints the averages. Uses the module-level
    ``entity_embedder``, ``traversal_retriever`` and ``hybrid_retriever``.
    """
    print("\n=== RETRIEVAL PERFORMANCE ANALYSIS ===")

    sample_queries = (
        "transformer attention mechanism",
        "BERT language model",
        "machine translation evaluation",
        "neural network architecture",
        "deep learning research",
    )

    rows = []
    for query in sample_queries:
        print(f"Analyzing query: '{query}'")

        # Semantic search on its own
        semantic_ids = [hit['entity_id'] for hit in entity_embedder.semantic_search(query, top_k=5)]

        # Best semantic hit together with its direct graph neighbours
        graph_ids = set()
        if semantic_ids:
            anchor = semantic_ids[0]
            neighborhood = traversal_retriever.get_direct_neighbors(anchor)
            graph_ids = {anchor} | {item['neighbor_id'] for item in neighborhood['neighbors']}

        # Combined approach
        hybrid_ids = [hit['entity_id'] for hit in hybrid_retriever.hybrid_search(query, top_k=5)]

        # Distinct entities seen by any of the three approaches
        union = set(semantic_ids) | graph_ids | set(hybrid_ids)

        rows.append({
            'query': query,
            'semantic_count': len(semantic_ids),
            'graph_count': len(graph_ids),
            'hybrid_count': len(hybrid_ids),
            'total_unique': len(union),
        })

    # Report the per-approach averages
    print("\nPerformance Summary:")
    print("-" * 50)

    df = pd.DataFrame(rows)

    print("Average entities retrieved:")
    report_lines = (
        ("Semantic-only", 'semantic_count'),
        ("Graph-expanded", 'graph_count'),
        ("Hybrid approach", 'hybrid_count'),
        ("Total unique coverage", 'total_unique'),
    )
    for label, column in report_lines:
        print(f"  {label}: {df[column].mean():.1f}")

**# VISUALIZATION**

In [9]:
def visualize_retrieval_results(query: str):
    """Draw the entities retrieved for ``query`` as a small graph and print counts.

    The context is fetched with
    ``hybrid_retriever.retrieve_context_with_paths(query, max_entities=5)``.
    Each retrieved entity becomes a node, coloured by its ``entity_type`` and
    sized by its ``hybrid_score``. Relationships listed under
    ``context['subgraph']['relationships']`` become edges, but only when both
    endpoints are among the retrieved entities.

    Args:
        query: Free-text query to retrieve and visualise.

    Returns:
        None. A matplotlib figure is shown and a short summary is printed.
    """
    print(f"\n=== VISUALIZING RETRIEVAL FOR: '{query}' ===")

    # Get retrieval results
    context = hybrid_retriever.retrieve_context_with_paths(query, max_entities=5)

    # Entity type -> fill colour; anything else falls back to light gray.
    type_colors = {
        'CONCEPT': 'lightblue',
        'PERSON': 'lightgreen',
        'METRIC': 'orange',
        'DATASET': 'pink',
    }
    fallback_color = 'lightgray'

    # Create NetworkX graph for visualization
    G = nx.Graph()

    for entity in context['entities']:
        G.add_node(entity['entity_id'],
                   text=entity['entity_text'],
                   type=entity['entity_type'],
                   score=entity['hybrid_score'])

    # Derive colours and sizes from the graph itself rather than from the raw
    # entity list: add_node() collapses repeated ids, so per-entity lists could
    # end up longer than the node set and no longer line up with the nodes
    # being drawn.
    node_colors = []
    node_sizes = []
    for _, attrs in G.nodes(data=True):
        node_colors.append(type_colors.get(attrs['type'], fallback_color))
        # Size by relevance score
        node_sizes.append(300 + attrs['score'] * 1000)

    # Add edges from subgraph (only between nodes that were retrieved, so the
    # node set and the colour/size lists above stay unchanged)
    if 'subgraph' in context and context['subgraph']:
        for rel in context['subgraph'].get('relationships', []):
            source_id = rel['source']
            target_id = rel['target']

            if source_id in G.nodes and target_id in G.nodes:
                G.add_edge(source_id, target_id,
                           relationship=rel['relationship'],
                           confidence=rel['confidence'])

    # Create visualization
    plt.figure(figsize=(12, 8))

    # Use spring layout for positioning
    pos = nx.spring_layout(G, k=2, iterations=50)

    # Draw the graph; labels are added separately so they can be truncated
    nx.draw(G, pos,
            node_color=node_colors,
            node_size=node_sizes,
            font_size=8,
            font_weight='bold',
            edge_color='gray',
            width=2,
            alpha=0.7,
            with_labels=False)

    # Add custom labels, truncated to 15 characters to keep the plot readable
    labels = {}
    for node_id, attrs in G.nodes(data=True):
        text = attrs['text']
        labels[node_id] = text[:15] + ("..." if len(text) > 15 else "")

    nx.draw_networkx_labels(G, pos, labels, font_size=8)

    # Add title and legend
    plt.title(f"Graph Retrieval Results for: '{query}'", size=14, weight='bold')

    # Create legend
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor='lightblue', label='Concepts'),
        Patch(facecolor='lightgreen', label='People'),
        Patch(facecolor='orange', label='Metrics'),
        Patch(facecolor='pink', label='Datasets')
    ]
    # Only explain the fallback colour when some node actually uses it.
    if fallback_color in node_colors:
        legend_elements.append(Patch(facecolor=fallback_color, label='Other'))
    plt.legend(handles=legend_elements, loc='upper right')

    plt.axis('off')
    plt.tight_layout()
    plt.show()

    # Print detailed results
    print("\nDetailed Results:")
    print(f"Retrieved {len(context['entities'])} entities")
    print(f"Found {len(context['reasoning_paths'])} reasoning paths")
    print(f"Connected to {len(context['documents'])} documents")

**# EXPORT AND INTEGRATION**

In [10]:
def export_retrieval_results(output_file: str = "graph_retrieval_results.json"):
    """Run a few sample queries and write a JSON summary for downstream use.

    The exported document contains:

    * ``retrieval_capabilities`` - a static set of feature flags,
    * ``knowledge_graph_stats`` - entity/relationship counts and the distinct
      type labels found in ``kg_data``,
    * ``sample_queries`` - per-query result counts, the context summary and
      the top three entities returned by ``query_processor.process_query``,
    * ``performance_metrics`` - per-query averages of the above counts.

    Args:
        output_file: Path of the JSON file to write (overwritten if present).

    Returns:
        None. Progress and a short summary are printed.
    """
    print(f"📤 Exporting retrieval results to {output_file}")

    test_queries = [
        "transformer neural network architecture",
        "BERT language understanding model",
        "attention mechanism evaluation metrics"
    ]

    # Sorted so the exported file is stable between runs (set iteration order
    # is not); key=str keeps this safe if a type label is not a string.
    entity_types = sorted({e['type'] for e in kg_data['entities'].values()}, key=str)
    relationship_types = sorted({r['type'] for r in kg_data['relationships']}, key=str)

    export_data = {
        'retrieval_capabilities': {
            'graph_traversal': True,
            'semantic_search': True,
            'hybrid_retrieval': True,
            'multi_hop_reasoning': True,
            'subgraph_extraction': True
        },
        'knowledge_graph_stats': {
            'total_entities': len(kg_data['entities']),
            'total_relationships': len(kg_data['relationships']),
            'entity_types': entity_types,
            'relationship_types': relationship_types
        },
        'sample_queries': {}
    }

    total_entities = 0
    total_paths = 0
    total_docs = 0

    for query in test_queries:
        print(f"   Processing: {query}")

        result = query_processor.process_query(query, max_context_entities=5)
        entities = result['retrieved_entities']
        paths = result['reasoning_paths']
        documents = result['relevant_documents']

        export_data['sample_queries'][query] = {
            'retrieved_entities': len(entities),
            'reasoning_paths': len(paths),
            'relevant_documents': len(documents),
            'context_summary': result['context_summary'],
            'top_entities': [
                {
                    'text': e['entity_text'],
                    'type': e['entity_type'],
                    # Coerce to a built-in float: the json encoder does not
                    # natively handle e.g. NumPy float32 (presumably what the
                    # retriever emits - confirm), and default=str below would
                    # otherwise write the score as a string.
                    'score': float(e['hybrid_score'])
                } for e in entities[:3]
            ]
        }

        total_entities += len(entities)
        total_paths += len(paths)
        total_docs += len(documents)

    # Calculate averages
    num_queries = len(test_queries)
    export_data['performance_metrics'] = {
        'avg_entities_per_query': total_entities / num_queries,
        'avg_paths_per_query': total_paths / num_queries,
        'avg_documents_per_query': total_docs / num_queries
    }

    # Save to file. default=str is kept as a last-resort fallback for any
    # other value the encoder cannot handle (e.g. inside context_summary).
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, indent=2, default=str)

    metrics = export_data['performance_metrics']
    print("✅ Exported retrieval results and capabilities")
    print(f"   Queries processed: {num_queries}")
    print(f"   Average entities per query: {metrics['avg_entities_per_query']:.1f}")
    print(f"   Average reasoning paths per query: {metrics['avg_paths_per_query']:.1f}")


**# RUN ALL DEMONSTRATIONS**

In [None]:
# Banner for the end-to-end run of everything defined above.
# (The emoji below was mis-encoded in the original string literal.)
print("\n" + "="*80)
print("🚀 RUNNING ALL DEMONSTRATIONS")
print("="*80)

# Run demonstrations
demonstrate_graph_traversal()
demonstrate_hybrid_retrieval()
demonstrate_query_processing()
analyze_retrieval_performance()

# Visualize results for a sample query
visualize_retrieval_results("transformer attention mechanism evaluation")

# Export results (writes graph_retrieval_results.json by default)
export_retrieval_results()

**# SUMMARY AND NEXT STEPS**

In [None]:
def chapter_summary():
    """Print a recap of the notebook: what was built, key metrics, next steps.

    Output only; nothing is returned. The entity and relationship counts are
    read from the notebook-level ``kg_data``; the embedding dimension and
    max-hop figures are fixed text.
    """
    def print_bullets(items):
        # Every section uses the same indented bullet format.
        for item in items:
            print(f"   • {item}")

    print("\n" + "="*60)
    print("📋 CHAPTER 12.4 SUMMARY - Graph-Enhanced Retrieval")
    print("="*60)

    print("\n✅ What you've implemented:")
    print_bullets([
        "Graph traversal for multi-hop reasoning",
        "Entity-based semantic search with embeddings",
        "Hybrid graph-vector retrieval combining structure and semantics",
        "Subgraph extraction for focused context",
        "Complete query processing pipeline",
        "Performance analysis and optimization techniques",
        "Visualization of retrieval results"
    ])

    print("\n📊 Key Metrics:")
    print_bullets([
        f"Knowledge graph entities: {len(kg_data['entities'])}",
        f"Knowledge graph relationships: {len(kg_data['relationships'])}",
        "Entity embedding dimension: 384",
        "Max reasoning hops: 3"
    ])

    print("\n🎯 Key Capabilities Demonstrated:")
    print_bullets([
        "Multi-hop path finding between entities",
        "Semantic similarity search over graph entities",
        "Hybrid scoring combining graph structure and semantics",
        "Context-aware subgraph extraction",
        "Reasoning path explanation and visualization"
    ])

    print("\n🚀 Ready for Next Steps:")
    print_bullets([
        "Integration with LLMs for natural language generation (Notebook 12.5)",
        "End-to-end Graph RAG system implementation",
        "Advanced reasoning patterns and query optimization",
        "Production deployment and scaling considerations"
    ])

    print("\n💡 Key Insights:")
    print_bullets([
        "Graph structure provides explainable reasoning paths",
        "Hybrid retrieval outperforms single-method approaches",
        "Entity embeddings enable semantic understanding of graph content",
        "Multi-hop reasoning reveals non-obvious connections",
        "Visualization helps debug and understand retrieval quality"
    ])

def prepare_for_next_notebook():
    """Print a component checklist and write the hand-off configuration file.

    The checklist entries are descriptive text only (this function does not
    inspect the components themselves), apart from the knowledge-graph line,
    which reports counts from the notebook-level ``kg_data``. The settings are
    written as JSON to ``graph_retrieval_config.json`` in the current working
    directory, overwriting any existing file. Nothing is returned.
    """
    print("\n📋 PREPARING FOR NOTEBOOK 12.5")
    print("="*40)

    # Human-readable status lines for the next notebook's reader
    graph_rag_components = {
        'traversal_retriever': 'GraphTraversalRetriever initialized',
        'entity_embedder': 'GraphEntityEmbedder with FAISS index',
        'hybrid_retriever': 'HybridGraphVectorRetriever ready',
        'query_processor': 'GraphRAGQueryProcessor configured',
        'knowledge_graph': f"{len(kg_data['entities'])} entities, {len(kg_data['relationships'])} relationships"
    }

    print("✅ Components ready for Graph RAG integration:")
    for component, status in graph_rag_components.items():
        print(f"   • {component}: {status}")

    # Save configuration for next notebook
    config = {
        'embedding_model': 'all-MiniLM-L6-v2',
        'max_hops': 3,
        'max_entities': 5,
        'graph_weight': 0.5,
        'top_k_default': 10
    }

    # Explicit encoding so the file is written the same way on every platform.
    with open('graph_retrieval_config.json', 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

    print("\n💾 Configuration saved to 'graph_retrieval_config.json'")
    print("🔄 All components initialized and ready for Graph RAG system integration")

# Close out the notebook: print the recap first, then write the hand-off config.
for _closing_step in (chapter_summary, prepare_for_next_notebook):
    _closing_step()


**# TROUBLESHOOTING GUIDE**

In [None]:
# Print a short troubleshooting guide: each entry pairs a common problem with
# its suggested remedy. (Emoji in these strings were mis-encoded originally.)
print("\n" + "="*60)
print("🔧 TROUBLESHOOTING GUIDE")
print("="*60)

troubleshooting_tips = [
    {
        "issue": "FAISS installation errors",
        "solution": "Use 'pip install faiss-cpu' instead of 'faiss-gpu' in Colab"
    },
    {
        "issue": "Sentence Transformers slow loading",
        "solution": "First run may be slow due to model download, subsequent runs faster"
    },
    {
        "issue": "Memory issues with large graphs",
        "solution": "Reduce max_entities, max_hops, or process in smaller batches"
    },
    {
        "issue": "Low retrieval quality",
        "solution": "Adjust graph_weight parameter, try different embedding models"
    },
    {
        "issue": "Visualization not displaying",
        "solution": "Ensure matplotlib is installed and try plt.show() explicitly"
    }
]

# One problem/remedy pair per tip, separated by a blank line.
for tip in troubleshooting_tips:
    print(f"❌ {tip['issue']}")
    print(f"✅ {tip['solution']}\n")

print("📚 For more help, refer to the chapter text and previous notebooks.")
print("🎯 Ready to proceed to Notebook 12.5: Complete Graph RAG System!")

print("\n🎉 Graph-Enhanced Retrieval Implementation Complete!")
print("Continue to Notebook 12.5 to build the complete Graph RAG system.")