In [None]:
from PubMedDownloader import PubMedEntrezDownloader
from datetime import datetime
import os
from dotenv import load_dotenv
load_dotenv()

async def main():
    downloader = PubMedEntrezDownloader("olandechris@gmail.com")
    pmids = await downloader.search_pubmed("", max_results=10000)
    articles = await downloader.fetch_article_details(pmids)
    downloader.save_to_json(articles, "results.json")

await main()

In [None]:
from document_processor import DocumentProcessor
# NOTE(review): `documents` is not assigned in this cell or in any cell above it;
# it is first created by the later cell that loads pmc_chunks.json. On a fresh
# kernel run top-to-bottom this line therefore raises NameError.
docs = DocumentProcessor()
docs.get_stats(documents)

In [None]:
import asyncio
from tqdm.asyncio import tqdm_asyncio
import logging
from batchprocessor import PMCBatchProcessor
from document_processor import DocumentProcessor

# Root logging at INFO level; `logger` is the named logger used by main() below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("MainScript")

async def main():
    doc_processor = DocumentProcessor(embeddings_model="BAAI/bge-small-en-v1.5")

    batch_processor = PMCBatchProcessor(
        document_processor=doc_processor,
        batch_size=96,
        max_concurrent_batches=3,
        retry_attempts=2,
        retry_delay=1.0,
        inter_batch_delay=0.1
    )

    file_path = "research20250605_002659.json" 
    output_directory = "output/processed_pmc_data"

    logger.info(f"Starting batch processing of {file_path}")

    try:
        processing_results = await batch_processor.process_pmc_file_async(
            file_path=file_path
        )

        batch_processor.save_results(processing_results, output_directory, save_batch_details=True)

        s = processing_results['processing_summary']
        logger.info(f"Processing complete: {s['total_documents']:,} docs → {s['total_chunks']:,} chunks ({s['processing_time']:.1f}s)")
        logger.info(f"Success rate: {s['success_rate']:.1f}%")

    except Exception as e:
        logger.error(f"An error occurred during batch processing: {e}", exc_info=True)


if __name__ == "__main__":
    await main()

In [None]:
import json
from pathlib import Path
from langchain.schema import Document

# Read the exported chunk file and parse it as JSON.
# NOTE(review): the batch-processing cell saves to "output/processed_pmc_data" while
# this path starts with "../" — confirm both resolve to the same directory.
data_path = Path("../output/processed_pmc_data/pmc_chunks.json")
data = json.loads(data_path.read_text(encoding="utf-8"))

# Create documents: one Document per entry under the "documents" key
documents = []
for entry in data["documents"]:
    documents.append(Document(page_content=entry["content"], metadata=entry["metadata"]))

In [None]:
import sys
from pathlib import Path

# Add project root (the parent of the current working directory) to the Python path.
project_root = Path.cwd().parent
root_entry = str(project_root)
# Drop any copies left by earlier runs of this cell before inserting, so that
# re-running the cell keeps exactly one entry, still at the front of sys.path.
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

In [None]:
from src.nlp.vectorstore import VectorStore

# VectorStore built with no arguments; it is handed to QueryEngine in a later cell.
vector_store = VectorStore()

In [None]:
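# Optional step, disabled by default: uncomment to call the vector store's index-creation helper on `documents`.
# await vector_store._create_vector_index(documents)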

In [None]:
from langchain_openai import ChatOpenAI
import os
# Chat model reached through the OpenRouter base URL.
# The key comes from the OPENROUTER_API_KEY environment variable (None when unset).
llm = ChatOpenAI(
    model="meta-llama/llama-3.3-70b-instruct",
    api_key=os.getenv("OPENROUTER_API_KEY"),
    openai_api_base="https://openrouter.ai/api/v1",
    temperature=0,
    streaming=False,
)

In [None]:
from src.knowledge_graph.knowledge_graph import KnowledgeGraph
# Knowledge graph builder; cache files go under ../my_cache.
kg = KnowledgeGraph(
    cache_dir = "../my_cache",
    batch_size=100,  # Process 100 documents at a time (with spaCy, per the original note — TODO confirm)
    # max_concurrent_llm_calls=10
)

# Build the knowledge graph from the loaded `documents` using `llm`;
# the result `graph` is reused by the visualization and centrality cells.
graph = await kg.build_knowledge_graph(documents, llm)

In [None]:
from src.nlp.rag_chain import QueryEngine
# Query engine combining the vector store, the knowledge graph and the LLM.
engine = QueryEngine(vector_store = vector_store, knowledge_graph = kg, llm = llm)
# vector_store.retrieve_relevant_documents("What is the effect of Gaza war on Children?", filter_threshold = 0.5)

In [None]:
# Natural-language question passed to engine.query() in the next cell.
query = "what are the effects of the Gaza war on children?"

In [None]:

# engine.query() is awaited and its result unpacked into three values.
# NOTE: `filtered_content` is rebound later by the prototype-graph cell.
response, traversal_path, filtered_content = await engine.query(query)

In [None]:
from src.knowledge_graph.graph_viz import GraphVisualizer
# Visualizer used in the next cell to draw the traversal path over `graph`.
visualizer = GraphVisualizer()

In [None]:
# Render the path returned by engine.query() on the knowledge graph built earlier.
await visualizer.visualize_traversal_async(graph, traversal_path)

In [None]:
import networkx as nx
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
# from graph_viz import GraphVisualizer
import json
import random

# Configure logging
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers. An earlier cell in this notebook also calls basicConfig(), so the
# format below only takes effect if this cell configures logging first.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Rebinds the `logger` name created by the batch-processing cell.
logger = logging.getLogger(__name__)

@dataclass
class NodeMetadata:
    """Metadata record stored on each graph node under the ``metadata`` key.

    The first four fields are required and positional, in the order declared;
    ``dependencies`` and ``node_type`` are optional.
    """
    processing_time_ms: float  # presumably milliseconds, per the field name
    confidence_score: float  # averaged per traversal path in the scenario loop
    error_rate: float
    throughput_capacity: int
    # Node ids; default_factory gives every instance its own empty list
    dependencies: List[int] = field(default_factory=list)
    # Free-form label; collected into the `node_types` set of the summary
    node_type: str = "standard"

# 1. Create a comprehensive knowledge processing graph
logger.info("Building complex knowledge processing graph...")

# Define complex edge relationships with multiple attributes.
# Each entry is (u, v, attribute dict); the graph is undirected.
edges = [
    # Primary knowledge flow path
    (1, 2, {'weight': 0.85, 'flow_type': 'primary', 'latency_ms': 15, 'bandwidth': 100}),
    (2, 3, {'weight': 0.72, 'flow_type': 'primary', 'latency_ms': 25, 'bandwidth': 95}),
    (3, 4, {'weight': 0.91, 'flow_type': 'primary', 'latency_ms': 35, 'bandwidth': 90}),
    (4, 8, {'weight': 0.68, 'flow_type': 'primary', 'latency_ms': 45, 'bandwidth': 85}),

    # Secondary processing paths
    (1, 5, {'weight': 0.76, 'flow_type': 'secondary', 'latency_ms': 20, 'bandwidth': 80}),
    (5, 6, {'weight': 0.58, 'flow_type': 'secondary', 'latency_ms': 30, 'bandwidth': 75}),
    (6, 7, {'weight': 0.83, 'flow_type': 'secondary', 'latency_ms': 40, 'bandwidth': 70}),
    (7, 8, {'weight': 0.94, 'flow_type': 'secondary', 'latency_ms': 50, 'bandwidth': 65}),

    # Cross-connections and feedback loops
    (2, 5, {'weight': 0.64, 'flow_type': 'cross_connect', 'latency_ms': 12, 'bandwidth': 60}),
    (3, 6, {'weight': 0.79, 'flow_type': 'cross_connect', 'latency_ms': 18, 'bandwidth': 55}),
    (4, 7, {'weight': 0.66, 'flow_type': 'cross_connect', 'latency_ms': 22, 'bandwidth': 50}),

    # Specialized processing branches
    (4, 9, {'weight': 0.43, 'flow_type': 'fallback', 'latency_ms': 60, 'bandwidth': 40}),
    (8, 10, {'weight': 0.87, 'flow_type': 'primary', 'latency_ms': 28, 'bandwidth': 85}),
    (10, 11, {'weight': 0.92, 'flow_type': 'primary', 'latency_ms': 32, 'bandwidth': 90}),
    (11, 12, {'weight': 0.96, 'flow_type': 'primary', 'latency_ms': 38, 'bandwidth': 95}),

    # Alternative and backup paths
    (9, 12, {'weight': 0.71, 'flow_type': 'fallback', 'latency_ms': 55, 'bandwidth': 45}),
    (6, 10, {'weight': 0.54, 'flow_type': 'shortcut', 'latency_ms': 25, 'bandwidth': 35}),
    (7, 11, {'weight': 0.61, 'flow_type': 'shortcut', 'latency_ms': 30, 'bandwidth': 40}),

    # Advanced processing nodes
    (12, 13, {'weight': 0.88, 'flow_type': 'post_process', 'latency_ms': 20, 'bandwidth': 80}),
    (13, 14, {'weight': 0.82, 'flow_type': 'post_process', 'latency_ms': 15, 'bandwidth': 75}),
    (14, 15, {'weight': 0.95, 'flow_type': 'output', 'latency_ms': 10, 'bandwidth': 100}),

    # Quality assurance and validation
    (11, 16, {'weight': 0.77, 'flow_type': 'validation', 'latency_ms': 35, 'bandwidth': 60}),
    (16, 13, {'weight': 0.84, 'flow_type': 'validation', 'latency_ms': 25, 'bandwidth': 65}),

    # Feedback and learning loops
    (15, 17, {'weight': 0.69, 'flow_type': 'feedback', 'latency_ms': 45, 'bandwidth': 30}),
    (17, 1, {'weight': 0.33, 'flow_type': 'feedback', 'latency_ms': 80, 'bandwidth': 20}),
]

# Add all edges to the graph in one call; 3-tuples carry their attribute dict
G = nx.Graph()
G.add_edges_from(edges)

logger.info(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# 2. Define comprehensive concept mappings with hierarchical structure
# Keys are node ids (1-17). Each value holds three lists of labels
# ('primary', 'secondary', 'technical') and a NodeMetadata whose positional
# arguments are, in order: processing_time_ms, confidence_score, error_rate,
# throughput_capacity, dependencies, node_type.
concepts = {
    1: {
        'primary': ["User Input Handler", "Request Router", "Entry Point"],
        'secondary': ["Authentication", "Rate Limiting", "Input Validation"],
        'technical': ["HTTP Parser", "JSON Decoder", "Request Sanitizer"],
        'metadata': NodeMetadata(15.2, 0.98, 0.02, 1000, [], "input_handler")
    },
    2: {
        'primary': ["Intent Classification", "NLU Engine", "Semantic Parser"],
        'secondary': ["Language Detection", "Sentiment Analysis", "Toxicity Filter"],
        'technical': ["BERT Embeddings", "Classification Head", "Attention Mechanism"],
        'metadata': NodeMetadata(45.8, 0.92, 0.08, 500, [1], "ml_processor")
    },
    3: {
        'primary': ["Named Entity Recognition", "Entity Linking", "Concept Extraction"],
        'secondary': ["Coreference Resolution", "Temporal Extraction", "Relation Extraction"],
        'technical': ["BiLSTM-CRF", "Knowledge Graph Lookup", "Fuzzy Matching"],
        'metadata': NodeMetadata(38.4, 0.89, 0.11, 400, [1, 2], "nlp_processor")
    },
    4: {
        'primary': ["Context Understanding", "Semantic Reasoning", "Knowledge Integration"],
        'secondary': ["Inference Engine", "Logical Reasoning", "Contradiction Detection"],
        'technical': ["Graph Neural Networks", "Transformer Architecture", "Attention Pooling"],
        'metadata': NodeMetadata(125.6, 0.85, 0.15, 200, [2, 3], "reasoning_engine")
    },
    5: {
        'primary': ["Query Expansion", "Synonym Generation", "Concept Broadening"],
        'secondary': ["Word Sense Disambiguation", "Morphological Analysis", "Lemmatization"],
        'technical': ["Word2Vec", "GloVe Embeddings", "Semantic Similarity"],
        'metadata': NodeMetadata(28.9, 0.94, 0.06, 600, [1], "text_processor")
    },
    6: {
        'primary': ["Context Aggregation", "Multi-source Fusion", "Information Synthesis"],
        'secondary': ["Conflict Resolution", "Source Weighting", "Confidence Scoring"],
        'technical': ["Ensemble Methods", "Weighted Voting", "Bayesian Fusion"],
        'metadata': NodeMetadata(67.3, 0.91, 0.09, 300, [3, 5], "aggregator")
    },
    7: {
        'primary': ["Memory Access", "Knowledge Retrieval", "Historical Context"],
        'secondary': ["Cache Management", "Index Optimization", "Query Planning"],
        'technical': ["Vector Database", "Approximate Nearest Neighbor", "Inverted Index"],
        'metadata': NodeMetadata(52.1, 0.96, 0.04, 800, [6], "memory_system")
    },
    8: {
        'primary': ["Embedding Generation", "Semantic Representation", "Vector Space Mapping"],
        'secondary': ["Dimensionality Reduction", "Feature Selection", "Normalization"],
        'technical': ["Sentence-BERT", "Universal Sentence Encoder", "Contrastive Learning"],
        'metadata': NodeMetadata(89.7, 0.87, 0.13, 250, [4, 7], "embedding_engine")
    },
    9: {
        'primary': ["Error Recovery", "Fallback Processing", "Graceful Degradation"],
        'secondary': ["Circuit Breaker", "Retry Logic", "Default Responses"],
        'technical': ["Rule-based System", "Template Matching", "Statistical Fallback"],
        'metadata': NodeMetadata(12.4, 0.75, 0.25, 1500, [4], "fallback_handler")
    },
    10: {
        'primary': ["Content Summarization", "Information Distillation", "Key Point Extraction"],
        'secondary': ["Abstractive Summary", "Extractive Summary", "Multi-document Summary"],
        'technical': ["T5 Model", "BART", "Pointer-Generator Network"],
        'metadata': NodeMetadata(156.8, 0.83, 0.17, 150, [8], "summarizer")
    },
    11: {
        'primary': ["Response Generation", "Natural Language Generation", "Content Creation"],
        'secondary': ["Style Transfer", "Tone Adjustment", "Personalization"],
        'technical': ["GPT Architecture", "Beam Search", "Nucleus Sampling"],
        'metadata': NodeMetadata(234.5, 0.88, 0.12, 100, [10], "generator")
    },
    12: {
        'primary': ["Output Formatting", "Response Structuring", "Final Assembly"],
        'secondary': ["Template Application", "Markup Generation", "Media Embedding"],
        'technical': ["JSON Serialization", "HTML Generation", "Content Negotiation"],
        'metadata': NodeMetadata(18.7, 0.97, 0.03, 900, [11], "formatter")
    },
    13: {
        'primary': ["Quality Assessment", "Content Validation", "Fact Checking"],
        'secondary': ["Hallucination Detection", "Bias Detection", "Toxicity Screening"],
        'technical': ["Discriminator Networks", "Fact Verification API", "Bias Metrics"],
        'metadata': NodeMetadata(76.3, 0.86, 0.14, 350, [12, 16], "quality_checker")
    },
    14: {
        'primary': ["Post-processing", "Content Enhancement", "Final Polishing"],
        'secondary': ["Grammar Correction", "Style Optimization", "Readability Enhancement"],
        'technical': ["Language Tool", "Style Transfer Models", "Readability Metrics"],
        'metadata': NodeMetadata(42.9, 0.93, 0.07, 450, [13], "post_processor")
    },
    15: {
        'primary': ["Response Delivery", "Output Gateway", "User Interface"],
        'secondary': ["Response Caching", "CDN Distribution", "Rate Limiting"],
        'technical': ["HTTP Response", "WebSocket", "Server-Sent Events"],
        'metadata': NodeMetadata(8.1, 0.99, 0.01, 2000, [14], "output_handler")
    },
    16: {
        'primary': ["Validation Engine", "Compliance Checker", "Safety Filter"],
        'secondary': ["Policy Enforcement", "Content Moderation", "Ethical Guidelines"],
        'technical': ["Rule Engine", "ML Classifiers", "Blacklist Filtering"],
        'metadata': NodeMetadata(58.4, 0.95, 0.05, 400, [11], "validator")
    },
    17: {
        'primary': ["Feedback Collector", "Performance Monitor", "Learning System"],
        'secondary': ["Usage Analytics", "Error Tracking", "Model Evaluation"],
        'technical': ["Telemetry", "A/B Testing", "Online Learning"],
        'metadata': NodeMetadata(25.6, 0.90, 0.10, 700, [15], "feedback_system")
    }
}

# Apply concepts and metadata to nodes: copy each concepts entry onto the
# attribute dict of the graph node with the same id.
logger.info("Applying node concepts and metadata...")
for node_id, node_data in concepts.items():
    G.nodes[node_id].update(
        concepts=node_data['primary'],
        secondary_concepts=node_data['secondary'],
        technical_details=node_data['technical'],
        metadata=node_data['metadata'],
    )

# 3. Define multiple traversal paths for different scenarios
# Each value is an ordered list of node ids visited in that scenario.
# NOTE(review): not every consecutive pair is an edge in `edges`:
#   - 'fast_track' ends with 12 -> 15, which is not defined;
#   - 'fallback_flow' contains 2 -> 4 and 12 -> 15, which are not defined.
# The latency sum in the scenario loop leaves such hops out of the total.
traversal_paths = {
    'primary_flow': [1, 2, 3, 4, 8, 10, 11, 12, 13, 14, 15],
    'fast_track': [1, 5, 6, 10, 11, 12, 15],
    'fallback_flow': [1, 2, 4, 9, 12, 15],
    'validation_heavy': [1, 2, 3, 4, 8, 10, 11, 16, 13, 14, 15],
    'feedback_loop': [1, 2, 3, 4, 8, 10, 11, 12, 13, 14, 15, 17, 1]
}

# 4. Initialize visualizer and perform analysis
logger.info("Initializing graph visualizer...")
# Import the class here: the import at the top of this cell is commented out,
# so without this line the cell only works if an earlier cell imported it.
from src.knowledge_graph.graph_viz import GraphVisualizer
visualizer = GraphVisualizer()

# 5. Simulate comprehensive filtered content for each path
# Maps node id (1-17) to a hand-written, multi-line text that the scenario loop
# prints for sampled nodes.
# NOTE: this rebinds `filtered_content`, which an earlier cell assigns from the
# result of engine.query().
filtered_content = {
    1: """Entry Point Analysis:
    - Received user query: "What's the weather like in Nairobi today?"
    - Request authenticated and validated
    - Input sanitized and normalized
    - Routing to NLU pipeline initiated""",
    
    2: """Intent Classification Results:
    - Primary intent: WEATHER_QUERY (confidence: 0.96)
    - Secondary intents: LOCATION_QUERY (0.34), TIME_QUERY (0.28)
    - Language detected: English (confidence: 0.99)
    - Sentiment: Neutral (0.02)
    - No toxicity detected""",
    
    3: """Named Entity Recognition:
    - Location: "Nairobi" (CITY, confidence: 0.94)
    - Time: "today" (DATE, confidence: 0.87)
    - Linked entities: Nairobi -> Q3870 (Wikidata)
    - Geographical coordinates: -1.2921°, 36.8219°
    - Timezone: EAT (UTC+3)""",
    
    4: """Semantic Reasoning Engine:
    - Context: Weather information request for specific location and time
    - Temporal resolution: Current date (2025-06-17)
    - Spatial resolution: Nairobi metropolitan area
    - Required data sources: Weather API, Location services
    - Confidence in understanding: 0.91""",
    
    5: """Query Expansion Results:
    - Synonyms: ["climate", "atmospheric conditions", "meteorological data"]
    - Related terms: ["temperature", "humidity", "precipitation", "forecast"]
    - Location expansions: ["Nairobi Kenya", "Nairobi East Africa"]
    - Temporal expansions: ["current weather", "today's forecast"]""",
    
    6: """Context Aggregation:
    - Combined NER results with intent classification
    - Integrated temporal and spatial constraints
    - Merged query expansions with original query
    - Confidence weighted fusion applied
    - Final context score: 0.89""",
    
    7: """Memory System Access:
    - Cache lookup for recent weather queries: HIT
    - Historical weather patterns for Nairobi: Retrieved
    - User preference data: No specific weather preferences found
    - Previous similar queries: 3 matches in last 24h
    - Memory access latency: 52ms""",
    
    8: """Embedding Generation:
    - Query embedding: 768-dimensional vector generated
    - Semantic similarity to cached queries: 0.87
    - Weather pattern embeddings: Computed for seasonal context
    - Location embeddings: Enhanced with geographical features
    - Embedding quality score: 0.91""",
    
    9: """Fallback Processing (if triggered):
    - Weather API unavailable fallback activated
    - Default response template selected
    - Historical weather averages retrieved
    - Graceful degradation message prepared
    - Fallback confidence: 0.75""",
    
    10: """Content Summarization:
    - Weather data sources identified and ranked
    - Key information extracted: Temperature, conditions, forecast
    - Redundant information filtered out
    - Summary coherence score: 0.88
    - Information density optimized""",
    
    11: """Response Generation:
    - Natural language response crafted
    - Personalization applied based on location
    - Conversational tone maintained
    - Technical details simplified for general audience
    - Generation quality score: 0.92""",
    
    12: """Output Formatting:
    - Response structured in user-friendly format
    - Weather icons and formatting applied
    - Metadata added for rich display
    - Mobile-responsive formatting applied
    - Format validation: PASSED""",
    
    13: """Quality Assessment:
    - Fact-checking weather data sources: VERIFIED
    - Hallucination detection: No issues found
    - Bias assessment: Geographic bias minimal
    - Content safety: APPROVED
    - Overall quality score: 0.89""",
    
    14: """Post-processing Enhancement:
    - Grammar and style optimization applied
    - Readability score: 8.2/10 (Good)
    - Tone consistency maintained
    - Cultural appropriateness verified
    - Final polish: COMPLETE""",
    
    15: """Response Delivery:
    - HTTP response prepared (200 OK)
    - Content-Type: application/json
    - Response size: 1.2KB
    - Cache headers set for 15 minutes
    - Delivery latency: 8ms""",
    
    16: """Validation Results:
    - Policy compliance: PASSED
    - Content moderation: APPROVED
    - Safety guidelines: COMPLIANT
    - Ethical review: No concerns
    - Validation confidence: 0.95""",
    
    17: """Feedback Collection:
    - User interaction logged
    - Performance metrics recorded
    - Model evaluation data captured
    - A/B test variant: Control group
    - Feedback loop: ACTIVE"""
}

# 6. Demonstrate different traversal scenarios
# For every named path: print a banner, compute latency/confidence metrics,
# draw the traversal, and print the simulated content for a sample of nodes.
for path_name, path in traversal_paths.items():
    logger.info(f"Analyzing {path_name} traversal path...")
    print(f"\n{'='*50}")
    print(f"TRAVERSAL SCENARIO: {path_name.upper()}")
    print(f"{'='*50}")
    print(f"Path: {' -> '.join(map(str, path))}")

    # Calculate path metrics.
    # Consecutive node pairs are the hops of the path; a hop with no matching
    # edge in G has no latency to add. Such hops used to be dropped silently,
    # which understated the total, so they are now reported explicitly.
    hops = list(zip(path, path[1:]))
    missing_hops = [(u, v) for u, v in hops if not G.has_edge(u, v)]
    total_latency = sum(G[u][v]['latency_ms'] for u, v in hops if G.has_edge(u, v))
    avg_confidence = sum(concepts[node]['metadata'].confidence_score for node in path) / len(path)

    print(f"Total latency: {total_latency}ms")
    if missing_hops:
        missing_text = ', '.join(f"{u} -> {v}" for u, v in missing_hops)
        logger.warning(f"Path {path_name} contains hops that are not edges in the graph: {missing_text}")
        print(f"Hops without an edge (excluded from latency): {missing_text}")
    print(f"Average confidence: {avg_confidence:.3f}")
    print(f"Path length: {len(path)} nodes")

    # Visualize the traversal
    print(f"\nVisualizing {path_name} traversal...")
    visualizer.visualize_traversal(G, path)

    # Print detailed content for key nodes in path
    print(f"\nDetailed content for {path_name}:")
    # Slice step is len(path)//5 (at least 1): this is not "every 5th node" but
    # an even sample of the path — all nodes for short paths, 5 to 9 otherwise.
    key_nodes = path[::max(1, len(path)//5)]
    for node in key_nodes:
        if node in filtered_content:
            print(f"\n--- NODE {node}: {concepts[node]['primary'][0]} ---")
            print(filtered_content[node])

# 7. Export graph analysis results
# Collect summary statistics of G into a dict, then print one line per entry.
logger.info("Exporting graph analysis results...")
graph_stats = {}
graph_stats['total_nodes'] = G.number_of_nodes()
graph_stats['total_edges'] = G.number_of_edges()
graph_stats['average_degree'] = sum(dict(G.degree()).values()) / G.number_of_nodes()
# Connectivity is checked once and reused: diameter is only defined for a connected graph
connected = nx.is_connected(G)
graph_stats['is_connected'] = connected
graph_stats['diameter'] = nx.diameter(G) if connected else 'N/A'
graph_stats['clustering_coefficient'] = nx.average_clustering(G)
graph_stats['node_types'] = {node_data['metadata'].node_type for node_data in concepts.values()}

banner = '=' * 60
print(f"\n{banner}")
print("GRAPH ANALYSIS SUMMARY")
print(banner)
for key, value in graph_stats.items():
    print(f"{key.replace('_', ' ').title()}: {value}")

logger.info("Complex graph prototype analysis complete!")

In [None]:
# Degree centrality for every node of the knowledge graph built earlier.
degree_centrality = nx.degree_centrality(graph)
# Rank (node, score) pairs from highest to lowest score and keep the first 200 nodes.
ranked_nodes = sorted(degree_centrality.items(), key=lambda pair: pair[1], reverse=True)
sampled_nodes = [node for node, _score in ranked_nodes[:200]]

In [None]:
# Displays the full node -> centrality mapping as the cell output (can be large).
degree_centrality