In [None]:
from PubMedDownloader import PubMedEntrezDownloader
from datetime import datetime
import os
from dotenv import load_dotenv
load_dotenv()

async def main():
    downloader = PubMedEntrezDownloader("olandechris@gmail.com")
    pmids = await downloader.search_pubmed("", max_results=10000)
    articles = await downloader.fetch_article_details(pmids)
    downloader.save_to_json(articles, "results.json")

await main()

In [None]:
from document_processor import DocumentProcessor
docs = DocumentProcessor()
docs.get_stats(documents)

In [None]:
import asyncio
from tqdm.asyncio import tqdm_asyncio
import logging
from batchprocessor import PMCBatchProcessor
from document_processor import DocumentProcessor

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("MainScript")

async def main():
    doc_processor = DocumentProcessor(embeddings_model="BAAI/bge-small-en-v1.5")

    batch_processor = PMCBatchProcessor(
        document_processor=doc_processor,
        batch_size=96,
        max_concurrent_batches=3,
        retry_attempts=2,
        retry_delay=1.0,
        inter_batch_delay=0.1
    )

    file_path = "../data/research20250605_002659.json" 
    output_directory = "../data/output/processed_pmc_data"

    logger.info(f"Starting batch processing of {file_path}")

    try:
        processing_results = await batch_processor.process_pmc_file_async(
            file_path=file_path
        )

        batch_processor.save_results(processing_results, output_directory, save_batch_details=True)

        s = processing_results['processing_summary']
        logger.info(f"Processing complete: {s['total_documents']:,} docs → {s['total_chunks']:,} chunks ({s['processing_time']:.1f}s)")
        logger.info(f"Success rate: {s['success_rate']:.1f}%")

    except Exception as e:
        logger.error(f"An error occurred during batch processing: {e}", exc_info=True)


if __name__ == "__main__":
    await main()

In [4]:
import json
from pathlib import Path
from langchain.schema import Document

data_path = Path("../data/output/processed_pmc_data/pmc_chunks.json")
with data_path.open(encoding="utf-8") as f:
    data = json.load(f)

# Create documents
documents = [
    Document(page_content=doc["content"], metadata=doc["metadata"])
    for doc in data["documents"]
]

In [5]:
import sys
from pathlib import Path

# Add project root to Python path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

In [6]:
from src.nlp.vectorstore import VectorStore

vector_store = VectorStore()

2025-06-26 12:00:12,361 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
2025-06-26 12:00:12,363 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-06-26 12:01:04,218 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
2025-06-26 12:01:04,220 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-06-26 12:01:04,409 - src.core.utils - INFO - Semantic cache initialized and set globally.
2025-06-26 12:01:05,156 - src.nlp.vectorstore - INFO - Attempting to load index from ../faiss_index...
2025-06-26 12:01:05,163 - faiss.loader - INFO - Loading faiss.
2025-06-26 12:01:05,276 - faiss.loader - INFO - Successfully loaded faiss.
2025-06-26 12:01:05,294 - faiss - INFO - Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU i

In [4]:
# await vector_store._create_vector_index(documents)

In [7]:
from langchain_openai import ChatOpenAI
import os
llm = ChatOpenAI(model="meta-llama/llama-3.3-70b-instruct",
                                api_key=os.getenv("OPENROUTER_API_KEY"),
                                openai_api_base="https://openrouter.ai/api/v1",
                                temperature=0,
                                streaming=False)

In [8]:
from src.knowledge_graph.knowledge_graph import KnowledgeGraph
kg = KnowledgeGraph(
    cache_dir = "../my_cache",
    batch_size=100,  # Process 50 documents at a time with spaCy
    # max_concurrent_llm_calls=10
)

# Build the knowledge graph
graph = await kg.build_knowledge_graph(documents, llm)

2025-06-26 12:01:12,410 - src.core.common_helpers - INFO - CacheManager initialized. Cache directory: ../my_cache
2025-06-26 12:01:15,801 - src.core.common_helpers - INFO - Cache loaded from ../my_cache/cache.pkl.
The default value will be changed to `edges="edges" in NetworkX 3.6.


  nx.node_link_graph(data, edges="links") to preserve current behavior, or
  nx.node_link_graph(data, edges="edges") for forward compatibility.


Loaded existing graph with 17330 nodes and 1815 edges
Knowledge graph already exists with 17330 nodes and 1815 edges


In [9]:
from src.nlp.rag_chain import QueryEngine
engine = QueryEngine(vector_store = vector_store, knowledge_graph = kg, llm = llm)
# vector_store.retrieve_relevant_documents("What is the effect of Gaza war on Children?", filter_threshold = 0.5)

2025-06-26 12:01:16,220 - src.nlp.rag_chain - INFO - QueryEngine initialized.


In [None]:
query = "what are the effects of the Gaza war on children?"

In [None]:

response, traversal_path, filtered_content = await engine.query(query)

In [None]:
from src.knowledge_graph.graph_viz import GraphVisualizer
visualizer = GraphVisualizer()

In [None]:
await visualizer.visualize_traversal_async(graph, traversal_path)

In [None]:
import networkx as nx
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
# from graph_viz import GraphVisualizer
import json
import random

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class NodeMetadata:
    """Enhanced metadata for each node."""
    processing_time_ms: float
    confidence_score: float
    error_rate: float
    throughput_capacity: int
    dependencies: List[int] = field(default_factory=list)
    node_type: str = "standard"

# 1. Create a comprehensive knowledge processing graph
logger.info("Building complex knowledge processing graph...")
G = nx.Graph()

# Define complex edge relationships with multiple attributes
edges = [
    # Primary knowledge flow path
    (1, 2, {'weight': 0.85, 'flow_type': 'primary', 'latency_ms': 15, 'bandwidth': 100}),
    (2, 3, {'weight': 0.72, 'flow_type': 'primary', 'latency_ms': 25, 'bandwidth': 95}),
    (3, 4, {'weight': 0.91, 'flow_type': 'primary', 'latency_ms': 35, 'bandwidth': 90}),
    (4, 8, {'weight': 0.68, 'flow_type': 'primary', 'latency_ms': 45, 'bandwidth': 85}),
    
    # Secondary processing paths
    (1, 5, {'weight': 0.76, 'flow_type': 'secondary', 'latency_ms': 20, 'bandwidth': 80}),
    (5, 6, {'weight': 0.58, 'flow_type': 'secondary', 'latency_ms': 30, 'bandwidth': 75}),
    (6, 7, {'weight': 0.83, 'flow_type': 'secondary', 'latency_ms': 40, 'bandwidth': 70}),
    (7, 8, {'weight': 0.94, 'flow_type': 'secondary', 'latency_ms': 50, 'bandwidth': 65}),
    
    # Cross-connections and feedback loops
    (2, 5, {'weight': 0.64, 'flow_type': 'cross_connect', 'latency_ms': 12, 'bandwidth': 60}),
    (3, 6, {'weight': 0.79, 'flow_type': 'cross_connect', 'latency_ms': 18, 'bandwidth': 55}),
    (4, 7, {'weight': 0.66, 'flow_type': 'cross_connect', 'latency_ms': 22, 'bandwidth': 50}),
    
    # Specialized processing branches
    (4, 9, {'weight': 0.43, 'flow_type': 'fallback', 'latency_ms': 60, 'bandwidth': 40}),
    (8, 10, {'weight': 0.87, 'flow_type': 'primary', 'latency_ms': 28, 'bandwidth': 85}),
    (10, 11, {'weight': 0.92, 'flow_type': 'primary', 'latency_ms': 32, 'bandwidth': 90}),
    (11, 12, {'weight': 0.96, 'flow_type': 'primary', 'latency_ms': 38, 'bandwidth': 95}),
    
    # Alternative and backup paths
    (9, 12, {'weight': 0.71, 'flow_type': 'fallback', 'latency_ms': 55, 'bandwidth': 45}),
    (6, 10, {'weight': 0.54, 'flow_type': 'shortcut', 'latency_ms': 25, 'bandwidth': 35}),
    (7, 11, {'weight': 0.61, 'flow_type': 'shortcut', 'latency_ms': 30, 'bandwidth': 40}),
    
    # Advanced processing nodes
    (12, 13, {'weight': 0.88, 'flow_type': 'post_process', 'latency_ms': 20, 'bandwidth': 80}),
    (13, 14, {'weight': 0.82, 'flow_type': 'post_process', 'latency_ms': 15, 'bandwidth': 75}),
    (14, 15, {'weight': 0.95, 'flow_type': 'output', 'latency_ms': 10, 'bandwidth': 100}),
    
    # Quality assurance and validation
    (11, 16, {'weight': 0.77, 'flow_type': 'validation', 'latency_ms': 35, 'bandwidth': 60}),
    (16, 13, {'weight': 0.84, 'flow_type': 'validation', 'latency_ms': 25, 'bandwidth': 65}),
    
    # Feedback and learning loops
    (15, 17, {'weight': 0.69, 'flow_type': 'feedback', 'latency_ms': 45, 'bandwidth': 30}),
    (17, 1, {'weight': 0.33, 'flow_type': 'feedback', 'latency_ms': 80, 'bandwidth': 20}),
]

# Add all edges to graph
for u, v, attrs in edges:
    G.add_edge(u, v, **attrs)

logger.info(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# 2. Define comprehensive concept mappings with hierarchical structure
concepts = {
    1: {
        'primary': ["User Input Handler", "Request Router", "Entry Point"],
        'secondary': ["Authentication", "Rate Limiting", "Input Validation"],
        'technical': ["HTTP Parser", "JSON Decoder", "Request Sanitizer"],
        'metadata': NodeMetadata(15.2, 0.98, 0.02, 1000, [], "input_handler")
    },
    2: {
        'primary': ["Intent Classification", "NLU Engine", "Semantic Parser"],
        'secondary': ["Language Detection", "Sentiment Analysis", "Toxicity Filter"],
        'technical': ["BERT Embeddings", "Classification Head", "Attention Mechanism"],
        'metadata': NodeMetadata(45.8, 0.92, 0.08, 500, [1], "ml_processor")
    },
    3: {
        'primary': ["Named Entity Recognition", "Entity Linking", "Concept Extraction"],
        'secondary': ["Coreference Resolution", "Temporal Extraction", "Relation Extraction"],
        'technical': ["BiLSTM-CRF", "Knowledge Graph Lookup", "Fuzzy Matching"],
        'metadata': NodeMetadata(38.4, 0.89, 0.11, 400, [1, 2], "nlp_processor")
    },
    4: {
        'primary': ["Context Understanding", "Semantic Reasoning", "Knowledge Integration"],
        'secondary': ["Inference Engine", "Logical Reasoning", "Contradiction Detection"],
        'technical': ["Graph Neural Networks", "Transformer Architecture", "Attention Pooling"],
        'metadata': NodeMetadata(125.6, 0.85, 0.15, 200, [2, 3], "reasoning_engine")
    },
    5: {
        'primary': ["Query Expansion", "Synonym Generation", "Concept Broadening"],
        'secondary': ["Word Sense Disambiguation", "Morphological Analysis", "Lemmatization"],
        'technical': ["Word2Vec", "GloVe Embeddings", "Semantic Similarity"],
        'metadata': NodeMetadata(28.9, 0.94, 0.06, 600, [1], "text_processor")
    },
    6: {
        'primary': ["Context Aggregation", "Multi-source Fusion", "Information Synthesis"],
        'secondary': ["Conflict Resolution", "Source Weighting", "Confidence Scoring"],
        'technical': ["Ensemble Methods", "Weighted Voting", "Bayesian Fusion"],
        'metadata': NodeMetadata(67.3, 0.91, 0.09, 300, [3, 5], "aggregator")
    },
    7: {
        'primary': ["Memory Access", "Knowledge Retrieval", "Historical Context"],
        'secondary': ["Cache Management", "Index Optimization", "Query Planning"],
        'technical': ["Vector Database", "Approximate Nearest Neighbor", "Inverted Index"],
        'metadata': NodeMetadata(52.1, 0.96, 0.04, 800, [6], "memory_system")
    },
    8: {
        'primary': ["Embedding Generation", "Semantic Representation", "Vector Space Mapping"],
        'secondary': ["Dimensionality Reduction", "Feature Selection", "Normalization"],
        'technical': ["Sentence-BERT", "Universal Sentence Encoder", "Contrastive Learning"],
        'metadata': NodeMetadata(89.7, 0.87, 0.13, 250, [4, 7], "embedding_engine")
    },
    9: {
        'primary': ["Error Recovery", "Fallback Processing", "Graceful Degradation"],
        'secondary': ["Circuit Breaker", "Retry Logic", "Default Responses"],
        'technical': ["Rule-based System", "Template Matching", "Statistical Fallback"],
        'metadata': NodeMetadata(12.4, 0.75, 0.25, 1500, [4], "fallback_handler")
    },
    10: {
        'primary': ["Content Summarization", "Information Distillation", "Key Point Extraction"],
        'secondary': ["Abstractive Summary", "Extractive Summary", "Multi-document Summary"],
        'technical': ["T5 Model", "BART", "Pointer-Generator Network"],
        'metadata': NodeMetadata(156.8, 0.83, 0.17, 150, [8], "summarizer")
    },
    11: {
        'primary': ["Response Generation", "Natural Language Generation", "Content Creation"],
        'secondary': ["Style Transfer", "Tone Adjustment", "Personalization"],
        'technical': ["GPT Architecture", "Beam Search", "Nucleus Sampling"],
        'metadata': NodeMetadata(234.5, 0.88, 0.12, 100, [10], "generator")
    },
    12: {
        'primary': ["Output Formatting", "Response Structuring", "Final Assembly"],
        'secondary': ["Template Application", "Markup Generation", "Media Embedding"],
        'technical': ["JSON Serialization", "HTML Generation", "Content Negotiation"],
        'metadata': NodeMetadata(18.7, 0.97, 0.03, 900, [11], "formatter")
    },
    13: {
        'primary': ["Quality Assessment", "Content Validation", "Fact Checking"],
        'secondary': ["Hallucination Detection", "Bias Detection", "Toxicity Screening"],
        'technical': ["Discriminator Networks", "Fact Verification API", "Bias Metrics"],
        'metadata': NodeMetadata(76.3, 0.86, 0.14, 350, [12, 16], "quality_checker")
    },
    14: {
        'primary': ["Post-processing", "Content Enhancement", "Final Polishing"],
        'secondary': ["Grammar Correction", "Style Optimization", "Readability Enhancement"],
        'technical': ["Language Tool", "Style Transfer Models", "Readability Metrics"],
        'metadata': NodeMetadata(42.9, 0.93, 0.07, 450, [13], "post_processor")
    },
    15: {
        'primary': ["Response Delivery", "Output Gateway", "User Interface"],
        'secondary': ["Response Caching", "CDN Distribution", "Rate Limiting"],
        'technical': ["HTTP Response", "WebSocket", "Server-Sent Events"],
        'metadata': NodeMetadata(8.1, 0.99, 0.01, 2000, [14], "output_handler")
    },
    16: {
        'primary': ["Validation Engine", "Compliance Checker", "Safety Filter"],
        'secondary': ["Policy Enforcement", "Content Moderation", "Ethical Guidelines"],
        'technical': ["Rule Engine", "ML Classifiers", "Blacklist Filtering"],
        'metadata': NodeMetadata(58.4, 0.95, 0.05, 400, [11], "validator")
    },
    17: {
        'primary': ["Feedback Collector", "Performance Monitor", "Learning System"],
        'secondary': ["Usage Analytics", "Error Tracking", "Model Evaluation"],
        'technical': ["Telemetry", "A/B Testing", "Online Learning"],
        'metadata': NodeMetadata(25.6, 0.90, 0.10, 700, [15], "feedback_system")
    }
}

# Apply concepts and metadata to nodes
logger.info("Applying node concepts and metadata...")
for node_id, node_data in concepts.items():
    G.nodes[node_id]['concepts'] = node_data['primary']
    G.nodes[node_id]['secondary_concepts'] = node_data['secondary']
    G.nodes[node_id]['technical_details'] = node_data['technical']
    G.nodes[node_id]['metadata'] = node_data['metadata']

# 3. Define multiple traversal paths for different scenarios
traversal_paths = {
    'primary_flow': [1, 2, 3, 4, 8, 10, 11, 12, 13, 14, 15],
    'fast_track': [1, 5, 6, 10, 11, 12, 15],
    'fallback_flow': [1, 2, 4, 9, 12, 15],
    'validation_heavy': [1, 2, 3, 4, 8, 10, 11, 16, 13, 14, 15],
    'feedback_loop': [1, 2, 3, 4, 8, 10, 11, 12, 13, 14, 15, 17, 1]
}

# 4. Initialize visualizer and perform analysis
logger.info("Initializing graph visualizer...")
visualizer = GraphVisualizer()

# 5. Simulate comprehensive filtered content for each path
filtered_content = {
    1: """Entry Point Analysis:
    - Received user query: "What's the weather like in Nairobi today?"
    - Request authenticated and validated
    - Input sanitized and normalized
    - Routing to NLU pipeline initiated""",
    
    2: """Intent Classification Results:
    - Primary intent: WEATHER_QUERY (confidence: 0.96)
    - Secondary intents: LOCATION_QUERY (0.34), TIME_QUERY (0.28)
    - Language detected: English (confidence: 0.99)
    - Sentiment: Neutral (0.02)
    - No toxicity detected""",
    
    3: """Named Entity Recognition:
    - Location: "Nairobi" (CITY, confidence: 0.94)
    - Time: "today" (DATE, confidence: 0.87)
    - Linked entities: Nairobi -> Q3870 (Wikidata)
    - Geographical coordinates: -1.2921°, 36.8219°
    - Timezone: EAT (UTC+3)""",
    
    4: """Semantic Reasoning Engine:
    - Context: Weather information request for specific location and time
    - Temporal resolution: Current date (2025-06-17)
    - Spatial resolution: Nairobi metropolitan area
    - Required data sources: Weather API, Location services
    - Confidence in understanding: 0.91""",
    
    5: """Query Expansion Results:
    - Synonyms: ["climate", "atmospheric conditions", "meteorological data"]
    - Related terms: ["temperature", "humidity", "precipitation", "forecast"]
    - Location expansions: ["Nairobi Kenya", "Nairobi East Africa"]
    - Temporal expansions: ["current weather", "today's forecast"]""",
    
    6: """Context Aggregation:
    - Combined NER results with intent classification
    - Integrated temporal and spatial constraints
    - Merged query expansions with original query
    - Confidence weighted fusion applied
    - Final context score: 0.89""",
    
    7: """Memory System Access:
    - Cache lookup for recent weather queries: HIT
    - Historical weather patterns for Nairobi: Retrieved
    - User preference data: No specific weather preferences found
    - Previous similar queries: 3 matches in last 24h
    - Memory access latency: 52ms""",
    
    8: """Embedding Generation:
    - Query embedding: 768-dimensional vector generated
    - Semantic similarity to cached queries: 0.87
    - Weather pattern embeddings: Computed for seasonal context
    - Location embeddings: Enhanced with geographical features
    - Embedding quality score: 0.91""",
    
    9: """Fallback Processing (if triggered):
    - Weather API unavailable fallback activated
    - Default response template selected
    - Historical weather averages retrieved
    - Graceful degradation message prepared
    - Fallback confidence: 0.75""",
    
    10: """Content Summarization:
    - Weather data sources identified and ranked
    - Key information extracted: Temperature, conditions, forecast
    - Redundant information filtered out
    - Summary coherence score: 0.88
    - Information density optimized""",
    
    11: """Response Generation:
    - Natural language response crafted
    - Personalization applied based on location
    - Conversational tone maintained
    - Technical details simplified for general audience
    - Generation quality score: 0.92""",
    
    12: """Output Formatting:
    - Response structured in user-friendly format
    - Weather icons and formatting applied
    - Metadata added for rich display
    - Mobile-responsive formatting applied
    - Format validation: PASSED""",
    
    13: """Quality Assessment:
    - Fact-checking weather data sources: VERIFIED
    - Hallucination detection: No issues found
    - Bias assessment: Geographic bias minimal
    - Content safety: APPROVED
    - Overall quality score: 0.89""",
    
    14: """Post-processing Enhancement:
    - Grammar and style optimization applied
    - Readability score: 8.2/10 (Good)
    - Tone consistency maintained
    - Cultural appropriateness verified
    - Final polish: COMPLETE""",
    
    15: """Response Delivery:
    - HTTP response prepared (200 OK)
    - Content-Type: application/json
    - Response size: 1.2KB
    - Cache headers set for 15 minutes
    - Delivery latency: 8ms""",
    
    16: """Validation Results:
    - Policy compliance: PASSED
    - Content moderation: APPROVED
    - Safety guidelines: COMPLIANT
    - Ethical review: No concerns
    - Validation confidence: 0.95""",
    
    17: """Feedback Collection:
    - User interaction logged
    - Performance metrics recorded
    - Model evaluation data captured
    - A/B test variant: Control group
    - Feedback loop: ACTIVE"""
}

# 6. Demonstrate different traversal scenarios
for path_name, path in traversal_paths.items():
    logger.info(f"Analyzing {path_name} traversal path...")
    print(f"\n{'='*50}")
    print(f"TRAVERSAL SCENARIO: {path_name.upper()}")
    print(f"{'='*50}")
    print(f"Path: {' -> '.join(map(str, path))}")
    
    # Calculate path metrics
    total_latency = sum(G[path[i]][path[i+1]]['latency_ms'] for i in range(len(path)-1) if G.has_edge(path[i], path[i+1]))
    avg_confidence = sum(concepts[node]['metadata'].confidence_score for node in path) / len(path)
    
    print(f"Total latency: {total_latency}ms")
    print(f"Average confidence: {avg_confidence:.3f}")
    print(f"Path length: {len(path)} nodes")
    
    # Visualize the traversal
    print(f"\nVisualizing {path_name} traversal...")
    visualizer.visualize_traversal(G, path)
    
    # Print detailed content for key nodes in path
    print(f"\nDetailed content for {path_name}:")
    key_nodes = path[::max(1, len(path)//5)]  # Sample every 5th node or so
    for node in key_nodes:
        if node in filtered_content:
            print(f"\n--- NODE {node}: {concepts[node]['primary'][0]} ---")
            print(filtered_content[node])

# 7. Export graph analysis results
logger.info("Exporting graph analysis results...")
graph_stats = {
    'total_nodes': G.number_of_nodes(),
    'total_edges': G.number_of_edges(),
    'average_degree': sum(dict(G.degree()).values()) / G.number_of_nodes(),
    'is_connected': nx.is_connected(G),
    'diameter': nx.diameter(G) if nx.is_connected(G) else 'N/A',
    'clustering_coefficient': nx.average_clustering(G),
    'node_types': {node_data['metadata'].node_type for node_data in concepts.values()}
}

print(f"\n{'='*60}")
print("GRAPH ANALYSIS SUMMARY")
print(f"{'='*60}")
for key, value in graph_stats.items():
    print(f"{key.replace('_', ' ').title()}: {value}")

logger.info("Complex graph prototype analysis complete!")

In [None]:
degree_centrality = nx.degree_centrality(graph)
sampled_nodes = sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:200]

In [None]:
degree_centrality

In [11]:
from ragas.embeddings.base import HuggingfaceEmbeddings as RagasHuggingfaceEmbeddings

In [14]:
import asyncio
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple

import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, answer_correctness, context_recall
from langchain_huggingface import HuggingFaceEmbeddings

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

@dataclass
class EvaluationResult:
    """Results from RAGAs evaluation."""
    scores: Dict[str, float]
    detailed_results: pd.DataFrame
    avg_score: float

@dataclass
class QueryEvaluationData:
    """Data structure for single query evaluation."""
    question: str
    answer: str
    contexts: List[str]
    ground_truth: str

class RAGAsEvaluator:
    """RAGAs evaluator for QueryEngine."""
    
    def __init__(self, query_engine, llm=None):
        """Initialize evaluator with QueryEngine instance."""
        self.query_engine = query_engine
        self.llm = llm or query_engine.llm
        self.metrics = [
            answer_relevancy,
            answer_correctness,
            faithfulness,
            context_precision,
            context_recall,
        ]
        logger.info("RAGAsEvaluator initialized")

        # Initialize HuggingFace embeddings for RAGAs
        model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Upgraded model for better semantic matching
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)

    def _extract_contexts_from_filtered_content(self, filtered_content: Dict[Any, str]) -> List[str]:
        """Extract contexts from QueryEngine filtered content."""
        try:
            return [content for content in filtered_content.values() if isinstance(content, str) and content.strip()]
        except AttributeError as e:
            logger.error(f"Invalid filtered_content format: {e}")
            return []

    async def _prepare_evaluation_data(self, queries_and_ground_truths: List[Tuple[str, str]]) -> List[QueryEvaluationData]:
        """Prepare evaluation data by running queries through QueryEngine."""
        evaluation_data = []
        
        for query, ground_truth in queries_and_ground_truths:
            try:
                logger.info(f"Processing query: {query[:50]}...")
                
                # Query the engine
                answer, traversal_path, filtered_content = await self.query_engine.query(query)
                
                # Extract answer content
                answer_text = answer.content if hasattr(answer, 'content') else str(answer)
                
                # Extract contexts
                contexts = self._extract_contexts_from_filtered_content(filtered_content)
                
                evaluation_data.append(QueryEvaluationData(
                    question=query,
                    answer=answer_text,
                    contexts=contexts,
                    ground_truth=ground_truth
                ))
                
            except Exception as e:
                logger.error(f"Error processing query '{query[:50]}...': {e}")
                continue
                
        return evaluation_data

    def _create_ragas_dataset(self, evaluation_data: List[QueryEvaluationData]) -> Dataset:
        """Create RAGAs dataset from evaluation data."""
        dataset_dict = {
            "question": [item.question for item in evaluation_data],
            "answer": [item.answer for item in evaluation_data],
            "contexts": [item.contexts for item in evaluation_data],
            "ground_truth": [item.ground_truth for item in evaluation_data]
        }
        
        return Dataset.from_dict(dataset_dict)

    async def evaluate_queries(self, queries_and_ground_truths: List[Tuple[str, str]]) -> EvaluationResult:
        """Evaluate multiple queries using RAGAs metrics."""
        logger.info(f"Starting evaluation of {len(queries_and_ground_truths)} queries")
        
        # Prepare evaluation data
        evaluation_data = await self._prepare_evaluation_data(queries_and_ground_truths)
        
        if not evaluation_data:
            raise ValueError("No evaluation data prepared successfully")
            
        # Create RAGAs dataset
        dataset = self._create_ragas_dataset(evaluation_data)
        
        # Run evaluation with error handling for individual metrics
        try:
            results = evaluate(
                dataset=dataset,
                metrics=self.metrics,
                llm=self.llm,
                embeddings=self.embeddings,
                raise_exceptions=False  # Prevent RAGAs from raising exceptions for individual metric failures
            )
            
            # Process results (handle RAGAs output correctly)
            scores = {}
            for metric in self.metrics:
                try:
                    score = results[metric.name] if hasattr(metric, 'name') else results.get(metric, 0.0)
                    if isinstance(score, (int, float)):
                        scores[metric.name] = score
                except Exception as e:
                    logger.warning(f"Failed to extract score for metric {metric.name}: {e}")
                    scores[metric.name] = 0.0
            
            avg_score = sum(scores.values()) / len(scores) if scores else 0.0
            
            # Create detailed DataFrame
            try:
                detailed_df = pd.DataFrame({
                    'question': dataset['question'],
                    'answer': dataset['answer'],
                    'contexts': dataset['contexts'],
                    'ground_truth': dataset['ground_truth'],
                    **{metric.name: results.get(metric.name, [0.0] * len(dataset)) for metric in self.metrics}
                })
            except Exception as e:
                logger.error(f"Failed to create detailed DataFrame: {e}")
                detailed_df = pd.DataFrame()
            
            logger.info(f"Evaluation completed. Average score: {avg_score:.3f}")
            
            return EvaluationResult(
                scores=scores,
                detailed_results=detailed_df,
                avg_score=avg_score
            )
            
        except Exception as e:
            logger.error(f"RAGAs evaluation failed: {e}")
            raise

    def print_evaluation_summary(self, result: EvaluationResult) -> None:
        """Print evaluation summary."""
        print("\n" + "="*60)
        print("RAGAs EVALUATION SUMMARY")
        print("="*60)
        
        print(f"Average Score: {result.avg_score:.3f}")
        print("\nMetric Scores:")
        for metric, score in result.scores.items():
            print(f"  {metric}: {score:.3f}")
            
        print(f"\nDetailed Results Shape: {result.detailed_results.shape}")
        print("\nTop 3 Best Performing Queries:")
        if 'answer_relevancy' in result.detailed_results.columns:
            top_queries = result.detailed_results.nlargest(3, 'answer_relevancy')
            for idx, row in top_queries.iterrows():
                print(f"  Q: {row['question'][:50]}... (Score: {row['answer_relevancy']:.3f})")
        else:
            print("  No answer_relevancy scores available.")

# Example usage
async def evaluate_query_engine(query_engine, test_queries: List[Tuple[str, str]]) -> EvaluationResult:
    """
    Evaluate QueryEngine with test queries.
    
    Args:
        query_engine: Your QueryEngine instance
        test_queries: List of (query, ground_truth) tuples
    
    Returns:
        EvaluationResult with scores and detailed results
    """
    evaluator = RAGAsEvaluator(query_engine)
    return await evaluator.evaluate_queries(test_queries)

# Example usage
if __name__ == "__main__":
    # Assuming `engine` and `data` are defined elsewhere
    test_queries = [
        (doc.get("question"), doc.get("answer"))
        for doc in data["qa_pairs"]
    ]
    test_queries = test_queries[:10]
    
    # Run async evaluation
    async def main():
        try:
            result = await evaluate_query_engine(query_engine=engine, test_queries=test_queries)
            evaluator = RAGAsEvaluator(query_engine=engine)
            evaluator.print_evaluation_summary(result)
        except Exception as e:
            logger.error(f"Evaluation failed: {e}")
    
    # Execute the async main function
    await main()

2025-06-26 12:09:46,982 - __main__ - INFO - RAGAsEvaluator initialized
2025-06-26 12:09:47,003 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
2025-06-26 12:09:47,008 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


2025-06-26 12:09:47,578 - __main__ - INFO - Starting evaluation of 5 queries
2025-06-26 12:09:47,579 - __main__ - INFO - Processing query: What is introduced to improve radiologic image ana...
2025-06-26 12:09:47,580 - src.nlp.rag_chain - INFO - Starting query for: 'What is introduced to improve radiologic image analysis?'
2025-06-26 12:09:49,520 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:49,523 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:49,527 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:49,532 - src.nlp.rag_chain - INFO - Chunk analysis - Count: 3, Avg length: 22.0 words, Range: 22-22 words


Document 1:
Metadata: {'id': 4, 'relevance_score': np.float32(0.9997913), 'pmid': '40434863', 'title': 'RadCLIP: Enhancing Radiologic Image Analysis Through Contrastive Language-Image Pretraining.', 'authors': 'Lu Z; Li H; Parikh NA; Dillman JR; He L', 'journal': 'IEEE transactions on neural networks and learning systems', 'volume': 'PP', 'issue': '', 'year': '2025', 'month': 'May', 'day': '28', 'pub_date': '2025 May 28', 'doi': '10.1109/TNNLS.2025.3568036', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40434863/', 'doi_url': 'https://doi.org/10.1109/TNNLS.2025.3568036'}

radiologic contrastive language-image pretraining (RadCLIP): a cross-modal
vision-language foundational model that utilizes a vision-language pretraining
(VLP) framework to improve radiologic image analysis.
--------------------------------------------------------------------------------
Document 2:
Metadata: {'id': 3, 'relevance_score': np.float

2025-06-26 12:09:49,865 - src.nlp.rag_chain - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 12:09:49,866 - src.nlp.rag_chain - INFO - Generating final answer from accumulated context.
2025-06-26 12:09:49,870 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:49,874 - src.nlp.rag_chain - INFO - Final Answer: radiologic contrastive language-image pretraining (RadCLIP): a cross-modal vision-language foundational model that utilizes a vision-language pretraining (VLP) framework to improve radiologic image an...
2025-06-26 12:09:49,875 - __main__ - INFO - Processing query: What framework does RadCLIP utilize to improve rad...
2025-06-26 12:09:49,877 - src.nlp.rag_chain - INFO - Starting query for: 'What framework does RadCLIP utilize to improve radiologic image analysis?'



Step 1 - Node 3648:
Content: required in radiologic imaging, we introduce radiologic contrastive language-image pretraining (RadC...
Concepts: stepwise ESDPT, VLP, EFNs, ESDPT process, metabolic profiling, ESDPT mechanism, LC-MS
--------------------------------------------------

Step 2 - Node 3649:
Content: radiologic image analysis. Building on the contrastive language-image pretraining (CLIP) approach, R...
Concepts: CLIP, stepwise ESDPT, EFNs, ESDPT process, metabolic profiling, RadCLIP, ESDPT mechanism, radiologic image analysis, LC-MS
--------------------------------------------------


2025-06-26 12:09:51,468 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:51,471 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:51,476 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:51,481 - src.nlp.rag_chain - INFO - Chunk analysis - Count: 3, Avg length: 22.0 words, Range: 22-22 words


Document 1:
Metadata: {'id': 4, 'relevance_score': np.float32(0.9998777), 'pmid': '40434863', 'title': 'RadCLIP: Enhancing Radiologic Image Analysis Through Contrastive Language-Image Pretraining.', 'authors': 'Lu Z; Li H; Parikh NA; Dillman JR; He L', 'journal': 'IEEE transactions on neural networks and learning systems', 'volume': 'PP', 'issue': '', 'year': '2025', 'month': 'May', 'day': '28', 'pub_date': '2025 May 28', 'doi': '10.1109/TNNLS.2025.3568036', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40434863/', 'doi_url': 'https://doi.org/10.1109/TNNLS.2025.3568036'}

radiologic contrastive language-image pretraining (RadCLIP): a cross-modal
vision-language foundational model that utilizes a vision-language pretraining
(VLP) framework to improve radiologic image analysis.
--------------------------------------------------------------------------------
Document 2:
Metadata: {'id': 1, 'relevance_score': np.float

2025-06-26 12:09:51,789 - src.nlp.rag_chain - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 12:09:51,790 - src.nlp.rag_chain - INFO - Generating final answer from accumulated context.
2025-06-26 12:09:51,794 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:51,799 - src.nlp.rag_chain - INFO - Final Answer: radiologic contrastive language-image pretraining (RadCLIP): a cross-modal vision-language foundational model that utilizes a vision-language pretraining (VLP) framework to improve radiologic image an...
2025-06-26 12:09:51,801 - __main__ - INFO - Processing query: What is the main focus of the specialized algorith...
2025-06-26 12:09:51,802 - src.nlp.rag_chain - INFO - Starting query for: 'What is the main focus of the specialized algorithms in large-scale sequencing datasets?'



Step 1 - Node 3648:
Content: required in radiologic imaging, we introduce radiologic contrastive language-image pretraining (RadC...
Concepts: stepwise ESDPT, VLP, EFNs, ESDPT process, metabolic profiling, ESDPT mechanism, LC-MS
--------------------------------------------------

Step 2 - Node 3649:
Content: radiologic image analysis. Building on the contrastive language-image pretraining (CLIP) approach, R...
Concepts: CLIP, stepwise ESDPT, EFNs, ESDPT process, metabolic profiling, RadCLIP, ESDPT mechanism, radiologic image analysis, LC-MS
--------------------------------------------------


2025-06-26 12:09:52,939 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:52,941 - src.nlp.rag_chain - INFO - Chunk analysis - Count: 1, Avg length: 4.0 words, Range: 4-4 words
2025-06-26 12:09:52,998 - src.nlp.rag_chain - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 12:09:52,999 - src.nlp.rag_chain - INFO - Generating final answer from accumulated context.
2025-06-26 12:09:53,003 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:53,006 - src.nlp.rag_chain - INFO - Final Answer: managing large-scale genomic data...
2025-06-26 12:09:53,007 - __main__ - INFO - Processing query: What have innovations in this field enhanced?...
2025-06-26 12:09:53,009 - src.nlp.rag_chain - INFO - Starting query for: 'What have innovations in this field enhanced?'


Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.5104168), 'pmid': '40444293', 'title': 'Enhancing Data Compression: Recent Innovations in LZ77 Algorithms.', 'authors': 'Hong A; Boucher C', 'journal': 'Journal of computational biology : a journal of computational molecular cell biology', 'volume': '', 'issue': '', 'year': '2025', 'month': 'May', 'day': '30', 'pub_date': '2025 May 30', 'doi': '10.1089/cmb.2024.0879', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article; Review', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40444293/', 'doi_url': 'https://doi.org/10.1089/cmb.2024.0879'}

managing large-scale genomic data

Step 1 - Node 820:
Content: redundancy within genomic datasets. We critically examine a spectrum of LZ77-based algorithms, inclu...
Concepts: 
--------------------------------------------------


2025-06-26 12:09:53,758 - __main__ - INFO - Processing query: Where were FITC positive endometriotic lesions obs...
2025-06-26 12:09:53,763 - src.nlp.rag_chain - INFO - Starting query for: 'Where were FITC positive endometriotic lesions observed after 6 weeks of endometriosis induction?'
2025-06-26 12:09:57,559 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:57,565 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:57,570 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:57,574 - src.nlp.rag_chain - INFO - Chunk analysis - Count: 3, Avg length: 33.0 words, Range: 33-33 words


Document 1:
Metadata: {'id': 1, 'relevance_score': np.float32(0.9999707), 'pmid': '40445172', 'title': 'The Development of a Preclinical Swine Model for Endometriosis.', 'authors': 'Omenge H; Barrier BF; Monarch K; Kiesewetter E; Schlink S; Sponchiado M; Prather RS; Geisert RD; Kim T; Shin JH; Kim TH; Lee K; Jeong JW', 'journal': 'Biology of reproduction', 'volume': '', 'issue': '', 'year': '2025', 'month': 'May', 'day': '30', 'pub_date': '2025 May 30', 'doi': '10.1093/biolre/ioaf118', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40445172/', 'doi_url': 'https://doi.org/10.1093/biolre/ioaf118'}

After 6 weeks of endometriosis induction, FITC positive endometriotic lesions
were observed on the peritoneal surface of the abdominal wall as well as on the
serosal surfaces of the uterus and small intestine.
--------------------------------------------------------------------------------
Document 2:
Metadata: {'id': 5, '

2025-06-26 12:09:58,241 - src.nlp.rag_chain - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 12:09:58,242 - src.nlp.rag_chain - INFO - Generating final answer from accumulated context.
2025-06-26 12:09:58,248 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,250 - src.nlp.rag_chain - INFO - Final Answer: After 6 weeks of endometriosis induction, FITC positive endometriotic lesions were observed on the peritoneal surface of the abdominal wall as well as on the serosal surfaces of the uterus and small i...



Step 1 - Node 9012:
Content: tissue fragments labeled with fluorescein isothiocyanate dye-doped silica nanoparticles. After 6 wee...
Concepts: FITC
--------------------------------------------------


Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

2025-06-26 12:09:58,437 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,438 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,448 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,452 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,438 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,457 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,458 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,463 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,465 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,438 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,469 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,474 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 12:09:58,475 - src.core.common_helpers - INFO - Memory cache hit
2025-06-26 1


RAGAs EVALUATION SUMMARY
Average Score: 0.000

Metric Scores:

Detailed Results Shape: (0, 0)

Top 3 Best Performing Queries:
  No answer_relevancy scores available.


In [2]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors import LLMLinguaCompressor
from langchain_openai import ChatOpenAI

# llm = ChatOpenAI(temperature=0)

compressor = LLMLinguaCompressor(model_name="openai-community/gpt2", device_map="cpu")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="neuml/pubmedbert-base-embeddings")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
import logging
import os
from huggingface_hub import scan_cache_dir, snapshot_download
from transformers import GPT2LMHeadModel, GPT2Tokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Force clear any partial downloads
try:
    cache_info = scan_cache_dir()
    for repo in cache_info.repos:
        if "gpt2" in repo.repo_id.lower():
            delete_strategy = cache_info.delete_revisions(repo.repo_id)
            delete_strategy.execute()
            logger.info(f"Cleared cache for {repo.repo_id}")
except Exception as e:
    logger.error(f"Cache clear failed: {e}")

# Force fresh download with resume=False
try:
    logger.info("Starting fresh download...")
    model = GPT2LMHeadModel.from_pretrained(
        "openai-community/gpt2",
        resume_download=False,  # Force fresh download
        force_download=True     # Skip cache check
    )
    tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
    logger.info("Download completed successfully")
except Exception as e:
    logger.error(f"Download failed: {e}")

Revision(s) not found - cannot delete them: openai-community/gpt2
INFO:__main__:Cleared cache for openai-community/gpt2
INFO:__main__:Starting fresh download...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [1]:
from transformers import file_utils
print(file_utils.default_cache_path)

/home/olande/.cache/huggingface/hub


In [11]:
import json
with open("gold_standard_dataset.json", "r") as f:
    data = json.load(f)
    # test_queries = data['qa_pairs'].get("question")

In [3]:
test_queries = []
for doc in data["qa_pairs"]:
    test_queries.append(doc.get("question"))

In [70]:
test_queries[:5]

['What is introduced to improve radiologic image analysis?',
 'What framework does RadCLIP utilize to improve radiologic image analysis?',
 'What is the main focus of the specialized algorithms in large-scale sequencing datasets?',
 'What have innovations in this field enhanced?',
 'Where were FITC positive endometriotic lesions observed after 6 weeks of endometriosis induction?']

In [None]:
test_queries = [doc.mdata["qa_pairs"]]

[{'id': 'qa_0000',
  'question': 'What is introduced to improve radiologic image analysis?',
  'answer': 'Radiologic contrastive language-image pretraining (RadCLIP)',
  'source': None,
  'context': 'required in radiologic imaging, we introduce radiologic contrastive language-image pretraining (RadCLIP): a cross-modal vision-language foundational model that utilizes a vision-language pretraining (VLP) framework to improve radiologic image analysis.',
  'difficulty': 'easy',
  'type': 'factual',
  'validation': 'pending'},
 {'id': 'qa_0001',
  'question': 'What framework does RadCLIP utilize to improve radiologic image analysis?',
  'answer': 'A vision-language pretraining (VLP) framework',
  'source': None,
  'context': 'a cross-modal vision-language foundational model that utilizes a vision-language pretraining (VLP) framework to improve radiologic image analysis',
  'difficulty': 'easy',
  'type': 'factual',
  'validation': 'pending'},
 {'id': 'qa_0002',
  'question': 'What is the ma

In [1]:
import json
from pathlib import Path
from langchain.schema import Document

data_path = Path("../data/input/research20250605_002659.json")
with data_path.open(encoding="utf-8") as f:
    data = json.load(f)



In [2]:
docs = [doc for doc in data if doc.get("abstract") and doc.get("abstract").strip()]

In [3]:
docs[0].items()

dict_items([('pmid', '40407163'), ('title', 'Longitudinal changes in choroidal thickness and choroidal vascularity index in\xa0myopic children treated with 0.01% and 0.05% atropine eye drops every other day: one-year results.'), ('abstract', 'To investigate the effect of two different concentrations of atropine eye drops on choroidal thickness (ChT) and choroidal vascularity index (CVI) in a Caucasian child population after one year of treatment. The medical records of patients who received atropine eye drops every other day (0.01% and 0.05%) due to myopia progression were retrospectively reviewed. Demographic data and ocular biometry were recorded. The ChT (subfoveal, 1000\u2009µm nasal, and 1000\u2009µm temporal) was measured by spectral domain optical coherence tomography. Total choroidal area (TCA), luminal area (LA), stromal area (SA), and CVI were assessed with ImageJ software. Comparisons were made between baseline and 1-year data. Sixty-eight eyes of 34 (25 female, 73.5%) patie

In [4]:
from langchain_core.documents import Document
docs = [Document(
    page_content = doc.get("abstract"),
    metadata = {k: v for k, v in doc.items() if k != "abstract"}
) for doc in docs]

In [5]:
from langchain_openai import ChatOpenAI
import os
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
llm = ChatOpenAI(model="meta-llama/llama-3.3-70b-instruct",
                                api_key=os.getenv("OPENROUTER_API_KEY"),
                                openai_api_base="https://openrouter.ai/api/v1",
                                temperature=0,
                                streaming=False)

In [6]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
generator_llm = LangchainLLMWrapper(llm)
generator_embeddings = LangchainEmbeddingsWrapper(embeddings)

In [7]:
import random
random.seed(69)

docs = random.sample(docs, 100)

In [9]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=30)

Applying SummaryExtractor:   0%|          | 0/97 [00:00<?, ?it/s]

unable to apply transformation: Invalid json output: Bcl-3 plays a crucial role in immune responses and preventing autoimmune diseases. Its depletion reduces IFN-γ production and increases recruitment of MDSCs, which are critical in protecting Bcl-3 deficient mice from liver injury. Bcl-3 deficiency enhances MDSC differentiation and CXCR4 mediates MDSC recruitment to the liver. These findings suggest that Bcl-3 regulates liver MDSCs\" immunosuppressive function, influencing liver injury extent in mice, and offer insights for drug development and treatment strategies.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 


Applying CustomNodeFilter:   0%|          | 0/100 [00:00<?, ?it/s]

Node 98bddffe-012c-4ed1-ad12-84114ccf1506 does not have a summary. Skipping filtering.
Node 73723d31-9675-41a4-8508-98b83d48f7eb does not have a summary. Skipping filtering.
Node e7c9889e-8ecc-41e2-b861-0238c3d01b7a does not have a summary. Skipping filtering.
Node 2d038529-0935-4e6f-9d45-25a64ef1232e does not have a summary. Skipping filtering.


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/297 [00:00<?, ?it/s]

unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

unable to apply transformation: Node 2d038529-0935-4e6f-9d45-25a64ef1232e has no summary_embedding


Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/31 [00:00<?, ?it/s]

In [None]:
dataset.to_pandas().to_csv("dataset.csv", index=False)

In [37]:
df = dataset.to_pandas()

In [46]:
predictions = []
for index, row in df.iterrows():
    question = row['user_input']
    context = row['reference_contexts']  # Use this if your model accepts context
    # Generate answer (modify based on your model's API)
    answer = await engine.query(question)  # Or pass context if supported
    predictions.append({
        'user_input': question,
        'reference_contexts': context,
        'answer': answer,
        'reference': row['reference'],
        'synthesizer_name': row['synthesizer_name']
    })

# Convert to DataFrame
import pandas as pd
results_df = pd.DataFrame(predictions)

2025-06-26 10:20:25,745 - INFO - Starting query for: 'What is the efect of excersise on DNA?'
2025-06-26 10:20:29,811 - INFO - Starting query for: 'What was the purpose and outcome of the RISE study in relation to predicting iron biomarkers in blood donors?'
2025-06-26 10:20:33,192 - INFO - Loaded FAISS index from ../semantic_cache_index
2025-06-26 10:20:33,676 - INFO - Best match score 0.6210 above threshold (0.5).
2025-06-26 10:20:40,344 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-26 10:20:42,096 - INFO - Updated FAISS index and saved to ../semantic_cache_index
2025-06-26 10:20:42,538 - INFO - Semantic match found with score 0.1144.
2025-06-26 10:20:43,007 - INFO - Semantic match found with score 0.1937.
2025-06-26 10:20:43,013 - INFO - Chunk analysis - Count: 3, Avg length: 37.0 words, Range: 37-37 words


Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.9834968), 'pmid': '40447352', 'title': 'Machine-learning models to predict iron recovery after blood donation: a model development and external validation study.', 'authors': 'Li W; Su CY; Meulenbeld A; Jagirdar H; Janssen MP; Swanevelder R; Bruhn R; Kaidarova Z; Bravo MD; Cao S; Custer B; van den Berg K; Russell WA', 'journal': 'The Lancet. Haematology', 'volume': '12', 'issue': '6', 'year': '2025', 'month': 'Jun', 'day': '', 'pub_date': '2025 Jun', 'doi': '10.1016/S2352-3026(25)00068-7', 'pmc_id': '', 'mesh_terms': 'Humans; Machine Learning; Blood Donors; Ferritins; Male; Female; Iron; Adult; Hemoglobins; Retrospective Studies; Middle Aged; Biomarkers; Adolescent; Young Adult; Blood Donation', 'publication_types': 'Journal Article; Validation Study', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40447352/', 'doi_url': 'https://doi.org/10.1016/S2352-3026(25)00068-7'}

Machine-learning models directly predicting iron biom

2025-06-26 10:20:43,922 - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 10:20:43,928 - INFO - Generating final answer from accumulated context.



Step 1 - Node 4176:
Content: Machine-learning models directly predicting iron biomarkers after blood donation could help to manag...
Concepts: 
--------------------------------------------------


2025-06-26 10:20:44,340 - INFO - Semantic match found with score 0.2413.
2025-06-26 10:20:44,343 - INFO - Final Answer: Machine-learning models directly predicting iron biomarkers after blood donation could help to manage donation-associated iron deficiency and avoid low haemoglobin deferrals. No such models have been ...
2025-06-26 10:20:44,346 - INFO - Starting query for: 'what GaAs?'
2025-06-26 10:20:46,477 - INFO - Starting query for: 'What is CO2 reduction used for?'
2025-06-26 10:20:48,521 - INFO - Starting query for: 'What triggers radiation-induced cardiac remodeling?'
2025-06-26 10:20:50,447 - INFO - Best match score 0.8014 above threshold (0.5).
2025-06-26 10:20:54,573 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-26 10:21:02,052 - INFO - Updated FAISS index and saved to ../semantic_cache_index
2025-06-26 10:21:02,057 - INFO - Chunk analysis - Count: 1, Avg length: 36.0 words, Range: 36-36 words


Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.99971706), 'pmid': '40439873', 'title': 'Radiation-Induced Cardiac Remodeling: Mechanisms and Therapeutic Approaches.', 'authors': 'Ghnim ZS; Adhab AH; Mahdi MS; Kyada A; Roopashree R; Thakur V; Kaur M; Gupta A; Mansoor AS; Radi UK; Abd NS; Kadhim M', 'journal': 'Cardiovascular toxicology', 'volume': '', 'issue': '', 'year': '2025', 'month': 'May', 'day': '29', 'pub_date': '2025 May 29', 'doi': '10.1007/s12012-025-10006-6', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article; Review', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40439873/', 'doi_url': 'https://doi.org/10.1007/s12012-025-10006-6'}

Radiation-induced cardiac remodeling (RICR) is one of the complications of
exposure to radiotherapy. These disorders may occur for a subset of cancer
patients, when the heart remains in part or in full in the radiation field.

Step 1 - Node 15118:
Content: Radiation-induced cardiac remodeling (RICR) is one of 

2025-06-26 10:21:12,707 - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 10:21:12,711 - INFO - Generating final answer from accumulated context.



Step 2 - Node 15119:
Content: field. Despite advancements in radiotherapy techniques, cardiotoxicity has remained a concern after ...
Concepts: RICR
--------------------------------------------------


2025-06-26 10:21:13,084 - INFO - Semantic match found with score 0.2594.
2025-06-26 10:21:13,087 - INFO - Final Answer: Radiation-induced cardiac remodeling (RICR) is one of the complications of exposure to radiotherapy. These disorders may occur for a subset of cancer patients, when the heart remains in part or in ful...
2025-06-26 10:21:13,089 - INFO - Starting query for: 'What is the significance of the study on oral cancer screening in India, particularly in the context of primary health care settings?'
2025-06-26 10:21:15,245 - INFO - Best match score 0.6772 above threshold (0.5).
2025-06-26 10:21:20,787 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-26 10:21:21,188 - INFO - Updated FAISS index and saved to ../semantic_cache_index
2025-06-26 10:21:21,481 - INFO - Semantic match found with score 0.0842.
2025-06-26 10:21:21,880 - INFO - Semantic match found with score 0.0863.
2025-06-26 10:21:21,884 - INFO - Chunk analysis - Count

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.98192894), 'pmid': '40435503', 'title': "Combining virtual reality-based positive mental imagery and dual tasking increases children's willingness to exposure.", 'authors': 'Bragt-de Jong HJ; Dejonckheere E; Smeets T; Lodder P; Karreman A', 'journal': 'JMIR research protocols', 'volume': '14', 'issue': '', 'year': '2025', 'month': 'May', 'day': '28', 'pub_date': '2025 May 28', 'doi': '10.2196/66285', 'pmc_id': '', 'mesh_terms': 'Humans; Mouth Neoplasms; Early Detection of Cancer; Longitudinal Studies; Prospective Studies; India; Tolonium Chloride; Risk Management; Precancerous Conditions', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40435503/', 'doi_url': 'https://doi.org/10.2196/66285'}

workers, evaluation of these noninvasive adjuncts is likely to assist and
strengthen the population-wide oral cancer screening in high-burden countries
such as India. This prospective longitudina

2025-06-26 10:21:22,225 - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 10:21:22,227 - INFO - Generating final answer from accumulated context.



Step 1 - Node 16976:
Content: workers, evaluation of these noninvasive adjuncts is likely to assist and strengthen the population-...
Concepts: Here are 1-3 key concepts from the text as a comma-separated list:, population-wide oral cancer screening, oral cancer screening, India, noninvasive adjuncts
--------------------------------------------------


2025-06-26 10:21:22,764 - INFO - Semantic match found with score 0.1338.
2025-06-26 10:21:22,766 - INFO - Final Answer: workers, evaluation of these noninvasive adjuncts is likely to assist and strengthen the population-wide oral cancer screening in high-burden countries such as India. This prospective longitudinal stu...
2025-06-26 10:21:22,768 - INFO - Starting query for: 'What is PIV and how does it relate to bone metastasis in prostate cancer patients, can you explain it in detail like you talking to someone who dont know much about medical stuff?'
2025-06-26 10:21:23,884 - INFO - Best match score 0.9292 above threshold (0.5).
2025-06-26 10:21:26,828 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-26 10:21:31,542 - INFO - Updated FAISS index and saved to ../semantic_cache_index
2025-06-26 10:21:31,546 - INFO - Chunk analysis - Count: 1, Avg length: 40.0 words, Range: 40-40 words
2025-06-26 10:21:31,686 - INFO - Priority queue exha

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.65118104), 'pmid': '40438419', 'title': 'Geriatric Nutritional Risk Index (GNRI) and Prognostic Nutritional Index (PNI) Before Treatment as the Predictive Indicators for Bone Metastasis in Prostate Cancer Patients.', 'authors': 'Chen L; Rao H; Chen N; Li R; Chen D; Jiang H', 'journal': 'International journal of general medicine', 'volume': '18', 'issue': '', 'year': '2025', 'month': '', 'day': '', 'pub_date': '2025', 'doi': '10.2147/IJGM.S516768', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40438419/', 'doi_url': 'https://doi.org/10.2147/IJGM.S516768'}

Prostate cancer is more common in older men; about a quarter of patients have
bone metastasis. GNRI and PNI have predictive efficacy in bone metastasis and
multiple bone metastasis of prostate cancer, but NAR, PIV, SII, and SIRI do not.

Step 1 - Node 1789:
Content: plus PNI was 0.647. Prostate cance

2025-06-26 10:21:32,000 - INFO - Semantic match found with score 0.1577.
2025-06-26 10:21:32,002 - INFO - Final Answer: Prostate cancer is more common in older men; about a quarter of patients have bone metastasis. GNRI and PNI have predictive efficacy in bone metastasis and multiple bone metastasis of prostate cancer,...
2025-06-26 10:21:32,004 - INFO - Starting query for: 'What is PCN224?'
2025-06-26 10:21:32,623 - INFO - Starting query for: 'What are the key characteristics and diagnostic challenges associated with IgG4-related disease, and how does it relate to other immune-mediated conditions such as MPO-ANCA-associated vasculitis?'
2025-06-26 10:21:34,216 - INFO - Best match score 0.9418 above threshold (0.5).
2025-06-26 10:21:36,864 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-26 10:21:37,789 - INFO - Updated FAISS index and saved to ../semantic_cache_index
2025-06-26 10:21:37,792 - INFO - Chunk analysis - Count: 1, Avg leng

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.96942276), 'pmid': '40438183', 'title': 'MPO-ANCA-Associated Hypertrophic Pachymeningitis Mimicking IgG4-Related Disease: A Case Report and Literature Review.', 'authors': 'Chen Y; Liu L; Xie C', 'journal': 'Journal of inflammation research', 'volume': '18', 'issue': '', 'year': '2025', 'month': '', 'day': '', 'pub_date': '2025', 'doi': '10.2147/JIR.S521138', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Case Reports; Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40438183/', 'doi_url': 'https://doi.org/10.2147/JIR.S521138'}

Immune-mediated causes, particularly antineutrophil cytoplasmic antibody
(ANCA)-associated vasculitis and IgG4-related disease (IgG4-RD), are among the
most common etiologies.

Step 1 - Node 2226:
Content: Immune-mediated causes, particularly antineutrophil cytoplasmic antibody (ANCA)-associated vasculiti...
Concepts: ANCA)-associated, case report of hypertrophic pachymeningi

2025-06-26 10:21:38,191 - INFO - Semantic match found with score 0.0903.
2025-06-26 10:21:38,196 - INFO - Final Answer: Immune-mediated causes, particularly antineutrophil cytoplasmic antibody (ANCA)-associated vasculitis and IgG4-related disease (IgG4-RD), are among the most common etiologies....
2025-06-26 10:21:38,197 - INFO - Starting query for: 'What is MMD and how does it affect peoples body?'
2025-06-26 10:21:38,956 - INFO - Starting query for: 'What is the significance of BMI in the context of eating disorder symptoms?'
2025-06-26 10:21:40,109 - INFO - Semantic match found with score 0.4712.
2025-06-26 10:21:40,374 - INFO - Best match score 0.5249 above threshold (0.5).
2025-06-26 10:21:42,496 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-26 10:21:43,316 - INFO - Updated FAISS index and saved to ../semantic_cache_index
2025-06-26 10:21:43,533 - INFO - Semantic match found with score 0.1850.
2025-06-26 10:21:43,539 - INFO - C

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.9864746), 'pmid': '40436112', 'title': 'Risk factors for eating disorder symptoms at 15 years of age: a 9-year longitudinal cohort study.', 'authors': 'Hanson LN; Adamson AJ; Basterfield L; Reilly JJ; Janssen X; Pearce MS; Boothroyd LG; Evans EH', 'journal': 'Appetite', 'volume': '', 'issue': '', 'year': '2025', 'month': 'May', 'day': '26', 'pub_date': '2025 May 26', 'doi': '10.1016/j.appet.2025.108149', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40436112/', 'doi_url': 'https://doi.org/10.1016/j.appet.2025.108149'}

Machine-learning models directly predicting iron biomarkers after blood donation
could help to manage donation-associated iron deficiency and avoid low
haemoglobin deferrals. No such models have been externally validated
internationally. Our aim was to develop and externally validate
-----------------------------------------------------

2025-06-26 10:21:44,063 - INFO - Semantic match found with score 0.4891.
2025-06-26 10:21:44,085 - ERROR - Error during answer check: Structured Output response does not have a 'parsed' field nor a 'refusal' field. Received message:

content='Measures included body image, depressive symptoms, and pubertal development; we also measured BMI at each age.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 186, 'total_tokens': 210, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/llama-3.3-70b-instruct', 'system_fingerprint': None, 'id': 'gen-1750922501-07jGAxbMAUIx6iBqp9Ze', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None} id='run--bf976b50-bfd4-4a99-a3c9-1d1ffb519a78-0' usage_metadata={'input_tokens': 186, 'output_tokens': 24, 'total_tokens': 210, 'input_token_details': {}, 'output_token_details': {}}
Traceback (most recent call last):
  File "/home/olande/Desktop/Fin


Step 4 - Node 15440:
Content: Eating disorders (EDs) are typically diagnosed in the later stages of puberty, but risk factors for ...
Concepts: Eating disorders, risk factors, Here are 1-3 key concepts from the text as a comma-separated list:, longitudinal study
--------------------------------------------------


2025-06-26 10:21:44,448 - INFO - Semantic match found with score 0.4891.
2025-06-26 10:21:44,453 - ERROR - Error during answer check: Structured Output response does not have a 'parsed' field nor a 'refusal' field. Received message:

content='Measures included body image, depressive symptoms, and pubertal development; we also measured BMI at each age.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 186, 'total_tokens': 210, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/llama-3.3-70b-instruct', 'system_fingerprint': None, 'id': 'gen-1750922501-07jGAxbMAUIx6iBqp9Ze', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None} id='run--bf976b50-bfd4-4a99-a3c9-1d1ffb519a78-0' usage_metadata={'input_tokens': 186, 'output_tokens': 24, 'total_tokens': 210, 'input_token_details': {}, 'output_token_details': {}}
Traceback (most recent call last):
  File "/home/olande/Desktop/Fin


Step 5 - Node 15445:
Content: found that depressive symptoms at 12 partially mediated the relationship between body dissatisfactio...
Concepts: body dissatisfaction, Here are 1-3 key concepts from the text as a comma-separated list:, Depressive symptoms, pubertal development
--------------------------------------------------


2025-06-26 10:21:44,837 - INFO - Semantic match found with score 0.4891.
2025-06-26 10:21:44,840 - ERROR - Error during answer check: Structured Output response does not have a 'parsed' field nor a 'refusal' field. Received message:

content='Measures included body image, depressive symptoms, and pubertal development; we also measured BMI at each age.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 186, 'total_tokens': 210, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/llama-3.3-70b-instruct', 'system_fingerprint': None, 'id': 'gen-1750922501-07jGAxbMAUIx6iBqp9Ze', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None} id='run--bf976b50-bfd4-4a99-a3c9-1d1ffb519a78-0' usage_metadata={'input_tokens': 186, 'output_tokens': 24, 'total_tokens': 210, 'input_token_details': {}, 'output_token_details': {}}
Traceback (most recent call last):
  File "/home/olande/Desktop/Fin


Step 6 - Node 15444:
Content: symptoms. We found that previous eating disorder symptoms were the strongest predictor of eating dis...
Concepts: depressive symptoms, Here are 1-3 key concepts from the text as a comma-separated list:, depressive symptoms and eating disorder symptoms, eating disorder
--------------------------------------------------


2025-06-26 10:21:45,260 - INFO - Semantic match found with score 0.4891.
2025-06-26 10:21:45,266 - ERROR - Error during answer check: Structured Output response does not have a 'parsed' field nor a 'refusal' field. Received message:

content='Measures included body image, depressive symptoms, and pubertal development; we also measured BMI at each age.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 186, 'total_tokens': 210, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/llama-3.3-70b-instruct', 'system_fingerprint': None, 'id': 'gen-1750922501-07jGAxbMAUIx6iBqp9Ze', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None} id='run--bf976b50-bfd4-4a99-a3c9-1d1ffb519a78-0' usage_metadata={'input_tokens': 186, 'output_tokens': 24, 'total_tokens': 210, 'input_token_details': {}, 'output_token_details': {}}
Traceback (most recent call last):
  File "/home/olande/Desktop/Fin


Step 7 - Node 15441:
Content: predictors of eating disorder symptoms in 15-year-olds. Specifically, we sought to test an adapted d...
Concepts: aetiology, Here are 1-3 key concepts from the text as a comma-separated list:, disordered eating, predictors
--------------------------------------------------


2025-06-26 10:21:45,691 - INFO - Semantic match found with score 0.4891.
2025-06-26 10:21:45,696 - ERROR - Error during answer check: Structured Output response does not have a 'parsed' field nor a 'refusal' field. Received message:

content='Measures included body image, depressive symptoms, and pubertal development; we also measured BMI at each age.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 186, 'total_tokens': 210, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/llama-3.3-70b-instruct', 'system_fingerprint': None, 'id': 'gen-1750922501-07jGAxbMAUIx6iBqp9Ze', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None} id='run--bf976b50-bfd4-4a99-a3c9-1d1ffb519a78-0' usage_metadata={'input_tokens': 186, 'output_tokens': 24, 'total_tokens': 210, 'input_token_details': {}, 'output_token_details': {}}
Traceback (most recent call last):
  File "/home/olande/Desktop/Fin


Step 8 - Node 4176:
Content: Machine-learning models directly predicting iron biomarkers after blood donation could help to manag...
Concepts: 
--------------------------------------------------


2025-06-26 10:21:46,480 - INFO - Semantic match found with score 0.4891.
2025-06-26 10:21:46,487 - ERROR - Error during answer check: Structured Output response does not have a 'parsed' field nor a 'refusal' field. Received message:

content='Measures included body image, depressive symptoms, and pubertal development; we also measured BMI at each age.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 186, 'total_tokens': 210, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/llama-3.3-70b-instruct', 'system_fingerprint': None, 'id': 'gen-1750922501-07jGAxbMAUIx6iBqp9Ze', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None} id='run--bf976b50-bfd4-4a99-a3c9-1d1ffb519a78-0' usage_metadata={'input_tokens': 186, 'output_tokens': 24, 'total_tokens': 210, 'input_token_details': {}, 'output_token_details': {}}
Traceback (most recent call last):
  File "/home/olande/Desktop/Fin

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.817325), 'pmid': '40440511', 'title': 'Portrayal of the Idealized, Mythical Beauty of Medieval Europe: Isolde the Fair in Tristan and Isolde.', 'authors': 'Jeon A; Hwang K; Han SH', 'journal': 'The Journal of craniofacial surgery', 'volume': '', 'issue': '', 'year': '2025', 'month': 'May', 'day': '29', 'pub_date': '2025 May 29', 'doi': '10.1097/SCS.0000000000011525', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40440511/', 'doi_url': 'https://doi.org/10.1097/SCS.0000000000011525'}

The legend of Tristan and Isolde is a quintessential medieval European romance
that delves into themes of fate, passion, and the tragic consequences of
forbidden love. Central to this narrative is Isolde the Fair, whose beauty is
portrayed as both otherworldly and divine. Her allure transcends mere

Step 1 - Node 16523:
Content: The legend of Tristan and Isolde is a quinte

2025-06-26 10:22:01,372 - INFO - Semantic match found with score 0.2027.
2025-06-26 10:22:01,375 - INFO - Final Answer: The legend of Tristan and Isolde is a quintessential medieval European romance that delves into themes of fate, passion, and the tragic consequences of forbidden love. Central to this narrative is Iso...
2025-06-26 10:22:01,377 - INFO - Starting query for: 'What is WITS in the context of global food market research?'
2025-06-26 10:22:02,319 - INFO - Starting query for: 'What is europes role in gaseous reactant-involved heterogeneous catalysis?'
2025-06-26 10:22:02,915 - INFO - Starting query for: 'Whats the amniotik membrain used for?'
2025-06-26 10:22:03,565 - INFO - Starting query for: 'How WITS and ITS help in understanding fungal infections and global food market?'
2025-06-26 10:22:04,321 - INFO - Starting query for: 'What role do WITS and ITS play in understanding the impact of geopolitical events on global food markets and fungal infections in peritoneal dialysi

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.93093514), 'pmid': '40437394', 'title': 'Photodynamic therapeutic activity of novel porphyrins against lung squamous cell carcinoma.', 'authors': 'Meng H; Ding RQ; Jia L; Chen XP; Hu YH; Wang SM; Lv SQ; Feng F', 'journal': 'BMC cancer', 'volume': '25', 'issue': '1', 'year': '2025', 'month': 'May', 'day': '28', 'pub_date': '2025 May 28', 'doi': '10.1186/s12885-025-14386-4', 'pmc_id': '', 'mesh_terms': 'Photochemotherapy; Humans; Animals; Porphyrins; Lung Neoplasms; Cell Line, Tumor; Mice; Reactive Oxygen Species; Carcinoma, Squamous Cell; Xenograft Model Antitumor Assays; Photosensitizing Agents; Singlet Oxygen; Nanoparticles', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40437394/', 'doi_url': 'https://doi.org/10.1186/s12885-025-14386-4'}

Our findings highlight the photodynamic therapeutic potential of these novel
porphyrin compounds and their nanoparticles. These results not only

2025-06-26 10:22:13,383 - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 10:22:13,384 - INFO - Generating final answer from accumulated context.



Step 1 - Node 5445:
Content: stronger antitumor effects compared to PCN224 in LSCC cells. Our findings highlight the photodynamic...
Concepts: LSCC
--------------------------------------------------


2025-06-26 10:22:13,746 - INFO - Semantic match found with score 0.1334.
2025-06-26 10:22:13,751 - INFO - Final Answer: Our findings highlight the photodynamic therapeutic potential of these novel porphyrin compounds and their nanoparticles. These results not only expand our understanding of porphyrins' antitumor capab...
2025-06-26 10:22:13,753 - INFO - Starting query for: 'What is the relashunship betwen LSCC and SCC in terms of therapeutik potenshal?'
2025-06-26 10:22:15,323 - INFO - Starting query for: 'What role does the CD4 complex play in the context of immune responses, and how does it relate to the concept of CD complexation solubilization in the utilization of polyphenols in biomass, considering the findings on Bcl-3 depletion and MDSC recruitment?'
2025-06-26 10:22:16,881 - INFO - Starting query for: 'What role does CD4 play in the context of immune responses and how does it relate to CD complexation in polyphenol utilization?'
2025-06-26 10:22:18,225 - INFO - Starting query

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.7985298), 'pmid': '40436305', 'title': 'Assessing heterogeneous pollution risks from polystyrene micro(nano)plastics and cadmium to physiology and biochemistry in parsley via a split-root system.', 'authors': 'Chen CC; Huang Z; Zhao X; Song Z; Gao M', 'journal': 'Phytochemistry', 'volume': '238', 'issue': '', 'year': '2025', 'month': 'May', 'day': '26', 'pub_date': '2025 May 26', 'doi': '10.1016/j.phytochem.2025.114565', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40436305/', 'doi_url': 'https://doi.org/10.1016/j.phytochem.2025.114565'}

Cd, which was attributed to the plant defense mechanism primarily activated by
Cd and promoted by excessive PS nanoplastics. This mechanism particularly
involved the antioxidant properties of non-flavonoid polyphenols

Step 1 - Node 12418:
Content: were exposed to Cd, which was attributed to the plant defense mechan

2025-06-26 10:22:28,141 - INFO - Semantic match found with score 0.1112.
2025-06-26 10:22:28,146 - INFO - Final Answer: Cd, which was attributed to the plant defense mechanism primarily activated by Cd and promoted by excessive PS nanoplastics. This mechanism particularly involved the antioxidant properties of non-flav...
2025-06-26 10:22:28,148 - INFO - Starting query for: 'How does HFS and HF affect patients and what are the treatment options for them, is there a preferable method of treatment for HFS patients?'
2025-06-26 10:22:29,605 - INFO - Best match score 0.5830 above threshold (0.5).
2025-06-26 10:22:31,443 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-26 10:22:31,779 - INFO - Updated FAISS index and saved to ../semantic_cache_index
2025-06-26 10:22:31,782 - INFO - Starting query for: 'What are the implications of HFS and HF on treatment outcomes, considering the effectiveness of pretarsal botulinum toxin injection for HFS 

No relevant documents found for the query.


2025-06-26 10:22:33,630 - INFO - Semantic match found with score 0.4992.
2025-06-26 10:22:33,997 - INFO - Semantic match found with score 0.4972.
2025-06-26 10:22:34,004 - INFO - Starting query for: 'What is the relationship between the adaptive particle swarm optimization (APSO) algorithm used in medical image analysis and the assessment of attenuated psychotic symptoms (APS) in individuals with ultra-high risk (UHR) states, considering the stability of IPASE scores over time?'


No relevant documents found for the query.


2025-06-26 10:22:35,700 - INFO - Best match score 0.9606 above threshold (0.5).
2025-06-26 10:22:37,135 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-26 10:22:40,335 - INFO - Updated FAISS index and saved to ../semantic_cache_index
2025-06-26 10:22:40,340 - INFO - Chunk analysis - Count: 1, Avg length: 44.0 words, Range: 44-44 words
2025-06-26 10:22:40,444 - INFO - Priority queue exhausted without finding a complete answer.
2025-06-26 10:22:40,445 - INFO - Generating final answer from accumulated context.


Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.015207555), 'pmid': '40440822', 'title': 'The inventory of psychotic-like anomalous self-experiences (IPASE): Stability and relationships with attenuated psychotic symptoms and remission in individuals at-risk for psychosis.', 'authors': 'Scott I; Selloni A; Bilgrami Z; Cotter M; Sarac C; McGowan A; Krcmar M; Formica M; Gwyther K; Wannan C; Srivastava A; Cecchi GA; Mizrahi R; McGorry P; Corcoran CM; Nelson B', 'journal': 'Schizophrenia research', 'volume': '281', 'issue': '', 'year': '2025', 'month': 'May', 'day': '28', 'pub_date': '2025 May 28', 'doi': '10.1016/j.schres.2025.05.003', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40440822/', 'doi_url': 'https://doi.org/10.1016/j.schres.2025.05.003'}

related to the severity or progression of attenuated psychotic symptoms (APS)
remains unclear. We examined the temporal stability of IPASE scores, their


2025-06-26 10:22:40,818 - INFO - Semantic match found with score 0.1405.
2025-06-26 10:22:40,821 - INFO - Final Answer: related to the severity or progression of attenuated psychotic symptoms (APS) remains unclear. We examined the temporal stability of IPASE scores, their correlation with APS, and whether they predict ...
2025-06-26 10:22:40,822 - INFO - Starting query for: 'How does APSO help in enhancing ultrasound image contrast for thyroid nodule diagnosis and what is its relation with APS in terms of predicting changes in symptoms over time?'
2025-06-26 10:22:41,585 - INFO - Starting query for: 'How does the application of Au nanoclusters, specifically those with hemilabile ligands like imidazolyl-phosphine, relate to the improvement of diagnostic models, such as the one using AUC to evaluate the performance of thyroid nodule screening, and what potential benefits could this integration offer in terms of catalytic activity and structural stability in medical imaging applications?'

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.97944206), 'pmid': '40439052', 'title': 'Imidazolyl Phosphine as Versatile Hemilabile Ligand: Synthesis and Characterization of a New Tetracationic Au(20) Nanocluster.', 'authors': 'Liu WY; Li YZ; Yu JH; Yuan ZR; Yang PF; Azam M; Sun D', 'journal': 'Chemistry, an Asian journal', 'volume': '', 'issue': '', 'year': '2025', 'month': 'May', 'day': '29', 'pub_date': '2025 May 29', 'doi': '10.1002/asia.202500438', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40439052/', 'doi_url': 'https://doi.org/10.1002/asia.202500438'}

hemilabile ligand-protected Au nanoclusters hold greater promise in realizing a
better trade-off between catalytic activity and structural stability.

Step 1 - Node 14099:
Content: As the new type of molecular catalysts, hemilabile ligand-protected Au nanoclusters hold greater pro...
Concepts: 
-------------------------------------------

2025-06-26 10:22:50,211 - INFO - Semantic match found with score 0.0978.
2025-06-26 10:22:50,215 - INFO - Final Answer: hemilabile ligand-protected Au nanoclusters hold greater promise in realizing a better trade-off between catalytic activity and structural stability....
2025-06-26 10:22:50,216 - INFO - Starting query for: 'What is the significance of achieving a high AUC value, such as 0.937, in the context of medical diagnosis, particularly for thyroid nodule diagnosis using deep learning frameworks, and how does the use of hemilabile ligand-protected Au nanoclusters, like the newly synthesized [Au20(dpim)6(dppe)2Cl2]4+, contribute to advancements in materials science and potentially to medical applications?'
2025-06-26 10:22:51,688 - INFO - Semantic match found with score 0.1844.
2025-06-26 10:22:51,692 - INFO - Chunk analysis - Count: 1, Avg length: 18.0 words, Range: 18-18 words
2025-06-26 10:22:51,768 - INFO - Priority queue exhausted without finding a complete answer.
2025-06-2

Document 1:
Metadata: {'id': 0, 'relevance_score': np.float32(0.00018767534), 'pmid': '40439052', 'title': 'Imidazolyl Phosphine as Versatile Hemilabile Ligand: Synthesis and Characterization of a New Tetracationic Au(20) Nanocluster.', 'authors': 'Liu WY; Li YZ; Yu JH; Yuan ZR; Yang PF; Azam M; Sun D', 'journal': 'Chemistry, an Asian journal', 'volume': '', 'issue': '', 'year': '2025', 'month': 'May', 'day': '29', 'pub_date': '2025 May 29', 'doi': '10.1002/asia.202500438', 'pmc_id': '', 'mesh_terms': '', 'publication_types': 'Journal Article', 'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/40439052/', 'doi_url': 'https://doi.org/10.1002/asia.202500438'}

hemilabile ligand-protected Au nanoclusters hold greater promise in realizing a
better trade-off between catalytic activity and structural stability.

Step 1 - Node 14099:
Content: As the new type of molecular catalysts, hemilabile ligand-protected Au nanoclusters hold greater pro...
Concepts: 
----------------------------------------

2025-06-26 10:22:52,086 - INFO - Semantic match found with score 0.1798.
2025-06-26 10:22:52,089 - INFO - Final Answer: hemilabile ligand-protected Au nanoclusters hold greater promise in realizing a better trade-off between catalytic activity and structural stability....


In [54]:
from datasets import Dataset as HFDataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, answer_correctness

# Convert AIMessage to strings
results_df['answer'] = results_df['answer'].apply(lambda x: x.content if hasattr(x, 'content') else str(x))

# Prepare evaluation dataset from DataFrame
eval_dataset = HFDataset.from_pandas(results_df[['user_input', 'reference_contexts', 'answer', 'reference']])
eval_dataset = eval_dataset.rename_columns({
    'user_input': 'question',
    'reference_contexts': 'contexts',
    'reference': 'ground_truth'
})

# Ensure contexts is a list of strings
eval_dataset = eval_dataset.map(
    lambda x: {
        'contexts': [x['contexts']] if isinstance(x['contexts'], str) else x['contexts']
    }
)

# Evaluate
metrics = [faithfulness, answer_relevancy, context_precision, answer_correctness]
results = evaluate(
    dataset=eval_dataset,
    metrics=metrics,
    llm=generator_llm,
    embeddings=generator_embeddings
)

# Print results
print(results)

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

Evaluating:   0%|          | 0/124 [00:00<?, ?it/s]

2025-06-26 10:28:13,600 - INFO - Best match score 1.0489 above threshold (0.5).
2025-06-26 10:28:13,659 - INFO - Best match score 0.9835 above threshold (0.5).
2025-06-26 10:28:13,659 - INFO - Best match score 1.0489 above threshold (0.5).
2025-06-26 10:28:13,549 - INFO - Best match score 1.0489 above threshold (0.5).
2025-06-26 10:28:13,659 - INFO - Best match score 1.0113 above threshold (0.5).
2025-06-26 10:28:13,660 - INFO - Best match score 1.0489 above threshold (0.5).
2025-06-26 10:28:13,660 - INFO - Best match score 1.0113 above threshold (0.5).
2025-06-26 10:28:13,660 - INFO - Best match score 0.9835 above threshold (0.5).
2025-06-26 10:28:15,131 - INFO - Best match score 1.0113 above threshold (0.5).
2025-06-26 10:28:15,132 - INFO - Best match score 1.0113 above threshold (0.5).
2025-06-26 10:28:15,158 - INFO - Best match score 1.0489 above threshold (0.5).
2025-06-26 10:28:15,190 - INFO - Best match score 1.0489 above threshold (0.5).
2025-06-26 10:28:15,161 - INFO - Best ma

{'faithfulness': nan, 'answer_relevancy': 0.0184, 'context_precision': 1.0000, 'answer_correctness': nan}


In [49]:
import ragas
print(ragas.__version__)

0.2.15
