In [None]:
import ast
import os
import warnings
from collections.abc import Sequence
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal

import dspy
import nest_asyncio
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from dspy import LM, configure
from fastembed import TextEmbedding
from fastembed.rerank.cross_encoder import TextCrossEncoder
from langchain import hub
from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
from langchain.retrievers.document_compressors import (
    DocumentCompressorPipeline,
    EmbeddingsFilter,
)
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_core.callbacks import Callbacks
from langchain_core.documents import BaseDocumentCompressor, Document
from langchain_core.embeddings import Embeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_deepseek import ChatDeepSeek
from langchain_experimental.text_splitter import SemanticChunker
from loguru import logger
from pydantic import ConfigDict, Field
from ragas import evaluate
from ragas.dataset_schema import EvaluationDataset
from ragas.embeddings.base import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    AnswerSimilarity,
    ContextEntityRecall,
    ContextPrecision,
    ContextRecall,
    Faithfulness,
    NoiseSensitivity,
)
from ragas.run_config import RunConfig
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import (
    MultiHopAbstractQuerySynthesizer,
    SingleHopSpecificQuerySynthesizer,
)

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and runtime warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
# Patch asyncio to allow nested event loops.
# NOTE(review): presumably required because the notebook kernel already runs a loop — confirm.
nest_asyncio.apply()

In [None]:
# from pubmed_scraper import PubMedScraper

# scraper = PubMedScraper(email = "olandechris@gmail.com")

# data = scraper.search_with_llm(query = "Find me 50 papers about Covid 19 from 2019 to 2025")

In [None]:
# Load environment variables from a .env file; the boolean result is discarded.
_ = load_dotenv()
# Chat model for the LangChain side of the notebook.
# NOTE(review): temperature=1.3 is high for a model that is later wrapped as an evaluation LLM — confirm it is intended.
llm = ChatDeepSeek(model="deepseek-chat", temperature=1.3)
# llm = ChatOpenAI(model="openrouter/sonoma-dusk-alpha")
# DSPy language model pointed at the DeepSeek endpoint; the key is read from DEEPSEEK_API_KEY.
dspy_lm = LM(
    "deepseek/deepseek-chat",
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com",
)

# dspy_lm = LM(
#     "openrouter/openrouter/sonoma-dusk-alpha",
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_BASE_URL"),
# )
# Register the DSPy model globally, then switch on usage tracking.
configure(lm=dspy_lm)
dspy.settings.configure(track_usage=True)
# scraper = PubMedScraper(email = "olandechris@gmail.com")

In [None]:
# df = scraper.search_with_llm(query = "Find papers about the impact of Gaza war on children")

In [None]:
# Load the previously scraped PubMed records from disk (the scraper call above is commented out).
df = pd.read_csv("data/gaza_war_impact_children.csv")

In [None]:
# Preview the first rows of the raw records.
df.head()

Unnamed: 0,Pmid,Title,Abstract,Authors,Journal,Keywords,Url,Affiliations,Publication Date,References
0,40678639,Polio vaccination campaigns in conflicts: succ...,"In conflict settings, public health interventi...","Sabahelzain Majdi M, Agha Hazem, Davidovitch N...",Frontiers in public health,"Humans, Poliomyelitis, Israel, Immunization Pr...",https://www.ncbi.nlm.nih.gov/pubmed/40678639,"Sydney School of Public Health, The University...",2025,"Schwartzstein P. The Rise, Fall, and Possible ..."
1,40475386,Trauma by the Numbers: A Cross-Sectional Analy...,To categorize and analyze trauma cases from th...,"Wajahath Muaaz, Nasser Elias, Nayfeh Tariq, Ir...",International journal of public health,"Humans, Male, Cross-Sectional Studies, Adult, ...",https://www.ncbi.nlm.nih.gov/pubmed/40475386,Michigan State University College of Human Med...,2025,"Magruder KM, McLaughlin KA, Elmore Borbon DL. ..."
2,39957103,"Energy drinks, depression, insomnia, and stres...",Adolescents are increasingly consuming energy ...,"Maraqa Beesan, Fasfoos Ahmad, Alami Mohammad, ...",International journal of adolescent medicine a...,"Humans, Adolescent, Male, Sleep Initiation and...",https://www.ncbi.nlm.nih.gov/pubmed/39957103,"College of Medicine, 115527 Hebron University ...",2025-Feb-01,Statisa . Revenue of the energy & sports drink...
3,39258854,European Academy of Paediatrics demands protec...,,"Koletzko Berthold, da Dalt Liviana, De Guchten...","Acta paediatrica (Oslo, Norway : 1992)",,https://www.ncbi.nlm.nih.gov/pubmed/39258854,"Department of Paediatrics, LMU University of M...",2024-Dec,
4,37497596,Post-traumatic stress in war veterans and seco...,Secondary traumatic stress (STS) has been stud...,"Leshem Shahaf, Keha Eldad, Kalanthroff Eyal",European journal of psychotraumatology,"Child, Female, Humans, Veterans, Compassion Fa...",https://www.ncbi.nlm.nih.gov/pubmed/37497596,"Department of Psychology, The Hebrew Universit...",2023,American Psychiatric Association . (2013). Dia...


In [None]:
# Build the text that gets indexed: the title followed by the abstract.
# - `na_rep=""` keeps the title when the abstract is missing; without it `str.cat`
#   returns NaN for the whole row and the article silently disappears from the corpus.
# - `sep=" "` stops the last word of the title from fusing with the first word of the
#   abstract; `.str.strip()` removes the dangling separator when one side is empty.
df["Article"] = df["Title"].str.cat(df["Abstract"], sep=" ", na_rep="").str.strip()
# The abstract now lives inside "Article"; rebind instead of mutating in place.
df = df.drop(columns=["Abstract"])

In [None]:
# Write the frame (now carrying the combined "Article" column) to the CSV that CSVLoader reads later.
df.to_csv("data/tests.csv", index=False)
# Preview the result.
df.head()

Unnamed: 0,Pmid,Title,Authors,Journal,Keywords,Url,Affiliations,Publication Date,References,Article
0,40678639,Polio vaccination campaigns in conflicts: succ...,"Sabahelzain Majdi M, Agha Hazem, Davidovitch N...",Frontiers in public health,"Humans, Poliomyelitis, Israel, Immunization Pr...",https://www.ncbi.nlm.nih.gov/pubmed/40678639,"Sydney School of Public Health, The University...",2025,"Schwartzstein P. The Rise, Fall, and Possible ...",Polio vaccination campaigns in conflicts: succ...
1,40475386,Trauma by the Numbers: A Cross-Sectional Analy...,"Wajahath Muaaz, Nasser Elias, Nayfeh Tariq, Ir...",International journal of public health,"Humans, Male, Cross-Sectional Studies, Adult, ...",https://www.ncbi.nlm.nih.gov/pubmed/40475386,Michigan State University College of Human Med...,2025,"Magruder KM, McLaughlin KA, Elmore Borbon DL. ...",Trauma by the Numbers: A Cross-Sectional Analy...
2,39957103,"Energy drinks, depression, insomnia, and stres...","Maraqa Beesan, Fasfoos Ahmad, Alami Mohammad, ...",International journal of adolescent medicine a...,"Humans, Adolescent, Male, Sleep Initiation and...",https://www.ncbi.nlm.nih.gov/pubmed/39957103,"College of Medicine, 115527 Hebron University ...",2025-Feb-01,Statisa . Revenue of the energy & sports drink...,"Energy drinks, depression, insomnia, and stres..."
3,39258854,European Academy of Paediatrics demands protec...,"Koletzko Berthold, da Dalt Liviana, De Guchten...","Acta paediatrica (Oslo, Norway : 1992)",,https://www.ncbi.nlm.nih.gov/pubmed/39258854,"Department of Paediatrics, LMU University of M...",2024-Dec,,
4,37497596,Post-traumatic stress in war veterans and seco...,"Leshem Shahaf, Keha Eldad, Kalanthroff Eyal",European journal of psychotraumatology,"Child, Female, Humans, Veterans, Compassion Fa...",https://www.ncbi.nlm.nih.gov/pubmed/37497596,"Department of Psychology, The Hebrew Universit...",2023,American Psychiatric Association . (2013). Dia...,Post-traumatic stress in war veterans and seco...


In [None]:
@dataclass
class FastEmbed(Embeddings):
    """Adapter exposing a fastembed ``TextEmbedding`` model through LangChain's ``Embeddings`` interface."""

    # Wrapped fastembed model; a default-constructed TextEmbedding is used when none is given.
    fe: TextEmbedding = field(default_factory=TextEmbedding)

    def embed_documents(self, texts: list[str]):
        """Embed each text in ``texts`` and return one plain Python list per text."""
        vectors = []
        for vector in self.fe.embed(texts):
            vectors.append(vector.tolist())
        return vectors

    def embed_query(self, text: str):
        """Embed a single query string and return its vector as a plain Python list."""
        query_vectors = list(self.fe.embed([text]))
        first_vector = query_vectors[0]
        return first_vector.tolist()

In [None]:
# Embedding model shared by the splitter, the vector store and the compression filters below.
embeddings = FastEmbed(
    TextEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        cache_dir=os.path.expanduser("~/.cache/fastembed"),
    )
)

# Splitter that chunks documents using the embedding model above.
splitter = SemanticChunker(embeddings)

# Read the CSV written earlier: "Article" becomes the page content, "Pmid" the source,
# and the listed columns are carried along as metadata.
loader = CSVLoader(
    file_path="data/tests.csv",
    source_column="Pmid",
    content_columns=["Article"],
    metadata_columns=[
        "Pmid",
        "Title",
        "Url",
        "Authors",
        "Keywords",
        "Journal",
        "Publication Date",
        "References",
    ],
)
documents = loader.load()

# Chunked documents; used for test-set generation and for building the retriever.
splitted_documents = splitter.split_documents(documents)

# Develop a Gold Dataset for RAG Evaluation

In [None]:
# Wrap a DeepSeek chat model (library-default sampling settings) and the notebook's
# embeddings in the adapter classes that ragas expects.
evaluator_llm = LangchainLLMWrapper(ChatDeepSeek(model="deepseek-chat"))
wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)

# Create TestsetGenerator using the wrapped embeddings and the evaluator LLM
generator = TestsetGenerator(llm=evaluator_llm, embedding_model=wrapped_embeddings)

In [None]:
# (synthesizer, weight) pairs handed to the generator: multi-hop abstract and
# single-hop specific queries are weighted equally at 0.5 each.
query_distribution = [
    (MultiHopAbstractQuerySynthesizer(llm=evaluator_llm), 0.5),
    (SingleHopSpecificQuerySynthesizer(llm=evaluator_llm), 0.5),
]

In [None]:
# Generate the synthetic test set from the chunked documents.
# This cell calls the LLM repeatedly (see the progress bars in its output), so it is slow to re-run.
dataset = generator.generate_with_langchain_docs(
    splitted_documents,
    testset_size=10,  # Generate a larger dataset so we can have more of the test set too
    query_distribution=query_distribution,
)

Applying SummaryExtractor:   0%|          | 0/18 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/24 [00:00<?, ?it/s]

Node b8cbd378-733a-45d9-b139-471242db0c60 does not have a summary. Skipping filtering.
Node 34a94fb9-3a1f-408e-ac91-9db0ce074192 does not have a summary. Skipping filtering.
Node 8699f979-cec9-44d2-b569-2619c98267ef does not have a summary. Skipping filtering.
Node fd5bd1ff-ff32-4f91-b058-5b74ce6e4b2e does not have a summary. Skipping filtering.
Node b734e0b7-319f-42b0-9c31-3398d15cfb9d does not have a summary. Skipping filtering.
Node 757cdbf4-bc68-425d-9d3c-fa0c95fe3484 does not have a summary. Skipping filtering.


Applying EmbeddingExtractor:   0%|          | 0/18 [00:00<?, ?it/s]

Applying ThemesExtractor:   0%|          | 0/24 [00:00<?, ?it/s]

Applying NERExtractor:   0%|          | 0/24 [00:00<?, ?it/s]

Applying CosineSimilarityBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Persist the generated test set so later cells can reload it from disk.
testset_dir = "RAGEvaluation"
os.makedirs(testset_dir, exist_ok=True)
output_csv_path = os.path.join(testset_dir, "generated_testset.csv")

# Convert the ragas test set to a DataFrame and write it out without the index.
df = dataset.to_pandas()
df.to_csv(output_csv_path, index=False)
print(f"Generated testset saved to {output_csv_path}")

Generated testset saved to RAGEvaluation/generated_testset.csv


# Develop a RAG

In [None]:
def batch_process(
    documents: list[Document],
    embeddings: FastEmbed,
    persist_directory: str = "faiss_index",
    batch_size: int = 10,
    force_rebuild: bool = False,
):
    """Load a persisted FAISS index, or build one from ``documents`` in batches.

    Args:
        documents: Documents to index. Only read when an index has to be built.
        embeddings: Embedding model used to load or build the index.
        persist_directory: Directory holding ``index.faiss``; created if missing.
        batch_size: Number of documents embedded per ``add_documents`` call.
        force_rebuild: When True, ignore any persisted index and rebuild it.

    Returns:
        The loaded or newly built FAISS vector store.

    Raises:
        ValueError: If an index must be built but ``documents`` is empty or
            ``batch_size`` is smaller than 1.
    """
    # Make sure the target directory exists before probing for the index file.
    os.makedirs(persist_directory, exist_ok=True)
    index_path = os.path.join(persist_directory, "index.faiss")

    if not force_rebuild and os.path.exists(index_path):
        # NOTE: a persisted index is reused as-is, even if `documents` has changed
        # since it was written; pass force_rebuild=True to refresh it.
        # SECURITY: allow_dangerous_deserialization unpickles the stored docstore.
        # Only load index directories this notebook wrote itself.
        return FAISS.load_local(
            persist_directory, embeddings, allow_dangerous_deserialization=True
        )

    # Fail with a clear message instead of an IndexError on `batched_docs[0]`.
    if not documents:
        raise ValueError("Cannot build a FAISS index from an empty document list")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"Creating new FAISS index at {persist_directory}")
    batched_docs = [
        documents[i : i + batch_size] for i in range(0, len(documents), batch_size)
    ]

    # The first batch creates the index; the remaining batches are appended to it.
    vector_index = FAISS.from_documents(batched_docs[0], embeddings)
    for batch in batched_docs[1:]:
        vector_index.add_documents(batch)

    # Persist the vector index so the next run can load it instead of re-embedding.
    vector_index.save_local(persist_directory)
    return vector_index

In [None]:
@dataclass
class RetrieverConfig:
    """Tunable settings for the hybrid retriever assembled by ``build_retriever``."""

    # Number of documents requested from both the dense (FAISS) and the sparse (BM25) retriever.
    k: int = 15
    # Ensemble weights, passed in the order [sparse, dense].
    sparse_weight: float = 0.65
    dense_weight: float = 0.35
    # Threshold handed to EmbeddingsFilter.
    similarity_threshold: float = 0.6
    # Threshold handed to EmbeddingsRedundantFilter.
    redundancy_threshold: float = 0.95
    # Number of documents to keep after reranking.
    top_n: int = 5
    # NOTE(review): these two defaults look like FlashRank settings, while the reranker
    # defined in this notebook wraps fastembed — confirm before wiring them in.
    reranker_model: str = "ms-marco-MiniLM-L-12-v2"
    reranker_cache_dir: str = "~/.cache/flashrank"

In [None]:
class FastEmbedRerank(BaseDocumentCompressor):
    """Document compressor that reranks candidates with a fastembed cross-encoder.

    The encoder is created in ``__init__`` from ``model_name`` and ``cache_dir``
    unless an ``encoder`` instance is supplied explicitly.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    model_name: str = Field(
        default="Xenova/ms-marco-miniLM-L-6-v2", description="Cross-encoder model name"
    )
    cache_dir: str = Field(
        default="~/.cache/fastembed", description="Cache directory for models"
    )
    top_n: int = Field(default=5, description="Number of top documents to return")
    # Optional: the default is None and the encoder is filled in by __init__.
    encoder: TextCrossEncoder | None = Field(
        default=None, description="Cross encoder instance"
    )

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if self.encoder is None:
            self.encoder = TextCrossEncoder(
                model_name=self.model_name, cache_dir=os.path.expanduser(self.cache_dir)
            )

    def _convert_to_documents(self, docs):
        """Return ``docs`` with stateful documents replaced by plain ``Document`` objects.

        Any item exposing a ``state`` attribute (as produced by the embeddings
        filters earlier in a compressor pipeline) is rebuilt as a ``Document``
        carrying the same page content and metadata. Other items pass through
        unchanged.

        Args:
            docs: Iterable of document-like objects.

        Returns:
            A list with one entry per input item, in the original order.
        """
        return [
            Document(page_content=doc.page_content, metadata=doc.metadata)
            if hasattr(doc, "state")
            else doc
            for doc in docs
        ]

    def _normalize_scores(self, scores):
        """Map raw scores to the range [0, 1] with the sigmoid function.

        Args:
            scores: Sequence of raw numeric scores.

        Returns:
            A list of floats, one per input score.
        """
        probs = 1 / (1 + np.exp(-np.array(scores)))  # Sigmoid
        return probs.tolist()

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Callbacks | None = None,
    ) -> Sequence[Document]:
        """Rerank ``documents`` against ``query`` and return the ``top_n`` best.

        Documents with empty or whitespace-only content are dropped. The rest
        are scored by the cross-encoder, the scores are squashed with a sigmoid,
        and each returned document carries its score under
        ``metadata["relevance_score"]``.

        The returned documents are new ``Document`` objects with copied
        metadata, so the caller's documents are never mutated. They may be the
        very objects held by a vector store's docstore.

        Args:
            documents: Candidate documents to rerank.
            query: Query the documents are scored against.
            callbacks: Accepted for interface compatibility; not used here.

        Returns:
            Up to ``top_n`` documents sorted by descending relevance score, or
            an empty list when there is nothing to rank or reranking fails
            (failures are logged, not raised).
        """
        if not documents:
            logger.error("No documents provided for reranking step")
            return []

        documents = self._convert_to_documents(documents)
        logger.debug(f"Reranking {len(documents)} documents")

        # Filter once so that texts and documents are guaranteed to stay aligned.
        valid_docs = [
            doc for doc in documents if doc.page_content and doc.page_content.strip()
        ]
        if not valid_docs:
            logger.error("The documents provided have no page contents!")
            return []

        try:
            doc_texts = [doc.page_content for doc in valid_docs]
            scores = list(self.encoder.rerank(query, doc_texts))
            norm_scores = self._normalize_scores(scores)

            # Attach scores to fresh copies instead of writing into shared metadata dicts.
            scored_docs = [
                Document(
                    page_content=doc.page_content,
                    metadata={**doc.metadata, "relevance_score": float(score)},
                )
                for doc, score in zip(valid_docs, norm_scores)
            ]

            # Stable sort, highest score first, then keep the top_n.
            scored_docs.sort(key=lambda d: d.metadata["relevance_score"], reverse=True)
            return scored_docs[: self.top_n]

        except Exception as e:
            # Deliberate best-effort: a reranker failure yields no documents rather than a crash.
            logger.error(f"An error occurred during Reranking: {str(e)}")
            return []

In [None]:
def build_retriever(
    splitted_documents: list[Document],
    embeddings: Embeddings,
    config: RetrieverConfig | None = None,
) -> ContextualCompressionRetriever:
    """Assemble the hybrid (BM25 + FAISS) retriever with a compression pipeline.

    The pipeline filters by embedding similarity, removes near-duplicates and
    finally reranks with ``FastEmbedRerank``.

    Args:
        splitted_documents: Chunked documents to index.
        embeddings: Embedding model for the vector store and both filters.
        config: Retrieval settings; defaults to ``RetrieverConfig()``.

    Returns:
        A ``ContextualCompressionRetriever``. If assembling the sparse retriever
        or the compression pipeline fails, the error is logged and the plain
        dense retriever is returned instead.

    Raises:
        ValueError: If ``splitted_documents`` is empty.
    """
    if config is None:
        config = RetrieverConfig()

    if not splitted_documents:
        raise ValueError("No documents are passed")

    # Build (or load) the dense index once, up front. The fallback below reuses it
    # rather than constructing the vector store a second time, and a failure here
    # surfaces directly instead of being retried inside the error handler.
    vector_store = batch_process(splitted_documents, embeddings)
    dense_retriever = vector_store.as_retriever(search_kwargs={"k": config.k})

    try:
        # Dense + Sparse retrieval
        sparse_retriever = BM25Retriever.from_documents(
            splitted_documents, k=config.k
        )  # TODO: Develop a gold dataset to test the effect of weighting
        ensemble_retriever = EnsembleRetriever(
            retrievers=[sparse_retriever, dense_retriever],
            weights=[config.sparse_weight, config.dense_weight],
        )  # Lean more on sparse retrieval, embeddings are not trained on medical data

        # Compression pipeline: relevance filter -> redundancy filter -> rerank.
        pipeline_compressor = DocumentCompressorPipeline(
            transformers=[
                EmbeddingsFilter(
                    embeddings=embeddings,
                    similarity_threshold=config.similarity_threshold,
                ),
                EmbeddingsRedundantFilter(
                    embeddings=embeddings,
                    similarity_threshold=config.redundancy_threshold,
                ),
                # Honour the configured cut-off instead of the reranker's own default.
                # config.reranker_model / reranker_cache_dir are left unused on purpose:
                # their defaults do not name a fastembed cross-encoder.
                FastEmbedRerank(top_n=config.top_n),
            ]
        )

        return ContextualCompressionRetriever(
            base_compressor=pipeline_compressor, base_retriever=ensemble_retriever
        )
    except Exception as e:
        logger.error(f"An error occurred during retrieval: {str(e)}")
        # Fallback to simple dense retriever
        return dense_retriever

In [None]:
# Build the retriever with the default RetrieverConfig (no config argument is passed).
retriever = build_retriever(splitted_documents, embeddings)

# Generate answers with the RAG pipeline

In [None]:
@dataclass
class RAGOutput:
    prompt_name: str
    retriever: Any

    llm_model: Literal["deepseek-chat", "deepseek-reasoner"]
    question: str | None = None
    docs: list["Document"] | None = None

    _prompt_template: Any = field(default=None)
    _llm_instance: Any = field(default=None)
    _retrieved_contexts_list: list[list[str]] = field(default_factory=list)
    _chain: Any = field(default=None)

    def __post_init__(self):
        try:
            self._prompt_template = hub.pull(self.prompt_name)
            self._llm_instance = ChatDeepSeek(model=self.llm_model)
        except Exception as e:
            raise RuntimeError(f"Failed to initialize the RAG: {e}")

    def _format_docs(self, docs: list["Document"]):
        if not docs:
            return ""

        if not isinstance(docs, list):
            raise ValueError(f"Expected list, got {type(docs)}")

        if not all(isinstance(doc, Document) for doc in docs):
            raise ValueError("All items must be Document instances")

        return "\n\n".join(doc.page_content for doc in docs)

    def capture_retrieved_contexts(self, state: dict[str, Any]):
        retrieved_docs = state.get("context", [])
        if not retrieved_docs:
            self._retrieved_contexts_list.append([])
            return state

        if not isinstance(retrieved_docs, list):
            retrieved_docs = [retrieved_docs]

        # Extract content from the list
        retrieved_contexts = [self._extract_content(doc) for doc in retrieved_docs]
        self._retrieved_contexts_list.append(retrieved_contexts)
        return state

    @property
    def retrieved_contexts_list(self) -> list[list[str]]:
        """Get the list of all retrieved contexts."""
        return self._retrieved_contexts_list

    @property
    def prompt_template(self):
        return self._prompt_template

    @property
    def llm_instance(self):
        return self._llm_instance

    def _extract_content(self, doc):
        if isinstance(doc, str):
            return doc
        elif isinstance(doc, Document):
            return doc.page_content
        elif hasattr(doc, "page_content"):
            return doc.page_content
        else:
            return str(doc)

    def create_chain(self):
        self._chain = (
            {
                "context": self.retriever | self._format_docs,
                "question": RunnablePassthrough(),
            }
            | RunnableLambda(self.capture_retrieved_contexts)
            | self._prompt_template
            | self._llm_instance
            | StrOutputParser()
        )
        return self._chain

    def invoke(self, question: str):
        if self._chain is None:
            self.create_chain()
        return self._chain.invoke(question)

In [None]:
def evaluate_rag_pipeline(
    rag_chain: "RAGOutput",
    input_csv_path: str,
    output_csv_path: str,
    question_column: str = "user_input",
) -> pd.DataFrame:
    """Run every question of a test-set CSV through ``rag_chain`` and save the results.

    Args:
        rag_chain: Object exposing ``invoke(question)`` and a
            ``retrieved_contexts_list`` that grows by one entry per call.
        input_csv_path: CSV file containing the questions.
        output_csv_path: Destination CSV; parent directories are created.
        question_column: Name of the column holding the questions.

    Returns:
        The input frame with two added columns: ``response`` (the answer, or
        ``"ERROR: ..."`` when the call failed) and ``retrieved_contexts`` (the
        contexts captured for that question, ``[]`` when none were captured).

    Raises:
        FileNotFoundError: If ``input_csv_path`` does not exist.
        ValueError: If ``question_column`` is missing from the dataset.
    """
    input_path = Path(input_csv_path)
    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_csv_path}")

    # Create the output directory if it doesn't exist
    output_path = Path(output_csv_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load the dataset
    print(f"Loading the dataset from {input_csv_path}")
    data = pd.read_csv(input_csv_path)

    if question_column not in data.columns:
        raise ValueError(f"Column '{question_column}' not found in the dataset")

    responses = []
    contexts_per_question = []
    total = len(data)
    print(f"Processing {total} questions ...")

    for position, user_input in enumerate(data[question_column], start=1):
        # Remember how much the chain had captured before this call. Contexts are
        # then attributed per question: entries left over from earlier invocations
        # are ignored, and a call that fails before retrieval gets an empty list.
        # Rows therefore stay aligned and the column length always matches.
        already_captured = len(rag_chain.retrieved_contexts_list)

        try:
            response = rag_chain.invoke(user_input)
        except Exception as e:
            print(f"Error Processing question {position - 1}: {e}")
            response = f"ERROR: {str(e)}"
        responses.append(response)

        new_contexts = rag_chain.retrieved_contexts_list[already_captured:]
        contexts_per_question.append(new_contexts[-1] if new_contexts else [])

        if position % 10 == 0:
            print(f"Processed {position} / {total} questions")

    # Add results to the dataframe. An object Series keeps ragged lists intact.
    data["response"] = responses
    data["retrieved_contexts"] = pd.Series(
        contexts_per_question, index=data.index, dtype=object
    )

    # Save the results
    data.to_csv(output_csv_path, index=False)
    print(f"Saved results to {output_csv_path}")
    return data

In [None]:
# Instantiate the RAG wrapper: the prompt is pulled from the LangChain hub and the
# retriever built above supplies the context.
rag_chain = RAGOutput(
    prompt_name="rlm/rag-prompt", retriever=retriever, llm_model="deepseek-chat"
)

# Build the chain explicitly (invoke() would also build it lazily on first use),
# then answer every question of the generated test set and save the results.
rag_chain.create_chain()
results = evaluate_rag_pipeline(
    rag_chain=rag_chain,
    input_csv_path="RAGEvaluation/generated_testset.csv",
    output_csv_path="RAGEvaluation/results_deepseek_fastembed.csv",
    question_column="user_input",
)

Loading the dataset from RAGEvaluation/generated_testset.csv
Processing 10 questions ...


[32m2025-10-07 07:38:35.940[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcompress_documents[0m:[36m105[0m - [34m[1mReranking 6 documents[0m
[32m2025-10-07 07:38:49.095[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcompress_documents[0m:[36m105[0m - [34m[1mReranking 12 documents[0m
[32m2025-10-07 07:39:07.158[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcompress_documents[0m:[36m105[0m - [34m[1mReranking 11 documents[0m
[32m2025-10-07 07:39:20.684[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcompress_documents[0m:[36m105[0m - [34m[1mReranking 4 documents[0m
[32m2025-10-07 07:39:28.484[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcompress_documents[0m:[36m105[0m - [34m[1mReranking 11 documents[0m
[32m2025-10-07 07:39:44.286[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcompress_documents[0m:[36m105[0m - [34m[1mReranking 1 documents[0m
[32m2025-10-07 07:39:51.409[0m | [34m[1mDEBUG   [0m | [36m__m

Processed 10 / 10 questions
Saved results to RAGEvaluation/results_deepseek_fastembed.csv


# Evaluate the RAG

In [None]:
# Wrap the notebook-level chat model (created with temperature=1.3) for ragas.
# NOTE(review): RAGEvaluator.run_evaluation constructs its own evaluator LLM, so this
# wrapper does not appear to be used by the evaluation below — confirm or remove.
evaluation_llm = LangchainLLMWrapper(llm)
# Embeddings wrapper passed to the evaluator.
evaluation_embeddings = LangchainEmbeddingsWrapper(embeddings)

In [None]:
@dataclass
class RAGEvaluator:
    """RAG Evaluation pipeline for model-embedding pairs.

    ``generative_models`` and ``embedding_models`` are paired position by
    position (``zip``), not combined as a cross product.
    """

    max_workers: int = 1
    timeout: int = 180
    generative_models: list[str] = field(default_factory=lambda: ["deepseek-chat"])
    embedding_models: list[str] = field(default_factory=lambda: ["fastembed"])
    metrics: list = field(
        default_factory=lambda: [
            ContextRecall(),
            ContextPrecision(),
            AnswerSimilarity(),
            ContextEntityRecall(),
            NoiseSensitivity(),
            Faithfulness(),
        ]
    )

    def __post_init__(self):
        """Initialize RunConfig after dataclass initialization."""
        self.run_config = RunConfig(max_workers=self.max_workers, timeout=self.timeout)

    @staticmethod
    def _parse_context_cell(value):
        """Turn one ``retrieved_contexts`` cell into a Python object.

        Strings (the form the column takes after a CSV round trip) are parsed
        with ``ast.literal_eval``. Lists and tuples are returned as a list, and
        anything else (for example a missing value) becomes an empty list.
        """
        if isinstance(value, str):
            return ast.literal_eval(value)
        if isinstance(value, (list, tuple)):
            return list(value)
        return []

    def parse_contexts(self, data: pd.DataFrame) -> pd.DataFrame:
        """Parse retrieved_contexts from string to list.

        The column is rewritten in place on ``data``, which is also returned.
        Cells that are already lists or that are missing are tolerated, so the
        method also works on frames that never went through a CSV round trip.
        """
        if "retrieved_contexts" in data.columns:
            data["retrieved_contexts"] = data["retrieved_contexts"].apply(
                self._parse_context_cell
            )
        return data

    def prepare_dataset(self, data: pd.DataFrame) -> "EvaluationDataset":
        """Prepare evaluation dataset from dataframe.

        Only the ``user_input``, ``reference``, ``response`` and
        ``retrieved_contexts`` columns are used; a missing one raises ``KeyError``.
        """
        eval_data = data[
            ["user_input", "reference", "response", "retrieved_contexts"]
        ].to_dict(orient="records")
        return EvaluationDataset.from_list(eval_data)

    def run_evaluation(
        self, input_csv_path: str, evaluation_embeddings
    ) -> pd.DataFrame:
        """
        Run evaluation on input data.

        Args:
            input_csv_path: Path to input CSV file
            evaluation_embeddings: Embeddings to use for evaluation

        Returns:
            DataFrame with evaluation results
        """
        data = pd.read_csv(input_csv_path)
        data = self.parse_contexts(data)
        eval_dataset = self.prepare_dataset(data)

        # A fresh judge model is created per run; it is independent of any
        # notebook-level LLM instance.
        evaluator_llm = LangchainLLMWrapper(ChatDeepSeek(model="deepseek-chat"))

        results = evaluate(
            dataset=eval_dataset,
            metrics=self.metrics,
            llm=evaluator_llm,
            embeddings=evaluation_embeddings,
            run_config=self.run_config,
        )

        return results.to_pandas()

    def evaluate_all_models(
        self,
        evaluation_embeddings,
        input_csv_path: str = "RAGEvaluation/results_deepseek_fastembed.csv",
    ):
        """
        Evaluate all model-embedding pairs.

        Results are cached per pair under
        ``RAGEvaluation/evaluation_results_<model>_<embedding>.csv``. An existing
        file is loaded instead of re-running the evaluation.

        Args:
            evaluation_embeddings: Embeddings to use for evaluation
            input_csv_path: CSV with the RAG responses to score. The default is
                the file this notebook writes for the deepseek/fastembed run.
                NOTE: the same input is used for every pair.
        """
        for model, embedding in zip(self.generative_models, self.embedding_models):
            model_pair = f"{model}_{embedding}"
            output_csv_path = f"RAGEvaluation/evaluation_results_{model_pair}.csv"

            if os.path.exists(output_csv_path):
                print(f"Loading existing results for {model_pair}")
                # Loaded but not used further; a corrupt cache file fails here.
                df = pd.read_csv(output_csv_path)
            else:
                print(f"Running evaluation for {model_pair}")

                df = self.run_evaluation(input_csv_path, evaluation_embeddings)
                # Make sure the results directory exists before writing.
                os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
                df.to_csv(output_csv_path, index=False)

Loading existing results for deepseek-chat_fastembed


In [None]:
# Run (or load cached) evaluation results for every configured model/embedding pair.
# Cached result files are reused, so delete them to force a fresh evaluation.
evaluator = RAGEvaluator(max_workers=1, timeout=180)
evaluator.evaluate_all_models(evaluation_embeddings)

In [None]:
# Display the per-question metric scores written by the evaluator.
pd.read_csv("RAGEvaluation/evaluation_results_deepseek-chat_fastembed.csv")

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,context_precision,answer_similarity,context_entity_recall,noise_sensitivity(mode=relevant),faithfulness
0,"How does a family systems approach, as identif...","[""Article: Family systems approach to attachme...","A family systems approach, as identified in re...",A family systems approach identified distinct ...,0.5,0.0,0.927313,0.0,0.166667,0.285714
1,How do traumatic stress symptoms from conflict...,"[""Article: Quality of life, primary traumatisa...",Traumatic stress symptoms from conflict and ma...,"Traumatic stress symptoms during conflict, as ...",1.0,1.0,0.925362,0.0,0.0,0.727273
2,How does prenatal exposure to war trauma in co...,"[""Article: Post-traumatic stress in war vetera...","Based on the provided context, prenatal exposu...","Prenatal exposure to traumatic war events, suc...",0.666667,1.0,0.924171,0.222222,0.571429,0.571429
3,How does prenatal exposure to war trauma in co...,"[""Article: War trauma and infant motor, cognit...",Prenatal exposure to war trauma in Gaza is ass...,"Prenatal exposure to traumatic war events, suc...",0.666667,1.0,0.904264,0.4375,0.0,0.714286
4,How does war trauma from the 2014 Gaza War aff...,"[""Article: War trauma and infant motor, cognit...",War trauma from the 2014 Gaza War negatively a...,Mothers' prenatal exposure to traumatic war ev...,0.666667,0.0,0.908345,0.5,0.333333,1.0
5,"As a Humanitarian Aid Coordinator, can you exp...",['Article: Polio vaccination campaigns in conf...,Polio vaccination campaigns are often successf...,Polio vaccination campaigns frequently succeed...,1.0,1.0,0.836205,0.333333,0.333333,0.888889
6,As a public health researcher focusing on subs...,"[""Article: Trajectories of traumatic stress sy...",The key finding from the trauma case analysis ...,A cross-sectional study was conducted in April...,1.0,1.0,0.914962,0.055556,0.111111,1.0
7,What is the ISI used to measure in the study o...,[],"In the study on Palestinian adolescents, the I...",The ISI is used to measure insomnia in the stu...,0.0,0.0,0.934945,0.0,,0.0
8,As a Humanitarian Aid Coordinator working in c...,"[""Interviewees revealed that their perceptions...","Based on the available data, the Gaza War had ...","The study was conducted during the Gaza war, a...",0.0,1.0,0.694437,0.4,0.4,1.0
9,What is STS in the context of war veterans' fa...,[],"STS stands for Secondary Traumatic Stress, whi...",Secondary traumatic stress (STS) is a conditio...,0.0,0.0,0.864619,0.0,,0.0


In [None]:
# Reload the saved RAG responses for inspection.
# NOTE(review): this rebinds `df`, which earlier cells used for the raw articles and the generated test set.
df = pd.read_csv("RAGEvaluation/results_deepseek_fastembed.csv")

["<1-hop>\n\nArticle: Family systems approach to attachment relations, war trauma, and mental health among Palestinian children and parents.<b>Background</b>: Trauma affects the family unit as a whole; however, most existing research uses individual or, at most, dyadic approaches to analyse families with histories of trauma. <b>Objective</b>: This study aims to identify potentially distinct family types according to attachment, parenting, and sibling relations, to analyse how these family types differ with respect to war trauma, and to explore how children's mental health and cognitive processing differ across these family types. <b>Method:</b> Participants included Palestinian mothers and fathers (<i>N</i>\xa0=\xa0325) and their children (one per family; 49.4% girls; 10-13\xa0years old; mean\xa0±\xa0<i>SD</i> age\xa0=\xa011.35 ± 0.57 years) after the Gaza War of 2008-2009. Both parents reported their exposure to war trauma, secure attachment availability, and parenting practices, as w