In [None]:
import os
import warnings
from dataclasses import dataclass, field

import dspy
import nest_asyncio
import pandas as pd
from dotenv import load_dotenv
from dspy import LM, configure
from fastembed import TextEmbedding
from langchain_community.document_loaders import CSVLoader
from langchain_core.embeddings import Embeddings
from langchain_deepseek import ChatDeepSeek
from langchain_experimental.text_splitter import SemanticChunker
from ragas.embeddings.base import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import (
    MultiHopAbstractQuerySynthesizer,
    SingleHopSpecificQuerySynthesizer,
)

# Suppress every warning category for the rest of the session so library
# noise does not clutter cell output.
# NOTE(review): this is a blanket filter and also hides warnings that may matter.
warnings.simplefilter("ignore")
# NOTE(review): presumably required so async library calls can run inside the
# notebook's already-running event loop — confirm before removing.
nest_asyncio.apply()

In [None]:
# from pubmed_scraper import PubMedScraper

# scraper = PubMedScraper(email = "olandechris@gmail.com")

# data = scraper.search_with_llm(query = "Find me 50 papers about Covid 19 from 2019 to 2025")

In [None]:
# Load variables from a local .env file (DEEPSEEK_API_KEY is read below).
_ = load_dotenv()

# LangChain chat model handle.
llm = ChatDeepSeek(temperature=1.3, model="deepseek-chat")
# llm = ChatOpenAI(model="openrouter/sonoma-dusk-alpha")

# DSPy language model pointed at the DeepSeek endpoint.
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
dspy_lm = dspy.LM(
    "deepseek/deepseek-chat",
    base_url="https://api.deepseek.com",
    api_key=deepseek_api_key,
)

# dspy_lm = LM(
#     "openrouter/openrouter/sonoma-dusk-alpha",
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_BASE_URL"),
# )

# Register the LM as the DSPy default and enable usage tracking in one call.
dspy.configure(lm=dspy_lm, track_usage=True)
# scraper = PubMedScraper(email = "olandechris@gmail.com")

In [None]:
# df = scraper.search_with_llm(query = "Find papers about the impact of Gaza war on children")

In [None]:
# Load the previously exported article table from disk.
# NOTE(review): presumably produced by the commented-out PubMedScraper search.
raw_csv_path = "data/gaza_war_impact_children.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Pmid,Title,Abstract,Authors,Journal,Keywords,Url,Affiliations,Publication Date,References
0,40678639,Polio vaccination campaigns in conflicts: succ...,"In conflict settings, public health interventi...","Sabahelzain Majdi M, Agha Hazem, Davidovitch N...",Frontiers in public health,"Humans, Poliomyelitis, Israel, Immunization Pr...",https://www.ncbi.nlm.nih.gov/pubmed/40678639,"Sydney School of Public Health, The University...",2025,"Schwartzstein P. The Rise, Fall, and Possible ..."
1,40475386,Trauma by the Numbers: A Cross-Sectional Analy...,To categorize and analyze trauma cases from th...,"Wajahath Muaaz, Nasser Elias, Nayfeh Tariq, Ir...",International journal of public health,"Humans, Male, Cross-Sectional Studies, Adult, ...",https://www.ncbi.nlm.nih.gov/pubmed/40475386,Michigan State University College of Human Med...,2025,"Magruder KM, McLaughlin KA, Elmore Borbon DL. ..."
2,39957103,"Energy drinks, depression, insomnia, and stres...",Adolescents are increasingly consuming energy ...,"Maraqa Beesan, Fasfoos Ahmad, Alami Mohammad, ...",International journal of adolescent medicine a...,"Humans, Adolescent, Male, Sleep Initiation and...",https://www.ncbi.nlm.nih.gov/pubmed/39957103,"College of Medicine, 115527 Hebron University ...",2025-Feb-01,Statisa . Revenue of the energy & sports drink...
3,39258854,European Academy of Paediatrics demands protec...,,"Koletzko Berthold, da Dalt Liviana, De Guchten...","Acta paediatrica (Oslo, Norway : 1992)",,https://www.ncbi.nlm.nih.gov/pubmed/39258854,"Department of Paediatrics, LMU University of M...",2024-Dec,
4,37497596,Post-traumatic stress in war veterans and seco...,Secondary traumatic stress (STS) has been stud...,"Leshem Shahaf, Keha Eldad, Kalanthroff Eyal",European journal of psychotraumatology,"Child, Female, Humans, Veterans, Compassion Fa...",https://www.ncbi.nlm.nih.gov/pubmed/37497596,"Department of Psychology, The Hebrew Universit...",2023,American Psychiatric Association . (2013). Dia...


In [None]:
# Build the text that is later loaded as document content: title followed by
# abstract.
# `Series.str.cat` propagates NaN by default, so a record with a missing
# Abstract used to end up with an empty Article even though it has a Title.
# `na_rep=""` treats missing values as empty strings instead, `sep=" "` keeps
# the title from running straight into the abstract's first word, and the final
# strip removes the dangling separator when one side is empty.
df["Article"] = (
    df["Title"].str.cat(df["Abstract"], sep=" ", na_rep="").str.strip()
)
# The abstract now lives inside Article; drop the redundant column by
# rebinding instead of mutating the frame in place.
df = df.drop(columns=["Abstract"])

In [None]:
# Persist the prepared frame without the index; this file is what CSVLoader
# reads further down.
tests_csv_path = "data/tests.csv"
df.to_csv(tests_csv_path, index=False)
df.head(n=5)

Unnamed: 0,Pmid,Title,Authors,Journal,Keywords,Url,Affiliations,Publication Date,References,Article
0,40678639,Polio vaccination campaigns in conflicts: succ...,"Sabahelzain Majdi M, Agha Hazem, Davidovitch N...",Frontiers in public health,"Humans, Poliomyelitis, Israel, Immunization Pr...",https://www.ncbi.nlm.nih.gov/pubmed/40678639,"Sydney School of Public Health, The University...",2025,"Schwartzstein P. The Rise, Fall, and Possible ...",Polio vaccination campaigns in conflicts: succ...
1,40475386,Trauma by the Numbers: A Cross-Sectional Analy...,"Wajahath Muaaz, Nasser Elias, Nayfeh Tariq, Ir...",International journal of public health,"Humans, Male, Cross-Sectional Studies, Adult, ...",https://www.ncbi.nlm.nih.gov/pubmed/40475386,Michigan State University College of Human Med...,2025,"Magruder KM, McLaughlin KA, Elmore Borbon DL. ...",Trauma by the Numbers: A Cross-Sectional Analy...
2,39957103,"Energy drinks, depression, insomnia, and stres...","Maraqa Beesan, Fasfoos Ahmad, Alami Mohammad, ...",International journal of adolescent medicine a...,"Humans, Adolescent, Male, Sleep Initiation and...",https://www.ncbi.nlm.nih.gov/pubmed/39957103,"College of Medicine, 115527 Hebron University ...",2025-Feb-01,Statisa . Revenue of the energy & sports drink...,"Energy drinks, depression, insomnia, and stres..."
3,39258854,European Academy of Paediatrics demands protec...,"Koletzko Berthold, da Dalt Liviana, De Guchten...","Acta paediatrica (Oslo, Norway : 1992)",,https://www.ncbi.nlm.nih.gov/pubmed/39258854,"Department of Paediatrics, LMU University of M...",2024-Dec,,
4,37497596,Post-traumatic stress in war veterans and seco...,"Leshem Shahaf, Keha Eldad, Kalanthroff Eyal",European journal of psychotraumatology,"Child, Female, Humans, Veterans, Compassion Fa...",https://www.ncbi.nlm.nih.gov/pubmed/37497596,"Department of Psychology, The Hebrew Universit...",2023,American Psychiatric Association . (2013). Dia...,Post-traumatic stress in war veterans and seco...


In [None]:
@dataclass
class FastEmbed(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a fastembed ``TextEmbedding``.

    Attributes:
        fe: The fastembed model that computes the vectors. When omitted, a
            ``TextEmbedding`` is constructed with its default arguments.
    """

    fe: TextEmbedding = field(default_factory=TextEmbedding)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed every text in ``texts``.

        Returns:
            One vector per input text, in input order, each converted to a
            plain Python list via ``tolist()``.
        """
        vectors = []
        for vector in self.fe.embed(texts):
            vectors.append(vector.tolist())
        return vectors

    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string and return its vector as a plain list."""
        # A query is embedded exactly like a one-element document batch.
        return self.embed_documents([text])[0]

In [None]:
# Embedding model used by the semantic splitter below.
embedding_model = TextEmbedding(
    # TODO, use a medical embedding based on pubmed
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_dir=os.path.expanduser("~/.cache/fastembed"),
)
embeddings = FastEmbed(embedding_model)

# Splitter that chunks documents using the embeddings above.
splitter = SemanticChunker(embeddings)

# Columns carried along as document metadata; only "Article" becomes content.
metadata_columns = [
    "Pmid",
    "Title",
    "Url",
    "Authors",
    "Keywords",
    "Journal",
    "Publication Date",
    "References",
]
loader = CSVLoader(
    file_path="data/tests.csv",
    source_column="Pmid",
    content_columns=["Article"],
    metadata_columns=metadata_columns,
)

# Load the CSV into documents, then split them into chunks.
documents = loader.load()
splitted_documents = splitter.split_documents(documents)

# Develop a Gold Dataset for RAG Evaluation

In [None]:
# Wrap the LangChain chat model and embeddings in the ragas adapter types.
evaluator_chat_model = ChatDeepSeek(model="deepseek-chat")
evaluator_llm = LangchainLLMWrapper(evaluator_chat_model)
wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)

# Test-set generator driven by the wrapped LLM and embeddings.
generator = TestsetGenerator(
    embedding_model=wrapped_embeddings,
    llm=evaluator_llm,
)

In [None]:
# Equal weighting (0.5 each) between multi-hop abstract and single-hop
# specific query synthesizers, both using the evaluator LLM.
synthesizer_classes = (
    MultiHopAbstractQuerySynthesizer,
    SingleHopSpecificQuerySynthesizer,
)
query_distribution = [
    (synthesizer_cls(llm=evaluator_llm), 0.5)
    for synthesizer_cls in synthesizer_classes
]

In [None]:
# Number of synthetic samples requested; raise it for a larger evaluation set.
testset_size = 10
dataset = generator.generate_with_langchain_docs(
    splitted_documents,
    testset_size=testset_size,
    query_distribution=query_distribution,
)

Applying SummaryExtractor:   0%|          | 0/18 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/24 [00:00<?, ?it/s]

Node b8cbd378-733a-45d9-b139-471242db0c60 does not have a summary. Skipping filtering.
Node 34a94fb9-3a1f-408e-ac91-9db0ce074192 does not have a summary. Skipping filtering.
Node 8699f979-cec9-44d2-b569-2619c98267ef does not have a summary. Skipping filtering.
Node fd5bd1ff-ff32-4f91-b058-5b74ce6e4b2e does not have a summary. Skipping filtering.
Node b734e0b7-319f-42b0-9c31-3398d15cfb9d does not have a summary. Skipping filtering.
Node 757cdbf4-bc68-425d-9d3c-fa0c95fe3484 does not have a summary. Skipping filtering.


Applying EmbeddingExtractor:   0%|          | 0/18 [00:00<?, ?it/s]

Applying ThemesExtractor:   0%|          | 0/24 [00:00<?, ?it/s]

Applying NERExtractor:   0%|          | 0/24 [00:00<?, ?it/s]

Applying CosineSimilarityBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Resolve the output location first so the directory name is written once.
output_dir = "RAGEvaluation"
output_csv_path = os.path.join(output_dir, "generated_testset.csv")
os.makedirs(output_dir, exist_ok=True)

# NOTE: `df` is rebound here from the article table to the generated test set.
df = dataset.to_pandas()
df.to_csv(output_csv_path, index=False)
print(f"Generated testset saved to {output_csv_path}")

Generated testset saved to RAGEvaluation/generated_testset.csv


# Develop a RAG