#### Imports

In [1]:
import os
from configs import ConfigPath
from utils.utils import read_json_file
from llms.embedding_model import EmbeddingModel
from configs.config import ConfigEnv
from knowledge_graph.connection import Neo4jConnection
from llms.llm import ChatModel
from retrieval.tools.vector_search_tool import VectorSearchTool
from answer_pip_executor import AnswerPipExecutor
from evaluation.evaluation import Evaluator
from evaluation.retrieval_evaluation import compute_scores_for_single_qa

#### Initializations

In [2]:
# --- Models ---
# Embedding model with default settings; the chat model is Gemini 2.0 Flash Lite
# via the "google" provider.
embedding_model = EmbeddingModel()

llm = ChatModel(
    provider="google",
    model_name="gemini-2.0-flash-lite",
).initialize_model()

# --- Neo4j connection ---
# All connection settings are read from ConfigEnv rather than hard-coded here.
neo4j_connection = Neo4jConnection(
    uri=ConfigEnv.NEO4J_URI,
    user=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    database=ConfigEnv.NEO4J_DB,
)

# Shared by the retriever below and by the answer-generation cell.
answer_type = "short"

# --- Retriever ---
vector_search_tool = VectorSearchTool(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
    answer_type=answer_type,
    return_direct=True,
)

retriever_model_name = vector_search_tool.get_model_name()

# --- Data ---
# NOTE(review): the loader below is commented out, so this cell does not define
# `data`; cells that read `data` depend on a later cell having been executed.
# data = read_json_file(file_path=os.path.join(ConfigPath.RAW_DATA_DIR, "test_pqa.json"))
# print(f"Data length: {len(data)}")

2025-04-01 00:30:29,785 [DEBUG] embedding_model - CUDA is available, using GPU


2025-04-01 00:30:50,464 [DEBUG] embedding_model - Embedding model initialized: neuml/pubmedbert-base-embeddings
2025-04-01 00:30:50,510 [DEBUG] llm - Initialized model gemini-2.0-flash-lite
2025-04-01 00:30:54,645 [DEBUG] connection - Connection successful!


In [7]:
# Smoke test on one question: run the retriever and keep only the PMIDs of the hits.
results = vector_search_tool.invoke(
    "Is STAT3 transcription factor regulated by mTORC1?"
)
# NOTE: the name `retrieved_pmdis` (sic) is kept as-is because the scoring cell
# that follows reads it under this spelling.
retrieved_pmdis = [hit['pmid'] for hit in results]
ground_truth_pmids = [
    "22055460",
    "23641065",
    "26026060",
    "24302004",
    "24931163",
]
retrieved_pmdis

['23641065', '22055460']

In [8]:
# Score the retrieved PMIDs against the hand-listed ground truth for this question.
# NOTE(review): the saved output of this cell prints the metrics and then ends in
# `NameError: name 'retrieved' is not defined`. No name `retrieved` is used in this
# cell, so the error presumably comes from inside compute_scores_for_single_qa —
# TODO confirm and fix it there.
compute_scores_for_single_qa(
    retrieved_ids=retrieved_pmdis,
    ground_truth_ids=ground_truth_pmids,
)

Retrieved (k=2): ['23641065', '22055460']
Ground Truth: ['22055460', '23641065', '26026060', '24302004', '24931163']
------------------------------
Precision: 1.0000
Recall:    0.4000
F1-Score:  0.5714
Hit Rate:  1.0000
RR:        1.0000
AP:        0.4000


NameError: name 'retrieved' is not defined

In [None]:
# Run the answer pipeline over `data` with the vector-search retriever.
# NOTE(review): `data` is not defined by any cell above this one (its loader in the
# initialization cell is commented out); it is only assigned further down the
# notebook, so this cell fails on a fresh kernel run top to bottom.
answer_pip_executor = AnswerPipExecutor(source_data=data)
results = answer_pip_executor.generate_answers(
    retriever=vector_search_tool,
    model_name=retriever_model_name,
    answer_type=answer_type,
)

In [None]:
import pandas as pd

# Tabulate the pipeline output and add a boolean column that is True where the
# generated response equals the actual response exactly.
results_df = pd.DataFrame(results).assign(
    response_match=lambda frame: frame['actual_response'] == frame['generated_response']
)

In [None]:
# Preview the first rows of the results table.
results_df.head()

In [None]:
# Show only the context_found, context_order and response_match columns.
results_df[["context_found", "context_order", "response_match"]]

In [None]:
# Build the evaluator from the ground-truth `data` and the pipeline `results`
# (passed as short-answer results), then run the evaluation.
# NOTE(review): relies on `data` and `results` left in the kernel by other cells.
evaluator = Evaluator(ground_truth_data=data, short_answer_results=results)
evaluator.evaluate()

In [None]:
# Display the short-answer metrics from the evaluator created in the previous cell.
evaluator.compute_short_answer_metrics()

In [None]:
# Collect the 'content' field of every entry under results['context'].
# NOTE(review): this indexes `results` by the key 'context', whereas the retrieval
# cell iterates `results` as a list of hits — confirm which `results` is expected here.
contexts = [
    context_entry['content']
    for context_entry in results['context']
]
contexts

In [None]:
from ragas import EvaluationDataset

# Single hand-written sample: the question, the contexts collected in the
# previous cell, the response, and the expected reference answer.
dataset = [
    {
        "user_input": "Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?",
        "retrieved_contexts": contexts,
        "response": "yes",
        "reference": "yes",  # expected response
    },
]
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [None]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import ContextPrecision, ContextRecall, ResponseRelevancy, FactualCorrectness

# Wrap the notebook's chat model and embedding model for use by ragas.
# NOTE(review): LangchainEmbeddingsWrapper presumably expects a LangChain-style
# embeddings object; `embedding_model` is the project's EmbeddingModel — confirm
# it is compatible.
evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embedding = LangchainEmbeddingsWrapper(embedding_model)

# One instance per metric, each with default settings.
context_precision = ContextPrecision()
context_recall = ContextRecall()
response_relevancy = ResponseRelevancy()
factual_correctness = FactualCorrectness()

# Score the one-sample dataset built in the previous cell on all four metrics.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[context_precision, context_recall, response_relevancy, factual_correctness],
    llm=evaluator_llm,
    embeddings=evaluator_embedding
)

result

In [3]:
# read parquet data
import os
import pandas as pd
from configs.config import ConfigPath

from data_collection.reader import BioASQDataReader

In [4]:
# Load the BioASQ training split from the raw-data directory.
# The saved log shows the reader limiting the load to 1000 rows.
asq_reader = BioASQDataReader()
data = asq_reader.read_parquet_file(
    file_path=os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet"),
)

2025-03-26 21:18:22,783 [INFO] reader - Limiting the number of rows to 1000...
2025-03-26 21:18:22,784 [INFO] reader - Data file loaded with shape: (1000, 4)


In [10]:
# Find the first sample whose relevant passages include PMID 20007090 and print
# both the sample and its position in `data`.
# enumerate() tracks the position during the scan, so there is no second pass
# through data.index(sample). That call re-compares dicts that hold NumPy arrays
# and can raise "truth value of an array ... is ambiguous" when an earlier row
# has equal scalar fields (e.g. a duplicated question).
for sample_position, sample in enumerate(data):
    if 20007090 in sample["relevant_passage_ids"]:
        print(sample)
        print(sample_position)
        break

{'question': 'What is Trypan blue used for?', 'answer': 'Trypan blue is used in the "trypan blue exclusion assay" for assessing cell viability/cell death.', 'id': 1159, 'relevant_passage_ids': array([24123008, 24228750, 24127887, 24231574, 24228508, 24289578,
       24244400, 24160177, 20007090, 24300339, 24195509, 24316855,
       24304568, 24236478, 24129092, 24296136, 24256980, 24279639,
       24320727, 24238300, 23354080, 24140394, 24139500, 24190701,
       24312318], dtype=int64)}
213


In [3]:
# Display the reader's data as a list of dicts.
# NOTE(review): this renders the entire return value as cell output; consider
# slicing it (e.g. the first few items) to keep the notebook readable.
asq_reader.get_data_to_dict()

[{'question': 'What is the implication of histone lysine methylation in medulloblastoma?',
  'answer': 'Aberrant patterns of H3K4, H3K9, and H3K27 histone lysine methylation were shown to result in histone code alterations, which induce changes in gene expression, and affect the proliferation rate of cells in medulloblastoma.',
  'id': 1682,
  'relevant_passage_ids': array([23179372, 19270706, 23184418], dtype=int64)},
 {'question': 'What is the role of STAG1/STAG2 proteins in differentiation?',
  'answer': 'STAG1/STAG2 proteins are tumour suppressor proteins that suppress cell proliferation and are essential for differentiation.',
  'id': 3722,
  'relevant_passage_ids': array([26997282, 21589869, 19822671, 29867216, 15361841, 28430577,
         27298259, 12034751, 18276799], dtype=int64)},
 {'question': 'What is the association between cell phone use and glioblastoma?',
  'answer': 'The association between cell phone use and incident glioblastoma remains unclear. Some studies have rep

In [12]:
from data_collection.fetcher import PubMedArticleFetcher

# PubMed article fetcher with default settings.
fetcher = PubMedArticleFetcher()

In [13]:
# Fetch the article for a single PMID; the saved log reports the articles being saved.
# NOTE(review): this rebinds `results`, a name other cells use for retrieval and
# answer-pipeline output.
results = fetcher.fetch_articles(pmids=['20007090'])

2025-03-26 21:22:38,865 [INFO] fetcher - Fetching articles for total PMIDs: 1
Extracting Pubmed data from articles: 100%|██████████| 1/1 [00:00<?, ?it/s]
2025-03-26 21:22:39,589 [INFO] fetcher - Articles saved successfully.


In [5]:
from utils.utils import read_json_file
import os


# Load the BioASQ graph data from the raw-data directory and show its first record.
# NOTE(review): this rebinds `data`, which the parquet cell also assigns; which
# dataset later cells see depends on execution order.
data = read_json_file(
    file_path=os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_graph_data.json"),
)
data[0]

{'question': 'What is the implication of histone lysine methylation in medulloblastoma?',
 'answer': 'Aberrant patterns of H3K4, H3K9, and H3K27 histone lysine methylation were shown to result in histone code alterations, which induce changes in gene expression, and affect the proliferation rate of cells in medulloblastoma.',
 'id': 1682,
 'articles': [{'pmid': '23179372',
   'title': 'OTX2 sustains a bivalent-like state of OTX2-bound promoters in medulloblastoma by maintaining their H3K27me3 levels.',
   'abstract': 'Recent studies showed frequent mutations in histone H3 lysine 27 (H3K27) demethylases in medulloblastomas of Group 3 and Group 4, suggesting a role for H3K27 methylation in these tumors. Indeed, trimethylated H3K27 (H3K27me3) levels were shown to be higher in Group 3 and 4 tumors compared to WNT and SHH medulloblastomas, also in tumors without detectable mutations in demethylases. Here, we report that polycomb genes, required for H3K27 methylation, are consistently upregu

In [1]:
from data_collection.fetcher import MeshTermFetcher

In [2]:
# Fetch definitions for two MeSH terms and display the returned mapping.
# NOTE(review): the saved output lists terms such as 'Nuclear Proteins' rather than
# the two requested here, and the log mentions definitions_test.json — presumably
# the fetcher returns previously stored definitions as well; TODO confirm.
mesh_fetcher = MeshTermFetcher()
results = mesh_fetcher.fetch_definitions(mesh_terms=['Carbon Dioxide', 'Twist-Related Protein 1'])
results

2025-03-26 21:08:01,458 [DEBUG] fetcher - Working on file: definitions_test.json
100%|██████████| 2/2 [00:02<00:00,  1.28s/it]


{'Nuclear Proteins': 'Nuclear Proteins\nProteins found in the nucleus of a cell. Do not confuse with NUCLEOPROTEINS which\nare proteins conjugated with nucleic acids, that are not necessarily present in\nthe nucleus.',
 'Histone-Lysine N-Methyltransferase': 'Histone-Lysine N-Methyltransferase\nAn enzyme that catalyzes the methylation of the epsilon-amino group of lysine\nresidues in proteins to yield epsilon mono-, di-, and trimethyllysine.',
 'DNA-Binding Proteins': 'DNA-Binding Proteins\nProteins which bind to DNA. The family includes proteins which bind to both\ndouble- and single-stranded DNA and also includes specific DNA binding proteins\nin serum which can be used as markers for malignant diseases.',
 'Otx Transcription Factors': 'Otx Transcription Factors\nA family of VERTEBRATE homeodomain proteins that share homology with\northodenticle protein, Drosophila. They regulate GENETIC TRANSCRIPTION and play\nan important role in EMBRYONIC DEVELOPMENT of the BRAIN.',
 'Genes, Tumor 