#### Imports

In [1]:
import os
from configs import ConfigPath
from utils.utils import read_json_file
from llms.embedding_model import EmbeddingModel
from configs.config import ConfigEnv
from knowledge_graph.connection import Neo4jConnection
from llms.llm import ChatModel
from retrieval.tools.vector_search_tool import VectorSimilaritySearchTool
from data_collection.reader import BioASQDataReader
from tqdm import tqdm
from langchain_neo4j import Neo4jVector
from configs.config import ConfigEnv
from retrieval_techniques.similarity_search import SimilaritySearchRetriever

#### Initializations

In [2]:
# models
embedding_model = EmbeddingModel()

llm = ChatModel(provider="google", model_name="gemini-2.0-flash-lite").initialize_model()

# neo4j connection
neo4j_connection = Neo4jConnection(uri=ConfigEnv.NEO4J_URI, 
                 user=ConfigEnv.NEO4J_USER,
                 password=ConfigEnv.NEO4J_PASSWORD,
                 database=ConfigEnv.NEO4J_DB)

# retriever
vector_search_tool = VectorSimilaritySearchTool(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
    return_direct=True,
    k=4
)

similarity_retriever = SimilaritySearchRetriever(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
)

retriever_model_name = vector_search_tool.get_model_name()

# data
data_path = os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet")
reader = BioASQDataReader(samples_limit=3)
data = reader.read_parquet_file(file_path=data_path) 
print(f"Data length: {len(data)}")

2025-04-07 01:22:43,566 [DEBUG] embedding_model - CUDA is available, using GPU
2025-04-07 01:23:06,346 [DEBUG] embedding_model - Embedding model initialized: neuml/pubmedbert-base-embeddings
2025-04-07 01:23:06,359 [DEBUG] llm - Initialized model gemini-2.0-flash-lite
2025-04-07 01:23:10,476 [DEBUG] connection - Connection successful!
2025-04-07 01:23:10,518 [INFO] reader - Limiting the number of rows to 3...
2025-04-07 01:23:10,521 [INFO] reader - Data file loaded with shape: (3, 4)


Data length: 3


In [12]:
def func_a(input: str):
    print(f"Executing func_a with input: {input}")
    
def func_b(input: str):
    print(f"Executing func_b with input: {input}")

def execute_func(**kwargs):
    mapper = {
        "func_a": func_a,
        "func_b": func_b,
    }
    func_name = kwargs.pop("func_name")
    func = mapper.get(func_name)
    if func:
        func_input = kwargs.pop("input", None)
        func(func_input)
    else:
        print(f"Function {func_name} not found.")
        

result = execute_func(func_name="func_a", input="Hello World")
result

Executing func_a with input: Hello World


In [3]:
similarity_retriever.get_relevant_contexts(query="Glioblastoma multiforme (GBM) is", k=4)

[{'pmid': '24348390',
  'content': "Glioblastoma multiforme (GBM) is the most lethal subtype of glioma, classified as a WHO grade 4 infiltrative glioma. The etiology of GBM remains unknown and risk factors can be identified only in a small minority. We report the synchronous occurrence of GBM in an otherwise unrelated married couple, i.e. a husband and his wife, who developed GBM within an interval of 1 month. No specific causative environmental factors were identified for both patients, and the genetic screens were negative for hereditary syndromes. Family history was negative for tumors, and no other incidence of cancer in either siblings, parents or other children was reported. An analysis of the couple's exposure to nonionizing electromagnetic fields and ionizing radiations revealed values within the normal ranges usually found in homes. Overall, conjugal tumors are rarely reported. However, the case reported herein raises important questions about possible etiologic factors.",
  '

In [4]:
similarity_retriever.get_answer_based_on_contexts(query="Glioblastoma multiforme (GBM) is", k=4)

{'answer': 'Glioblastoma multiforme (GBM) is the most lethal subtype of glioma, classified as a WHO grade 4 infiltrative glioma. The context states that GBM is a type of brain tumor.'}

In [3]:
vector_search_tool.invoke("Glioblastoma multiforme (GBM) is ")

[{'pmid': '24348390',
  'content': "Glioblastoma multiforme (GBM) is the most lethal subtype of glioma, classified as a WHO grade 4 infiltrative glioma. The etiology of GBM remains unknown and risk factors can be identified only in a small minority. We report the synchronous occurrence of GBM in an otherwise unrelated married couple, i.e. a husband and his wife, who developed GBM within an interval of 1 month. No specific causative environmental factors were identified for both patients, and the genetic screens were negative for hereditary syndromes. Family history was negative for tumors, and no other incidence of cancer in either siblings, parents or other children was reported. An analysis of the couple's exposure to nonionizing electromagnetic fields and ionizing radiations revealed values within the normal ranges usually found in homes. Overall, conjugal tumors are rarely reported. However, the case reported herein raises important questions about possible etiologic factors.",
  '

In [15]:

embedded_query = embedding_model.embed_query("Glioblastoma multiforme (GBM) is the most lethal subtype of glioma")


vector_query = f"""CALL db.index.vector.queryNodes('contextIndex', $k, {embedded_query})
YIELD node AS context, score
MATCH (article:ARTICLE)-[:HAS_CONTEXT]->(context)
RETURN article.pmid as pmid, context.text_content as content, score as score"""

# vector_query = f"""CALL db.index.vector.queryNodes('contextIndex', 5, {embedded_query})
# YIELD node AS context_node, score
# RETURN context_node.text_content AS content, score"""

neo4j_connection.execute_query(vector_query, params={"k":5, "threshold":0.5})

[{'pmid': '24348390',
  'content': "Glioblastoma multiforme (GBM) is the most lethal subtype of glioma, classified as a WHO grade 4 infiltrative glioma. The etiology of GBM remains unknown and risk factors can be identified only in a small minority. We report the synchronous occurrence of GBM in an otherwise unrelated married couple, i.e. a husband and his wife, who developed GBM within an interval of 1 month. No specific causative environmental factors were identified for both patients, and the genetic screens were negative for hereditary syndromes. Family history was negative for tumors, and no other incidence of cancer in either siblings, parents or other children was reported. An analysis of the couple's exposure to nonionizing electromagnetic fields and ionizing radiations revealed values within the normal ranges usually found in homes. Overall, conjugal tumors are rarely reported. However, the case reported herein raises important questions about possible etiologic factors.",
  '

In [3]:
existing_index = Neo4jVector.from_existing_index(
    embedding_model,
    url=ConfigEnv.NEO4J_URI,
    username=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    index_name="contextIndex",
    text_node_property="text_content",  # Need to define if it is not default
)

ValueError: The specified vector index name does not exist. Make sure to check if you spelled it correctly

In [35]:
existing_index.similarity_search("hey")

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'text_content': expected an expression, 'FOREACH', ',', 'AS', 'ORDER BY', 'CALL', 'CREATE', 'LOAD CSV', 'DELETE', 'DETACH', 'FINISH', 'INSERT', 'LIMIT', 'MATCH', 'MERGE', 'NODETACH', 'OFFSET', 'OPTIONAL', 'REMOVE', 'RETURN', 'SET', 'SKIP', 'UNION', 'UNWIND', 'USE', 'WITH' or <EOF> (line 1, column 164 (offset: 163))
"CALL db.index.vector.queryNodes($vector_index_name, $top_k * $effective_search_ratio, $query_vector) YIELD node, score WITH node, score LIMIT $top_k RETURN node.``text_content`` AS text, score, node {.*, ``text_content``: Null, `embedding`: Null, id: Null } AS metadata"
                                                                                                                                                                    ^}

In [28]:
neo4j_connection.execute_query("""SHOW INDEXES
       YIELD name, type, labelsOrTypes, properties, options
       WHERE type = 'VECTOR'""", params={})

[{'name': 'context_index',
  'type': 'VECTOR',
  'labelsOrTypes': ['CONTEXT'],
  'properties': ['embedding'],
  'options': {'indexProvider': 'vector-2.0',
   'indexConfig': {'vector.hnsw.m': 16,
    'vector.hnsw.ef_construction': 100,
    'vector.dimensions': 768,
    'vector.similarity_function': 'COSINE',
    'vector.quantization.enabled': True}}},
 {'name': 'mesh_index',
  'type': 'VECTOR',
  'labelsOrTypes': ['MESH'],
  'properties': ['embedding'],
  'options': {'indexProvider': 'vector-2.0',
   'indexConfig': {'vector.hnsw.m': 16,
    'vector.hnsw.ef_construction': 100,
    'vector.dimensions': 768,
    'vector.similarity_function': 'COSINE',
    'vector.quantization.enabled': True}}}]

In [27]:

index_name = "context_index"  # default index name

existing_graph = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    url=ConfigEnv.NEO4J_URI,
    username=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    index_name="context_index",
    node_label="CONTEXT",
    # text_node_properties=["text_content"],
    embedding_node_property="embedding",
)

TypeError: Neo4jVector.from_existing_graph() missing 1 required positional argument: 'text_node_properties'

In [26]:
existing_graph.search("Utilization behavior (UB) consists of reaching out and using objects in the environment in an automatic mann", search_type="mmr")

[]

In [3]:


def run_retriever(benchmark_data: list, retriever) -> dict:
    results = {}
    for sample in tqdm(benchmark_data, desc="Executing retriever..."):
        sample_id = sample.get('id')
        question = sample.get('question')
        retrieved_data = retriever.invoke(question)
        results[sample_id] = retrieved_data
    return results

results = run_retriever(benchmark_data=data, retriever=vector_search_tool)
results

Executing retriever...: 100%|██████████| 3/3 [00:00<00:00,  3.21it/s]


{1682: [{'pmid': '26598646',
   'content': 'Covalent post-translational modifications (PTMs) of proteins can regulate the structural and functional state of a protein in the absence of primary changes in the underlying sequence. Common PTMs include phosphorylation, acetylation, and methylation. Histone proteins are critical regulators of the genome and are subject to a highly abundant and diverse array of PTMs. To highlight the functional complexity added to the proteome by lysine methylation signaling, here we will focus on lysine methylation of histone proteins, an important modification in the regulation of chromatin and epigenetic processes. We review the signaling pathways and functions associated with a single residue, H4K20, as a model chromatin and clinically important mark that regulates biological processes ranging from the DNA damage response and DNA replication to gene expression and silencing.',
   'score': 0.8392342925071716},
  {'pmid': '20735237',
   'content': 'The mol

In [None]:


metrics, new_evaluator = run_evaluation_on_retrieved_chunks(
    retrieval_results=results,
    benchmark_data=data
)
print(metrics)

defaultdict(<class 'dict'>, {'precision@1': 0.6666666666666666, 'recall@1': 0.056644880174291944, 'f1@1': 0.1037037037037037, 'mrr@1': 0.6666666666666666, 'ndcg@1': 0.6666666666666666, 'success@1': 0.6666666666666666, 'map@1': 0.056644880174291944, 'coverage@1': 0.06896551724137931, 'precision@3': 0.7777777777777777, 'recall@3': 0.24400871459694992, 'f1@3': 0.3323232323232323, 'mrr@3': 0.7777777777777777, 'ndcg@3': 0.6666666666666666, 'success@3': 1.0, 'map@3': 0.1699346405228758, 'coverage@3': 0.20689655172413793, 'precision@5': 0.6666666666666666, 'recall@5': 0.26361655773420484, 'f1@5': 0.3257575757575757, 'mrr@5': 0.7777777777777777, 'ndcg@5': 0.5959286110065073, 'success@5': 1.0, 'map@5': 0.20915032679738563, 'coverage@5': 0.2413793103448276, 'precision@10': 0.6222222222222222, 'recall@10': 0.4139433551198257, 'f1@10': 0.37762237762237766, 'mrr@10': 0.7777777777777777, 'ndcg@10': 0.5633377760765029, 'success@10': 1.0, 'map@10': 0.30980392156862746, 'coverage@10': 0.344827586206896

### RAGAS METRICS

In [6]:
vector_search_tool = VectorSimilaritySearchTool(
        llm=llm,
        embedding_model=embedding_model,
        neo4j_connection=neo4j_connection,
        return_direct=False,
    )

In [11]:
results = vector_search_tool.invoke("What is the implication of histone lysine methylation in medulloblastoma?")

In [27]:
print(results['answer'])

Histone lysine methylation, particularly at H3K9, is implicated in the pathogenesis of medulloblastoma. Copy number aberrations in genes involved in writing, reading, removing, and blocking histone lysine methylation suggest that defective control of the histone code contributes to the development of this cancer. Additionally, the study found that restoration of expression of genes controlling H3K9 methylation greatly diminishes proliferation of medulloblastoma in vitro.


In [17]:
contexts = [sample['content'] for sample in results['context']]
contexts

['Covalent post-translational modifications (PTMs) of proteins can regulate the structural and functional state of a protein in the absence of primary changes in the underlying sequence. Common PTMs include phosphorylation, acetylation, and methylation. Histone proteins are critical regulators of the genome and are subject to a highly abundant and diverse array of PTMs. To highlight the functional complexity added to the proteome by lysine methylation signaling, here we will focus on lysine methylation of histone proteins, an important modification in the regulation of chromatin and epigenetic processes. We review the signaling pathways and functions associated with a single residue, H4K20, as a model chromatin and clinically important mark that regulates biological processes ranging from the DNA damage response and DNA replication to gene expression and silencing.',
 'The molecular biology of histone H4 lysine 20 (H4K20) methylation, like many other post-translational modifications of

In [24]:
from ragas.llms import LangchainLLMWrapper

llm = ChatModel(provider="google", model_name="gemini-2.0-flash-lite").initialize_model()
evaluator_llm = LangchainLLMWrapper(llm)

2025-04-03 20:28:54,617 [DEBUG] llm - Initialized model gemini-2.0-flash-lite


In [25]:

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import ContextRelevance

sample = SingleTurnSample(
    user_input="What is the implication of histone lysine methylation in medulloblastoma?",
    retrieved_contexts=contexts
)

scorer = ContextRelevance(llm=evaluator_llm)
score = await scorer.single_turn_ascore(sample)
print(score)

1.0


In [32]:
from ragas import EvaluationDataset


dataset = []
dataset.append(
        {
            "id": 1,
            "user_input": "What is the implication of histone lysine methylation in medulloblastoma?",
            "retrieved_contexts": contexts,
            "response": "Histone lysine methylation, particularly at H3K9, is implicated in the pathogenesis of medulloblastoma. Copy number aberrations in genes involved in writing, reading, removing, and blocking histone lysine methylation suggest that defective control of the histone code contributes to the development of this cancer. Additionally, the study found that restoration of expression of genes controlling H3K9 methylation greatly diminishes proliferation of medulloblastoma in vitro.",
            "reference": "Aberrant patterns of H3K4, H3K9, and H3K27 histone lysine methylation were shown to result in histone code alterations, which induce changes in gene expression, and affect the proliferation rate of cells in medulloblastoma.", # expected response
        }
    )
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [34]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import LLMContextPrecisionWithReference, ContextRecall, ResponseRelevancy, FactualCorrectness

evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embedding = LangchainEmbeddingsWrapper(embedding_model)

context_precision = LLMContextPrecisionWithReference()
context_recall = ContextRecall()
response_relevancy = ResponseRelevancy()
factual_correctness = FactualCorrectness()

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[context_precision, context_recall, response_relevancy, factual_correctness],
    llm=evaluator_llm,
    embeddings=evaluator_embedding
)

result

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

{'llm_context_precision_with_reference': 0.6556, 'context_recall': 1.0000, 'answer_relevancy': 0.8296, 'factual_correctness(mode=f1)': 0.5700}

In [None]:
# read paquet data
import os
import pandas as pd
from configs.config import ConfigPath

from data_collection.reader import BioASQDataReader

In [None]:
asq_reader = BioASQDataReader()
data = asq_reader.read_parquet_file(file_path=os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet"))

In [None]:
for sample in data:
    if 20007090 in sample["relevant_passage_ids"]:
        print(sample)
        print(data.index(sample))
        break

In [None]:
asq_reader.get_data_to_dict()

In [None]:
from data_collection.fetcher import PubMedArticleFetcher

fetcher = PubMedArticleFetcher()

In [None]:
results = fetcher.fetch_articles(pmids=['20007090'])