#### Imports

In [3]:
import os
from utils.utils import read_json_file, save_json_file
from llms.embedding_model import EmbeddingModel
from configs.config import ConfigEnv, ConfigPath
from knowledge_graph.connection import Neo4jConnection
from llms.llm import ChatModel
from data_collection.reader import BioASQDataReader
from tqdm import tqdm
from langchain_neo4j import Neo4jVector
from retrieval_techniques.similarity_search import SimilaritySearchRetriever

In [43]:
# Load the BioASQ graph dataset from the raw-data directory.
# os.path.join only takes positional path components: the previous
# `os.path.join(path=...)` wrapper was redundant and its keyword is accepted
# by the Windows implementation only, raising TypeError on POSIX.
graph_data_path = os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_graph_data.json")
graph_data = read_json_file(file_path=graph_data_path)
len(graph_data)  # NOTE: not displayed; only the cell's last expression is rendered
graph_data[0]


{'question': 'What is the implication of histone lysine methylation in medulloblastoma?',
 'answer': 'Aberrant patterns of H3K4, H3K9, and H3K27 histone lysine methylation were shown to result in histone code alterations, which induce changes in gene expression, and affect the proliferation rate of cells in medulloblastoma.',
 'id': 1682,
 'articles': [{'pmid': '23179372',
   'title': 'OTX2 sustains a bivalent-like state of OTX2-bound promoters in medulloblastoma by maintaining their H3K27me3 levels.',
   'abstract': 'Recent studies showed frequent mutations in histone H3 lysine 27 (H3K27) demethylases in medulloblastomas of Group 3 and Group 4, suggesting a role for H3K27 methylation in these tumors. Indeed, trimethylated H3K27 (H3K27me3) levels were shown to be higher in Group 3 and 4 tumors compared to WNT and SHH medulloblastomas, also in tumors without detectable mutations in demethylases. Here, we report that polycomb genes, required for H3K27 methylation, are consistently upregu

In [46]:
# Pull out every record with the id under investigation for manual inspection.
target_id = 1159
issued_data = [record for record in graph_data if record['id'] == target_id]
issued_data

[{'question': 'What is Trypan blue used for?',
  'answer': 'Trypan blue is used in the "trypan blue exclusion assay" for assessing cell viability/cell death.',
  'id': 1159,
  'articles': [{'pmid': '24123008',
    'title': 'Conjugated linoleate reduces prostate cancer viability whereas the effects of oleate and stearate are cell line-dependent.',
    'abstract': 'BACKGROUND: In this study, responses to fatty acid treatments in commonly used prostate cancer cell culture models and variability of gene expression between them were determined. MATERIALS AND METHODS: PC3, DU145, LNCaP, VCaP and PNT2 cells were treated with 100 muM of either oleate, stearate or conjugated linoleate. Cell proliferation and viability were assessed using trypan blue and 3-(4, 5-dimethylthiazol-2-yl)-2, 5-diphenyltetrazolium bromide (MTT) assay respectively. Gene expression was measured using real-time polymerase chain reaction (PCR). RESULTS: Conjugated linoleic acid reduced cell proliferation and viability in 

#### Initializations

In [2]:
# models
# Embedding model and chat LLM shared by the retrievers created below.
embedding_model = EmbeddingModel()

llm = ChatModel(provider="google", model_name="gemini-2.0-flash-lite").initialize_model()

# neo4j connection
# URI, credentials and target database are read from ConfigEnv.
neo4j_connection = Neo4jConnection(uri=ConfigEnv.NEO4J_URI, 
                 user=ConfigEnv.NEO4J_USER,
                 password=ConfigEnv.NEO4J_PASSWORD,
                 database=ConfigEnv.NEO4J_DB)

# retriever
# NOTE(review): VectorSimilaritySearchTool does not appear in the visible
# imports cell — confirm its module and import it, otherwise this raises
# NameError on a fresh kernel.
vector_search_tool = VectorSimilaritySearchTool(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
    return_direct=True,
    k=4
)

similarity_retriever = SimilaritySearchRetriever(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
)

retriever_model_name = vector_search_tool.get_model_name()

# data
# Only 3 samples are requested here to keep this exploratory run small.
data_path = os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet")
reader = BioASQDataReader(samples_limit=3)
data = reader.read_parquet_file(file_path=data_path) 
print(f"Data length: {len(data)}")

2025-04-08 22:58:37,704 [DEBUG] embedding_model - CUDA is available, using GPU
2025-04-08 22:58:58,096 [DEBUG] embedding_model - Embedding model initialized: neuml/pubmedbert-base-embeddings
2025-04-08 22:58:58,106 [DEBUG] llm - Initialized model gemini-2.0-flash-lite
2025-04-08 22:59:02,211 [DEBUG] connection - Connection successful!
2025-04-08 22:59:02,257 [INFO] reader - Limiting the number of rows to 3...
2025-04-08 22:59:02,259 [INFO] reader - Data file loaded with shape: (3, 4)


Data length: 3


In [3]:
# Smoke-test plain similarity retrieval, asking for 4 contexts for a sentence prefix.
similarity_retriever.get_relevant_contexts(query="Glioblastoma multiforme (GBM) is", k=4)

[{'element_id': '4:d362a474-c8ba-449f-aeb1-9e0df44dbd24:3529',
  'pmid': '24348390',
  'content': "Glioblastoma multiforme (GBM) is the most lethal subtype of glioma, classified as a WHO grade 4 infiltrative glioma. The etiology of GBM remains unknown and risk factors can be identified only in a small minority. We report the synchronous occurrence of GBM in an otherwise unrelated married couple, i.e. a husband and his wife, who developed GBM within an interval of 1 month. No specific causative environmental factors were identified for both patients, and the genetic screens were negative for hereditary syndromes. Family history was negative for tumors, and no other incidence of cancer in either siblings, parents or other children was reported. An analysis of the couple's exposure to nonionizing electromagnetic fields and ionizing radiations revealed values within the normal ranges usually found in homes. Overall, conjugal tumors are rarely reported. However, the case reported herein rai

In [10]:
# Try the 1-hop variant of the retriever and check how many contexts come back.
circrna_query = "Circular RNAs (circRNAs) are a large type of noncoding RNAs "
results = similarity_retriever.get_1_hop_similar_contexts(
    query=circrna_query,
    k=10,
    n_similar_contexts=5,
)
len(results)

13

In [11]:
# Display the contexts returned by the 1-hop retrieval above.
results

[{'element_id': '4:d362a474-c8ba-449f-aeb1-9e0df44dbd24:3780',
  'pmid': '28634583',
  'content': "Circular RNAs (circRNAs) are currently classed as non-coding RNA (ncRNA) that, unlike linear RNAs, form covalently closed continuous loops and act as gene regulators in mammals. They were originally thought to represent errors in splicing and considered to be of low abundance, however, there is now an increased appreciation of their important function in gene regulation. circRNAs are differentially generated by backsplicing of exons or from lariat introns. Unlike linear RNA, the 3' and 5' ends normally present in an RNA molecule have been joined together by covalent bonds leading to circularization. Interestingly, they have been found to be abundant, evolutionally conserved and relatively stable in the cytoplasm. These features confer numerous potential functions to circRNAs, such as acting as miRNA sponges, or binding to RNA-associated proteins to form RNA-protein complexes that regulate

In [16]:

def func1(**kwargs):
    """Print the received keyword arguments as a dict."""
    print(kwargs)


# A dict unpacked with ** arrives in the callee as keyword arguments.
func_args = dict(a=2)
func1(**func_args)

{'a': 2}


In [None]:
# Wrap an already-created Neo4j vector index as a LangChain vector store.
# `database` is passed so the store targets the same database as
# `neo4j_connection`, instead of falling back to the driver/env default.
# NOTE(review): this cell uses "contextIndex" while a later cell uses
# "context_index" — confirm the real name with the SHOW INDEXES cell.
existing_index = Neo4jVector.from_existing_index(
    embedding_model,
    url=ConfigEnv.NEO4J_URI,
    username=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    database=ConfigEnv.NEO4J_DB,
    index_name="contextIndex",
    text_node_property="text_content",  # Need to define if it is not default
)

In [None]:
# Throwaway query to check that the index-backed store responds.
existing_index.similarity_search("hey")

In [None]:
# List the vector indexes defined in the database (name, labels, properties, options).
neo4j_connection.execute_query("""SHOW INDEXES
       YIELD name, type, labelsOrTypes, properties, options
       WHERE type = 'VECTOR'""", params={})

In [None]:

# Name of the vector index over CONTEXT nodes; defined once and reused below.
index_name = "context_index"

# Build a vector store on top of the existing CONTEXT nodes.
# - `text_node_properties` is a required argument of from_existing_graph, so it
#   is restored here using the same property as the from_existing_index cell.
# - `database` matches the database used by `neo4j_connection`.
# NOTE(review): from_existing_graph may compute and write embeddings for nodes
# that lack `embedding` — confirm that is acceptable before running.
existing_graph = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    url=ConfigEnv.NEO4J_URI,
    username=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    database=ConfigEnv.NEO4J_DB,
    index_name=index_name,
    node_label="CONTEXT",
    text_node_properties=["text_content"],
    embedding_node_property="embedding",
)

In [None]:
# Query the graph-backed store with search_type="mmr".
# NOTE(review): confirm the installed Neo4jVector version supports the "mmr" search type.
existing_graph.search("Utilization behavior (UB) consists of reaching out and using objects in the environment in an automatic mann", search_type="mmr")

In [None]:


def run_retriever(benchmark_data: list, retriever) -> dict:
    """Invoke the retriever once per benchmark sample.

    Args:
        benchmark_data: Samples exposing ``.get('id')`` and ``.get('question')``.
        retriever: Object exposing ``invoke(question)``.

    Returns:
        Mapping from each sample's id to the value returned by
        ``retriever.invoke`` for that sample's question. A repeated id keeps
        the result of the last sample carrying it.
    """
    progress = tqdm(benchmark_data, desc="Executing retriever...")
    return {
        item.get('id'): retriever.invoke(item.get('question'))
        for item in progress
    }

# Run the vector-search tool over the loaded benchmark samples; `results` maps sample id -> retrieved data.
results = run_retriever(benchmark_data=data, retriever=vector_search_tool)
results

In [None]:


# Score the retrieved chunks against the benchmark data.
# NOTE(review): run_evaluation_on_retrieved_chunks is not imported in any
# visible cell — confirm its module and import it before running.
metrics, new_evaluator = run_evaluation_on_retrieved_chunks(
    retrieval_results=results,
    benchmark_data=data
)
print(metrics)

### RAGAS METRICS

In [None]:
# Rebuild the search tool with return_direct=False; the cells below read
# 'answer' and 'context' from its result.
rag_tool_kwargs = dict(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
    return_direct=False,
)
vector_search_tool = VectorSimilaritySearchTool(**rag_tool_kwargs)

In [None]:
# Ask a single benchmark question through the tool.
results = vector_search_tool.invoke("What is the implication of histone lysine methylation in medulloblastoma?")

In [None]:
# Show the generated answer from the tool's result.
print(results['answer'])

In [None]:
# Keep only the text of each retrieved context; this is what the RAGAS samples consume.
contexts = [retrieved['content'] for retrieved in results['context']]
contexts

In [None]:
from ragas.llms import LangchainLLMWrapper

# Wrap a chat model so RAGAS metrics can use it as the judge LLM.
# NOTE(review): this re-creates `llm` with the same provider/model as the
# initialization cell — confirm whether a separate instance is intended.
llm = ChatModel(provider="google", model_name="gemini-2.0-flash-lite").initialize_model()
evaluator_llm = LangchainLLMWrapper(llm)

In [None]:

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import ContextRelevance

# One question plus the contexts retrieved for it.
sample = SingleTurnSample(
    user_input="What is the implication of histone lysine methylation in medulloblastoma?",
    retrieved_contexts=contexts
)

# Score the retrieved contexts with the wrapped judge LLM.
scorer = ContextRelevance(llm=evaluator_llm)
# Top-level await relies on the notebook's running event loop.
score = await scorer.single_turn_ascore(sample)
print(score)

In [None]:
from ragas import EvaluationDataset

# Single hand-written record: question, retrieved contexts, generated response
# and the expected (reference) response.
evaluation_record = {
    "id": 1,
    "user_input": "What is the implication of histone lysine methylation in medulloblastoma?",
    "retrieved_contexts": contexts,
    "response": "Histone lysine methylation, particularly at H3K9, is implicated in the pathogenesis of medulloblastoma. Copy number aberrations in genes involved in writing, reading, removing, and blocking histone lysine methylation suggest that defective control of the histone code contributes to the development of this cancer. Additionally, the study found that restoration of expression of genes controlling H3K9 methylation greatly diminishes proliferation of medulloblastoma in vitro.",
    "reference": "Aberrant patterns of H3K4, H3K9, and H3K27 histone lysine methylation were shown to result in histone code alterations, which induce changes in gene expression, and affect the proliferation rate of cells in medulloblastoma.",
}
dataset = [evaluation_record]
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [None]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import LLMContextPrecisionWithReference, ContextRecall, ResponseRelevancy, FactualCorrectness

# Wrap the LLM and embedding model so RAGAS can use them.
# NOTE(review): the LangchainLLMWrapper import and evaluator_llm duplicate an earlier cell.
evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embedding = LangchainEmbeddingsWrapper(embedding_model)

# Metrics are built without their own llm/embeddings; those are supplied to evaluate() below.
context_precision = LLMContextPrecisionWithReference()
context_recall = ContextRecall()
response_relevancy = ResponseRelevancy()
factual_correctness = FactualCorrectness()

# Evaluate the dataset on all four metrics.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[context_precision, context_recall, response_relevancy, factual_correctness],
    llm=evaluator_llm,
    embeddings=evaluator_embedding
)

result

In [None]:
# read parquet data
import os
import pandas as pd
from configs.config import ConfigPath

from data_collection.reader import BioASQDataReader

In [None]:
# Read the training parquet file with no samples_limit passed to the reader.
# NOTE: this rebinds `data`, replacing the sample loaded in the initialization cell.
asq_reader = BioASQDataReader()
data = asq_reader.read_parquet_file(file_path=os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet"))

In [None]:
# Find the first sample that lists this passage id as relevant and print it
# together with its position in `data`.
# enumerate() yields the position directly; the previous data.index(sample)
# re-scanned the list and relied on equality comparison of whole samples.
target_passage_id = 20007090
for sample_index, sample in enumerate(data):
    if target_passage_id in sample["relevant_passage_ids"]:
        print(sample)
        print(sample_index)
        break

In [None]:
# Display the reader's data via its get_data_to_dict() helper.
asq_reader.get_data_to_dict()

In [None]:
from data_collection.fetcher import PubMedArticleFetcher

# Fetcher used below to retrieve articles by PMID.
fetcher = PubMedArticleFetcher()

In [None]:
# Fetch the article for the passage id inspected above (passed here as a string PMID).
# NOTE: rebinds `results`, which earlier cells used for retrieval output.
results = fetcher.fetch_articles(pmids=['20007090'])