#### Imports

In [1]:
import os
from utils.utils import read_json_file, save_json_file
from llms.embedding_model import EmbeddingModel
from configs.config import ConfigEnv, ConfigPath
from knowledge_graph.connection import Neo4jConnection
from llms.llm import ChatModel
from data_collection.reader import BioASQDataReader
from tqdm import tqdm
from langchain_neo4j import Neo4jVector
from retrieval_techniques.similarity_search import SimilaritySearchRetriever

#### Initializations

In [2]:
# --- models ---
embedding_model = EmbeddingModel()
llm = ChatModel(
    provider="google",
    model_name="gemini-2.0-flash-lite",
).initialize_model()

# --- neo4j connection (every setting is read from ConfigEnv) ---
neo4j_connection = Neo4jConnection(
    uri=ConfigEnv.NEO4J_URI,
    user=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    database=ConfigEnv.NEO4J_DB,
)

# --- retriever: built on the models and connection created above ---
similarity_retriever = SimilaritySearchRetriever(
    llm=llm, embedding_model=embedding_model, neo4j_connection=neo4j_connection
)

# --- data: bioasq_train.parquet, read with samples_limit=3 ---
data_path = os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet")
reader = BioASQDataReader(samples_limit=3)
data = reader.read_parquet_file(file_path=data_path)
print(f"Data length: {len(data)}")

2025-04-10 21:03:20,785 [DEBUG] embedding_model - CUDA is available, using GPU
2025-04-10 21:03:40,795 [DEBUG] embedding_model - Embedding model initialized: neuml/pubmedbert-base-embeddings
2025-04-10 21:03:40,810 [DEBUG] llm - Initialized model gemini-2.0-flash-lite


Using database: bioasq1000


2025-04-10 21:03:44,952 [DEBUG] connection - Connection successful!
2025-04-10 21:03:45,016 [INFO] reader - Limiting the number of rows to 3...
2025-04-10 21:03:45,018 [INFO] reader - Data file loaded with shape: (3, 4)


Data length: 3


In [5]:
# Run the "mesh_centrality_contexts" retrieval (degree centrality, k=10) for one
# sample question.
# The retriever from the initialization cell is reused: it was built from the same
# llm, embedding_model and neo4j_connection, so constructing a second identical
# instance here only repeated the setup.
# NOTE(review): if SimilaritySearchRetriever keeps per-call state that must be
# reset between runs, restore the re-instantiation — confirm.
results = similarity_retriever.perform_retrieval(
    retrieval_type="mesh_centrality_contexts",
    query="What is the aim of iodine prophylaxis?",
    k=10,
    centrality_type="degree",
)
# Every result is read by its 'pmid' key; keep only those ids for display.
pmids_found = [r['pmid'] for r in results]
pmids_found

Centrality type: degree


['21415143',
 '10566200',
 '17160166',
 '20810577',
 '22009156',
 '17205086',
 '23786024',
 '11932302',
 '18290900',
 '12800543']

In [None]:

index_name = "context_index"  # default index name

# Attach a LangChain Neo4jVector store to the CONTEXT nodes already present in Neo4j.
# NOTE(review): per the LangChain documentation, from_existing_graph may also compute
# and store embeddings for nodes whose embedding property is still missing — confirm
# this side effect is acceptable on the target database.
existing_graph = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    url=ConfigEnv.NEO4J_URI,
    username=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    # Use the same database as neo4j_connection; without this argument the store
    # falls back to the driver's default database instead of the configured one.
    database=ConfigEnv.NEO4J_DB,
    # Use the variable defined above instead of repeating the literal.
    index_name=index_name,
    node_label="CONTEXT",
    # Required argument of from_existing_graph; leaving it out raises TypeError.
    # The property name is the one from the previously commented-out line —
    # TODO confirm it matches the CONTEXT node schema.
    text_node_properties=["text_content"],
    embedding_node_property="embedding",
)

In [None]:
# Query the vector store using the "mmr" (maximal marginal relevance) search type.
# NOTE(review): the query text stops mid-word ("mann") — presumably copied from a
# truncated passage; confirm this is intended.
existing_graph.search(
    "Utilization behavior (UB) consists of reaching out and using objects in the environment in an automatic mann",
    search_type="mmr",
)

In [None]:


def run_retriever(benchmark_data: list, retriever) -> dict:
    """Run ``retriever`` on every benchmark question and collect the outputs.

    Args:
        benchmark_data: Samples to process; each one must support
            ``.get('id')`` and ``.get('question')``.
        retriever: Any object exposing ``invoke(question)``.

    Returns:
        A dict mapping each sample's ``id`` to whatever ``retriever.invoke``
        returned for that sample's question. Samples that share an id
        overwrite one another (the last one wins).
    """
    # tqdm wraps the iterable only to show a progress bar while iterating.
    progress = tqdm(benchmark_data, desc="Executing retriever...")
    return {
        entry.get('id'): retriever.invoke(entry.get('question'))
        for entry in progress
    }

# Run every benchmark sample through the retriever and display the id -> output mapping.
# NOTE(review): in the cells shown here, vector_search_tool is only assigned in a later
# cell (under "RAGAS METRICS"), so on a fresh top-to-bottom run this line fails with
# NameError unless that cell is executed first.
results = run_retriever(benchmark_data=data, retriever=vector_search_tool)
results

In [None]:


# Evaluate the retrieval results against the benchmark data and print the metrics.
# NOTE(review): run_evaluation_on_retrieved_chunks is neither imported nor defined in
# the cells visible here — confirm where it comes from before running on a fresh kernel.
# The second return value (new_evaluator) is not used in the visible cells.
metrics, new_evaluator = run_evaluation_on_retrieved_chunks(
    retrieval_results=results,
    benchmark_data=data
)
print(metrics)

### RAGAS METRICS

In [None]:
# Build the vector-similarity search tool on the shared models and Neo4j connection.
# NOTE(review): VectorSimilaritySearchTool is not imported in the cells visible here —
# confirm its module and add the import to the imports cell.
vector_search_tool = VectorSimilaritySearchTool(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
    return_direct=False,
)

In [None]:
# Ask the search tool one sample question; the following cells read
# results['answer'] and results['context'] from this output.
results = vector_search_tool.invoke("What is the implication of histone lysine methylation in medulloblastoma?")

In [None]:
# Show the 'answer' entry of `results` (expected to be the output of
# vector_search_tool.invoke from the previous cell).
print(results['answer'])

In [None]:
# Collect the 'content' field of every entry under results['context'], then display it.
contexts = [entry['content'] for entry in results['context']]
contexts

In [None]:
from ragas.llms import LangchainLLMWrapper

# Chat model wrapped in the ragas LangChain adapter for use by the ragas metrics below.
# NOTE(review): this rebinds `llm` using the same provider and model name as the
# initialization cell.
llm = ChatModel(
    provider="google",
    model_name="gemini-2.0-flash-lite",
).initialize_model()
evaluator_llm = LangchainLLMWrapper(llm)

In [None]:

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import ContextRelevance

# Single-turn sample pairing the question with the contexts retrieved for it.
sample = SingleTurnSample(
    user_input="What is the implication of histone lysine methylation in medulloblastoma?",
    retrieved_contexts=contexts
)

# ContextRelevance uses the wrapped evaluator LLM to score the sample.
scorer = ContextRelevance(llm=evaluator_llm)
# Top-level await: this relies on the notebook's running event loop.
score = await scorer.single_turn_ascore(sample)
print(score)

In [None]:
from ragas import EvaluationDataset


# One hand-written evaluation record: the question, the retrieved contexts, the
# generated response and the expected (reference) answer.
dataset = [
    {
        "id": 1,
        "user_input": "What is the implication of histone lysine methylation in medulloblastoma?",
        "retrieved_contexts": contexts,
        "response": "Histone lysine methylation, particularly at H3K9, is implicated in the pathogenesis of medulloblastoma. Copy number aberrations in genes involved in writing, reading, removing, and blocking histone lysine methylation suggest that defective control of the histone code contributes to the development of this cancer. Additionally, the study found that restoration of expression of genes controlling H3K9 methylation greatly diminishes proliferation of medulloblastoma in vitro.",
        "reference": "Aberrant patterns of H3K4, H3K9, and H3K27 histone lysine methylation were shown to result in histone code alterations, which induce changes in gene expression, and affect the proliferation rate of cells in medulloblastoma.",  # expected response
    },
]
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [None]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import LLMContextPrecisionWithReference, ContextRecall, ResponseRelevancy, FactualCorrectness

# Wrap the LangChain LLM and embedding model in the ragas adapters.
evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embedding = LangchainEmbeddingsWrapper(embedding_model)

# One instance of each metric, constructed without arguments.
context_precision = LLMContextPrecisionWithReference()
context_recall = ContextRecall()
response_relevancy = ResponseRelevancy()
factual_correctness = FactualCorrectness()
ragas_metrics = [
    context_precision,
    context_recall,
    response_relevancy,
    factual_correctness,
]

# Evaluate the dataset with all four metrics, using the wrapped LLM and embeddings.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=ragas_metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embedding,
)

result

In [None]:
# read parquet data
import os
import pandas as pd
from configs.config import ConfigPath

from data_collection.reader import BioASQDataReader

In [None]:
# Read bioasq_train.parquet with a reader created without samples_limit.
# NOTE(review): this rebinds `data`, which the initialization cell set to the
# 3-sample subset.
asq_reader = BioASQDataReader()
bioasq_train_path = os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet")
data = asq_reader.read_parquet_file(file_path=bioasq_train_path)

In [None]:
# PMID to look for inside each sample's relevant_passage_ids.
# NOTE(review): other cells handle PMIDs as strings (e.g. '20007090'); confirm that
# relevant_passage_ids really holds integers, otherwise this never matches.
TARGET_PMID = 20007090

# Print the first sample that lists TARGET_PMID, followed by its position in `data`.
# enumerate() supplies the position directly, so there is no second scan with
# data.index(sample) and no reliance on equality comparison between samples.
for sample_index, sample in enumerate(data):
    if TARGET_PMID in sample["relevant_passage_ids"]:
        print(sample)
        print(sample_index)
        break

In [None]:
# Display the return value of the reader's get_data_to_dict() as the cell output.
# NOTE(review): asq_reader was created without samples_limit, so this may render a
# very large structure — confirm that is wanted.
asq_reader.get_data_to_dict()

In [None]:
from data_collection.fetcher import PubMedArticleFetcher

# Fetcher instance; the next cell calls its fetch_articles(pmids=...) method.
fetcher = PubMedArticleFetcher()

In [None]:
# Fetch the article for PMID 20007090 (passed as a string here).
# NOTE(review): this rebinds `results`, which earlier cells use for retrieval output.
results = fetcher.fetch_articles(pmids=['20007090'])