#### Imports

In [None]:
import os
from utils.utils import read_json_file, save_json_file
from llms.embedding_model import EmbeddingModel
from configs.config import ConfigEnv, ConfigPath
from knowledge_graph.connection import Neo4jConnection
from llms.llm import ChatModel
from data_collection.reader import BioASQDataReader
from tqdm import tqdm
from langchain_neo4j import Neo4jVector
from retrieval_techniques.similarity_search import SimilaritySearchRetriever

from utils.utils import num_tokens_from_string


ImportError: cannot import name 'logger' from 'configs.config' (c:\Users\melis\Desktop\Projects\python_projects\thesis\graph_rag_techniques\src\configs\config.py)

In [None]:
# Load a tiny slice of the BioASQ training set for quick experimentation.
# The reader is capped at 3 rows, which is what the printed length reflects.
reader = BioASQDataReader(samples_limit=3)
data_path = os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet")
data = reader.read_parquet_file(file_path=data_path)
print(f"Data length: {len(data)}")

2025-04-20 17:42:11,571 [INFO] reader - Limiting the number of rows to 3...
2025-04-20 17:42:11,572 [INFO] reader - Data file loaded with shape: (3, 4)


Data length: 3


In [None]:
# Inspect the first loaded sample to see the record layout
# (the recorded output shows the keys: question, answer, id, relevant_passage_ids).
data[0]

{'question': 'What is the implication of histone lysine methylation in medulloblastoma?',
 'answer': 'Aberrant patterns of H3K4, H3K9, and H3K27 histone lysine methylation were shown to result in histone code alterations, which induce changes in gene expression, and affect the proliferation rate of cells in medulloblastoma.',
 'id': 1682,
 'relevant_passage_ids': ['23179372', '19270706', '23184418']}

#### Initializations

In [None]:
# --- Models -----------------------------------------------------------------
# Embedding model shared by the retriever and the graph data loader below.
embedding_model = EmbeddingModel()

# Chat model used by the retrievers (and later wrapped as the RAGAS evaluator).
llm = ChatModel(
    provider="google",
    model_name="gemini-2.0-flash-lite",
).initialize_model()

# --- Neo4j connection ---------------------------------------------------------
# All connection settings come from the environment-backed config, not literals.
neo4j_connection = Neo4jConnection(
    uri=ConfigEnv.NEO4J_URI,
    user=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    database=ConfigEnv.NEO4J_DB,
)

# --- Retriever ----------------------------------------------------------------
similarity_retriever = SimilaritySearchRetriever(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
)

# --- Data ---------------------------------------------------------------------
# NOTE(review): this repeats the 3-row load done in the earlier data cell; one of
# the two could be dropped once the notebook is cleaned up.
reader = BioASQDataReader(samples_limit=3)
data_path = os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet")
data = reader.read_parquet_file(file_path=data_path)
print(f"Data length: {len(data)}")

2025-04-20 17:49:42,126 [DEBUG] embedding_model - CUDA is available, using GPU


2025-04-20 17:49:53,155 [DEBUG] embedding_model - Embedding model initialized: neuml/pubmedbert-base-embeddings
2025-04-20 17:49:53,170 [DEBUG] llm - Initialized model gemini-2.0-flash-lite


Using database: bioasq1000


2025-04-20 17:49:57,252 [DEBUG] connection - Connection successful!
2025-04-20 17:49:57,287 [INFO] reader - Limiting the number of rows to 3...
2025-04-20 17:49:57,287 [INFO] reader - Data file loaded with shape: (3, 4)


Data length: 3


In [None]:
# Build the loader used below to extract a per-question subgraph from Neo4j; it
# reuses the notebook's Neo4j driver and embedding model.
# NOTE(review): `Neo4jDataLoader` is not imported anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. Add the import to the imports
# cell (module path is not visible here — confirm).
data_loader = Neo4jDataLoader(neo4j_driver=neo4j_connection.get_driver(), embedding_model=embedding_model)

In [None]:
# Extract the subgraph for one question and its known relevant passages.
# The question and PMIDs are the values shown for the first loaded sample.
subgraph_request = {
    "question": "What is the implication of histone lysine methylation in medulloblastoma?",
    "relevant_passage_ids": ['23179372', '19270706', '23184418'],
}
result = data_loader.extract_subgraph(**subgraph_request)
result



<neo4j._sync.work.result.Result object at 0x000001BA117115D0>




HeteroData(
  question_embedding=[768],
  context_id_map={
    39276=0,
    39277=1,
    39278=2,
    39793=3,
    46910=4,
    43668=5,
    39787=6,
    47913=7,
    39789=8,
    47916=9,
    47920=10,
  },
  original_context_ids={
    0=39276,
    1=39277,
    2=39278,
    3=39793,
    4=46910,
    5=43668,
    6=39787,
    7=47913,
    8=39789,
    9=47916,
    10=47920,
  },
  context={
    x=[11, 768],
    y=[11],
  },
  mesh={ x=[9, 768] },
  (context, similar_to, context)={
    edge_index=[2, 3],
    edge_attr=[3, 1],
  },
  (context, has_mesh, mesh)={ edge_index=[2, 10] },
  (mesh, rev_has_mesh, context)={ edge_index=[2, 10] }
)

In [None]:
# Check the shape of the `x` tensor stored under the 'mesh' entry of the
# extracted subgraph (recorded output: torch.Size([9, 768])).
result['mesh'].x.shape

torch.Size([9, 768])

In [None]:
# Re-create the similarity retriever with the same arguments as the
# initialization cell, so this section can be re-run on its own.
similarity_retriever = SimilaritySearchRetriever(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
)

# Disabled experiment (kept for reference): centrality-based MeSH context retrieval.
# results = similarity_retriever.perform_retrieval(
#     retrieval_type="mesh_centrality_contexts",
#     query="What is the implication of histone lysine methylation in medulloblastoma?",
#     k=3,
#     centrality_type="degree",
# )
# pmids_found = [r['pmid'] for r in results]
# pmids_found

In [None]:
# Run the MeSH-enhanced search for a different test question; the recorded
# output is a list of dicts carrying 'pmid' and 'content'.
similarity_retriever.perform_enhanced_mesh_search(
    query="Does oncogene-induced DNA replication stress inhibit genomic instability?",
    k=5,
    n_meshes=10,
)

[{'pmid': '23466526',
  'content': "Mowat-Wilson syndrome (MWS) is a severe intellectual disability (ID)-distinctive facial gestalt-multiple congenital anomaly syndrome, commonly associating microcephaly, epilepsy, corpus callosum agenesis, conotruncal heart defects, urogenital malformations and Hirschsprung disease (HSCR). MWS is caused by de novo heterozygous mutations in the ZEB2 gene. The majority of mutations lead to haplo-insufficiency through premature stop codons or large gene deletions. Only three missense mutations have been reported so far; none of which resides in a known functional domain of ZEB2. In this study, we report and analyze the functional consequences of three novel missense mutations, p.Tyr1055Cys, p.Ser1071Pro and p.His1045Arg, identified in the highly conserved C-zinc-finger (C-ZF) domain of ZEB2. Patients' phenotype included the facial gestalt of MWS and moderate ID, but no microcephaly, heart defects or HSCR. In vitro studies showed that all the three mutati

In [None]:
# Fetch the 5 MeSH entries most relevant to the question; each hit in the
# recorded output has 'term', 'definition' and 'score'.
# NOTE(review): the recorded output contains the same term more than once with
# slightly different scores — possibly duplicate MeSH nodes in the graph; confirm.
mesh_results = similarity_retriever.get_relevant_meshes(
    query="What is the implication of histone lysine methylation in medulloblastoma?",
    k=5,
)
mesh_results

[{'term': 'Histone Methyltransferases',
  'definition': 'Histone Methyltransferases\nEnzymes that catalyze the transfer of methyl groups to LYSINE or ARGININE\nresidues of HISTONES, especially histone H3 and histone H4 proteins. They play a \ncritical role in EPIGENETIC PROCESSES.',
  'score': 0.8065643310546875},
 {'term': 'Histone Methyltransferases',
  'definition': 'Histone Methyltransferases\nEnzymes that catalyze the transfer of methyl groups to LYSINE or ARGININE\nresidues of HISTONES, especially histone H3 and histone H4 proteins. They play a \ncritical role in EPIGENETIC PROCESSES.',
  'score': 0.8064651489257812},
 {'term': 'Histone-Lysine N-Methyltransferase',
  'definition': 'Histone-Lysine N-Methyltransferase\nAn enzyme that catalyzes the methylation of the epsilon-amino group of lysine\nresidues in proteins to yield epsilon mono-, di-, and trimethyllysine.',
  'score': 0.8031783103942871},
 {'term': 'Histone-Lysine N-Methyltransferase',
  'definition': 'Histone-Lysine N-M

In [None]:
# Keep only the term names from the MeSH hits, in the order they were returned
# (duplicates are preserved).
mesh_terms = []
for mesh_hit in mesh_results:
    mesh_terms.append(mesh_hit['term'])
mesh_terms

['Histone Methyltransferases',
 'Histone Methyltransferases',
 'Histone-Lysine N-Methyltransferase',
 'Histone-Lysine N-Methyltransferase',
 'DNA Methyltransferase 3B']

### RAGAS METRICS

In [None]:
# Tool that answers a question via vector similarity search, built on the
# notebook's shared LLM, embedding model and Neo4j connection.
# NOTE(review): `VectorSimilaritySearchTool` is not imported anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel. Add the
# import to the imports cell (module path is not visible here — confirm).
vector_search_tool = VectorSimilaritySearchTool(
    llm=llm,
    embedding_model=embedding_model,
    neo4j_connection=neo4j_connection,
    return_direct=False,
)

In [None]:
# Run the vector-search tool on the test question; the following cells read
# the 'answer' and 'context' entries of the returned mapping.
results = vector_search_tool.invoke("What is the implication of histone lysine methylation in medulloblastoma?")

In [None]:
# Show the answer text produced by the vector-search tool.
print(results['answer'])

In [None]:
# Pull the raw passage text out of every retrieved context entry; this list is
# what gets passed to the RAGAS samples below.
contexts = [retrieved['content'] for retrieved in results['context']]
contexts

In [None]:
from ragas.llms import LangchainLLMWrapper

# Re-initialise the chat model (same provider and model name as in the
# initialization cell) and wrap it so the RAGAS metrics can use it as evaluator.
llm = ChatModel(
    provider="google",
    model_name="gemini-2.0-flash-lite",
).initialize_model()
evaluator_llm = LangchainLLMWrapper(llm)

In [None]:

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import ContextRelevance

sample = SingleTurnSample(
    user_input="What is the implication of histone lysine methylation in medulloblastoma?",
    retrieved_contexts=contexts
)

scorer = ContextRelevance(llm=evaluator_llm)
score = await scorer.single_turn_ascore(sample)
print(score)

In [None]:
from ragas import EvaluationDataset

# One hand-built evaluation record: the question, the contexts retrieved above,
# a hard-coded response string, and the expected answer for that question.
dataset = [
    {
        "id": 1,
        "user_input": "What is the implication of histone lysine methylation in medulloblastoma?",
        "retrieved_contexts": contexts,
        "response": "Histone lysine methylation, particularly at H3K9, is implicated in the pathogenesis of medulloblastoma. Copy number aberrations in genes involved in writing, reading, removing, and blocking histone lysine methylation suggest that defective control of the histone code contributes to the development of this cancer. Additionally, the study found that restoration of expression of genes controlling H3K9 methylation greatly diminishes proliferation of medulloblastoma in vitro.",
        "reference": "Aberrant patterns of H3K4, H3K9, and H3K27 histone lysine methylation were shown to result in histone code alterations, which induce changes in gene expression, and affect the proliferation rate of cells in medulloblastoma.",  # expected response
    },
]
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [None]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import (
    LLMContextPrecisionWithReference,
    ContextRecall,
    ResponseRelevancy,
    FactualCorrectness,
)

# Wrap the notebook's LLM and embedding model for use by RAGAS.
# NOTE(review): `embedding_model` is the project's EmbeddingModel instance —
# confirm it exposes the interface LangchainEmbeddingsWrapper expects.
evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embedding = LangchainEmbeddingsWrapper(embedding_model)

# One instance per metric, all with default settings.
context_precision = LLMContextPrecisionWithReference()
context_recall = ContextRecall()
response_relevancy = ResponseRelevancy()
factual_correctness = FactualCorrectness()
ragas_metrics = [context_precision, context_recall, response_relevancy, factual_correctness]

# NOTE(review): `result` was previously bound to the extracted subgraph; this
# assignment replaces it with the RAGAS evaluation result.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=ragas_metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embedding,
)

result

In [None]:
# read parquet data
import os
import pandas as pd
from configs.config import ConfigPath

from data_collection.reader import BioASQDataReader

In [None]:
# Load the BioASQ training file again, this time with a reader created without
# a sample limit. Note that this rebinds `data`.
train_parquet_path = os.path.join(ConfigPath.RAW_DATA_DIR, "bioasq_train.parquet")
asq_reader = BioASQDataReader()
data = asq_reader.read_parquet_file(file_path=train_parquet_path)

In [None]:
# Find the first sample that lists the target PubMed ID among its relevant
# passages, then print the sample and its position in `data`.
#
# The passage IDs are stored as strings (e.g. '23179372' in the sample shown
# earlier), so the previous integer literal could never match. Comparing on the
# string form works whether the IDs are stored as str or int.
target_pmid = "20007090"

# enumerate() gives the position directly, avoiding a second linear scan with
# data.index(sample).
for sample_index, sample in enumerate(data):
    if any(str(passage_id) == target_pmid for passage_id in sample["relevant_passage_ids"]):
        print(sample)
        print(sample_index)
        break
else:
    # Make the "no match" case visible instead of finishing silently.
    print(f"No sample references PMID {target_pmid}")

In [None]:
# Display what the reader's dict accessor returns for the loaded data.
# NOTE(review): this reader was created without a sample limit, so the rendered
# output may be very large — consider showing only a slice; confirm.
asq_reader.get_data_to_dict()

In [None]:
# Create the PubMed article fetcher used in the next cell.
# NOTE(review): this import sits mid-notebook; moving it to the imports cell
# would make dependencies visible up front.
from data_collection.fetcher import PubMedArticleFetcher

fetcher = PubMedArticleFetcher()

In [None]:
# Fetch the single article for the PMID searched for above.
# Note: this rebinds `results`, which earlier held the vector-search output.
results = fetcher.fetch_articles(pmids=['20007090'])