# Get the Top 5 Chunks to Check Paper's Stance

## Step 0: Environment Setup

In [1]:
from dotenv import load_dotenv # type: ignore
import os
from langchain_neo4j import Neo4jGraph # type: ignore
from libs import embed_entity
import pandas as pd # type: ignore
from conn import connect2Googlesheet
from typing import List, Tuple, Dict, Optional
# Force reload of the .env file
load_dotenv()

True

In [2]:
# Connect to Neo4j database
try:
    graph = Neo4jGraph(
        url=os.getenv("NEO4J_URL"),
        username=os.getenv("NEO4J_USERNAME"),
        password=os.getenv("NEO4J_PASSWORD")
    )
    print("Connected to Neo4j database successfully.")
except ValueError as e:
    print(f"Could not connect to Neo4j database: {e}")

Connected to Neo4j database successfully.


## Step 1: Load Questions from Google Sheet

In [3]:
spreadsheet = connect2Googlesheet()

# Select the worksheet: relevance
worksheet = spreadsheet.get_worksheet(2)  

# Get all records as a list of dictionaries
data = worksheet.get_all_records()

# Convert to Pandas DataFrame
df_MedQ = pd.DataFrame(data)
df_MedQ.head()

Unnamed: 0,condition,number,docs,Question,Mahmud's Note,status,comments,Unnamed: 8
0,ARDS,1,ACURASYS,Does early administration of neuromuscular blo...,Like,,,
1,ARDS,2,ACURASYS,Do patients with severe ARDS being treated wit...,Replace,fixed,,
2,ARDS,3,ROSE,"In patients with moderate to severe ARDS, does...",Maybe this question: In patients with moderate...,fixed,,
3,ARDS,4,ROSE,Do patients with moderate-to-severe ARDS have ...,Local question (not sure if this is the aim of...,fixed,Wrong concept since PEEP by itself is mandator...,Does the use of neuromuscular blockers in pati...
4,ARDS,5,FACTT,"Among patients with ALI/ARDS, does a conservat...",Local question (not sure if this is the aim of...,fixed,Check if studies defined conservative by CVP <...,


## Step 3: Get the 

In [24]:
def get_top_chunks_per_paper(
    graph,
    questions: List[str],
    limit_per_paper: int = 5,
    similarity_threshold: float = 0.5
) -> pd.DataFrame:
    """
    For each question, find the top chunks from each paper based on similarity.
    
    Args:
        graph: Neo4j graph connection
        questions: List of questions to process
        limit_per_paper: Number of top chunks to retrieve per paper (default=5)
        similarity_threshold: Minimum similarity score to consider (default=0.5)
    
    Returns:
        DataFrame with columns: question_number, question, paper_name, chunks, similarities
    """
    results = []
    
    for idx, question in enumerate(questions):
        # Get question embedding
        query_embedding = embed_entity(question)
        print(f"Processing question {idx + 1}: {question}")
        #print(f"Query embedding: {query_embedding}")
        params = {
            "query_embedding": query_embedding,
            "threshold": similarity_threshold,
            "limit": limit_per_paper
        }

        # Query to get top chunks per paper
        query = """
            MATCH (c:Chunk)
            WHERE c.fileName = "ACURASYS.pdf"
            CALL db.index.vector.queryNodes("entities", 50, $query_embedding)
            YIELD node, score
            WHERE score >= $threshold
            ORDER BY score DESC
            LIMIT 5
            RETURN
                c.fileName as paper_name, 
                c.text as chunk_text,
                c.position as position,
                c.page_number as page,
                score
            
        """
        
        result = graph.query(query, params=params)
        print(result)
        # Process results
        # for record in result:
        #     chunk = record["chunk"]  # Now chunk is a dictionary
        #     results.append({
        #         "question_number": idx,
        #         "question": question,
        #         "paper_name": "ACURASYS.pdf",
        #         "chunk_text": chunk["chunk_text"],
        #         "position": chunk["position"],
        #         "page": chunk["page_number"],
        #         "score": chunk["score"]
        #     })
    
    # Convert to DataFrame
    #df_results = pd.DataFrame(results)
    
    # Sort by question number and average similarity
    # df_results = df_results.sort_values(
    #     ["question_number", "avg_similarity"],
    #     ascending=[True, False]
    # )
    
    return #df_results.sort_values(["question_number", "score"], ascending=[True, False])

In [25]:
# Get first two questions using proper DataFrame indexing
test_questions = df_MedQ.iloc[10:12]['Question'].tolist()

# # Print selected questions to verify
# print("\nSelected questions:")
# for i, q in enumerate(test_questions, 1):
#     print(f"Q{i}: {q}")

# Call the function with correct parameters
df_results = get_top_chunks_per_paper(
    graph=graph,
    questions=test_questions
)


Processing question 1: Patients with septic shock undergoing mechanical ventilation, did continuous infusion of hydrocortisone result in lower 90-day mortality?
[{'paper_name': 'ACURASYS.pdf', 'chunk_text': 'D., Didier Perez, M.D.,   Jean-Marie Seghboyan, M.D., Jean-Michel Constantin, M.D., Ph.D., Pierre Courant, M.D., Jean-Yves Lefrant, M.D., Ph.D.,   Claude Guérin, M.D., Ph.D., Gwenaël Prat, M.D., Sophie Morange, M.D., and Antoine R', 'position': 3, 'page': 1, 'score': 0.8569455146789551}, {'paper_name': 'ACURASYS.pdf', 'chunk_text': 'l Prat, M.D., Sophie Morange, M.D., and Antoine Roch, M.D., Ph.D.,   for the ACURASYS Study Investigators* Abstr act From Assistance Publique–Hôpitaux de  Marseille Unité de Recherche sur les Mal- adies Infectieuses et Tropicales Émer- gentes (URMITE), Centre National de la  Recherche Scientifique–Unité Mix', 'position': 4, 'page': 1, 'score': 0.8569455146789551}, {'paper_name': 'ACURASYS.pdf', 'chunk_text': 'n engl j med 363;12\u2003 nejm.org\u2003 sep

In [None]:
df_results = get_top_chunks_per_paper(graph, df_MedQ.iloc[:4]["Question"].tolist())
df_results.head()

Processing question 1: Does early administration of neuromuscular blocking agents increases the ventilator free days?
Found 5 results.
Paper: ACURASYS.pdf
Chunks: D., Didier Perez, M.D.,   Jean-Marie Seghboyan, M.D., Jean-Michel Constantin, M.D., Ph.D., Pierre Courant, M.D., Jean-Yves Lefrant, M.D., Ph.D.,   Claude Guérin, M.D., Ph.D., Gwenaël Prat, M.D., Sophie Morange, M.D., and Antoine R


TypeError: string indices must be integers, not 'str'