# Get the Top 5 Chunks to Check Paper's Stance

## Step 0: Environment Setup

In [7]:
from dotenv import load_dotenv # type: ignore
import os
from langchain_neo4j import Neo4jGraph # type: ignore
from libs import create_vector_index
import pandas as pd # type: ignore
from conn import connect2Googlesheet
from typing import List, Tuple, Dict, Optional
# Force reload of the .env file: override=True lets values from .env replace
# variables that are already set in the process (for example by an earlier
# run of this cell), so edits to .env take effect without a kernel restart.
load_dotenv(override=True)

True

In [2]:
# Open the Neo4j connection using credentials taken from the environment
try:
    graph = Neo4jGraph(
        **{
            "url": os.environ.get("NEO4J_URL"),
            "username": os.environ.get("NEO4J_USERNAME"),
            "password": os.environ.get("NEO4J_PASSWORD"),
        }
    )
    print("Connected to Neo4j database successfully.")
except ValueError as e:
    # Only ValueError is reported here; any other exception propagates.
    print(f"Could not connect to Neo4j database: {e}")

Connected to Neo4j database successfully.


## Step 1: Load Questions from Google Sheet

In [6]:
# Open the Google Sheets document through the project helper.
# NOTE(review): presumably returns a gspread Spreadsheet - confirm in conn.py.
spreadsheet = connect2Googlesheet()

# Select the worksheet: relevance
# (selected by index 2, not by name - TODO confirm the "relevance" tab sits at this index)
worksheet = spreadsheet.get_worksheet(2)  

# Get all records as a list of dictionaries
data = worksheet.get_all_records()

# Convert to Pandas DataFrame
df_MedQ = pd.DataFrame(data)
# Last expression of the cell: its value (the first rows) becomes the cell output
df_MedQ.head()

Unnamed: 0,condition,number,docs,Question,Mahmud's Note,status,comments,Unnamed: 8
0,ARDS,1,ACURASYS,Does early administration of neuromuscular blo...,Like,,,
1,ARDS,2,ACURASYS,Do patients with severe ARDS being treated wit...,Replace,fixed,,
2,ARDS,3,ROSE,"In patients with moderate to severe ARDS, does...",Maybe this question: In patients with moderate...,fixed,,
3,ARDS,4,ROSE,Do patients with moderate-to-severe ARDS have ...,Local question (not sure if this is the aim of...,fixed,Wrong concept since PEEP by itself is mandator...,Does the use of neuromuscular blockers in pati...
4,ARDS,5,FACTT,"Among patients with ALI/ARDS, does a conservat...",Local question (not sure if this is the aim of...,fixed,Check if studies defined conservative by CVP <...,


## Step 3: Get the Top Chunks per Paper

In [None]:
def get_top_chunks_per_paper(
    graph,
    questions: List[str],
    limit_per_paper: int = 5,
    similarity_threshold: float = 0.5
) -> pd.DataFrame:
    """
    For each question, find the top chunks from each paper based on similarity.

    Every question is embedded with ``embed_entity`` and compared, via
    ``gds.similarity.cosine``, against the ``embedding`` property of each
    ``Chunk`` node linked to a ``Paper`` node by ``FROM_PAPER``. Chunks below
    the threshold are dropped; the rest are grouped by paper title and the
    best ``limit_per_paper`` chunks of each group are kept.

    NOTE(review): ``embed_entity`` is neither defined nor imported in the
    visible part of this file; it must be available in the enclosing
    namespace when this function is called.

    Args:
        graph: Neo4j graph connection; must expose ``query(query, params=...)``
            returning an iterable of records indexable by column name
        questions: List of questions to process
        limit_per_paper: Number of top chunks to retrieve per paper (default=5)
        similarity_threshold: Minimum similarity score to consider (default=0.5)

    Returns:
        DataFrame with columns: question_number, question, paper_name, chunks,
        similarities, avg_similarity. Rows are sorted by question_number
        ascending, then avg_similarity descending. When nothing matches (or
        ``questions`` is empty) an empty DataFrame with these columns is
        returned.
    """
    columns = [
        "question_number",
        "question",
        "paper_name",
        "chunks",
        "similarities",
        "avg_similarity",
    ]

    # The query text does not depend on the question, so build it once.
    # Cypher has no ORDER BY inside a list comprehension: rows are sorted
    # before COLLECT (which keeps the incoming row order) and the collected
    # list is then sliced down to the per-paper limit.
    query = """
    MATCH (p:Paper)<-[:FROM_PAPER]-(c:Chunk)
    WITH p, c, gds.similarity.cosine(c.embedding, $query_embedding) AS similarity
    WHERE similarity >= $threshold
    WITH p, c, similarity
    ORDER BY similarity DESC
    WITH p.title AS paper_name,
         COLLECT({
             chunk_text: c.text,
             page: c.page_number,
             position: c.position,
             similarity: similarity
         })[..$limit] AS top_chunks
    RETURN paper_name, top_chunks
    """

    results = []

    # Question numbers are 1-based
    for idx, question in enumerate(questions, 1):
        # Get question embedding
        query_embedding = embed_entity(question)

        params = {
            "query_embedding": query_embedding,
            "threshold": similarity_threshold,
            "limit": limit_per_paper
        }

        # One record per paper that has at least one chunk above the threshold
        for record in graph.query(query, params=params):
            chunks = record["top_chunks"]

            # Extract chunk texts and similarities (kept as parallel lists)
            chunk_texts = [chunk["chunk_text"] for chunk in chunks]
            similarities = [chunk["similarity"] for chunk in chunks]

            results.append({
                "question_number": idx,
                "question": question,
                "paper_name": record["paper_name"],
                "chunks": chunk_texts,
                "similarities": similarities,
                "avg_similarity": sum(similarities) / len(similarities) if similarities else 0
            })

    # Explicit columns keep the schema stable even when there are no rows;
    # without them an empty frame has no columns and sorting raises KeyError.
    df_results = pd.DataFrame(results, columns=columns)
    if df_results.empty:
        return df_results

    # Sort by question number and average similarity
    return df_results.sort_values(
        ["question_number", "avg_similarity"],
        ascending=[True, False]
    )