# Get the Top 5 Chunks to Check Paper's Stance

## Step 0: Environment Setup

In [2]:
from dotenv import load_dotenv # type: ignore
import os
from langchain_neo4j import Neo4jGraph # type: ignore
from libs import embed_entity, create_vector_index
import pandas as pd # type: ignore
from conn import connect2Googlesheet
from typing import List, Tuple, Dict, Optional
#import matplotlib.pyplot as plt # type: ignore
# Load variables from the .env file into the process environment so the
# os.getenv("NEO4J_*") lookups in the connection cell can find them.
# NOTE(review): the original comment said "force reload", but load_dotenv() is
# called with its default arguments; as far as I know the default does not
# overwrite variables already present in the environment (e.g. from an earlier
# run in the same kernel). If a true forced reload is intended, confirm and
# pass override=True.
load_dotenv()

True

In [3]:
# Connect to Neo4j database using credentials read from the environment
# (NEO4J_URL / NEO4J_USERNAME / NEO4J_PASSWORD).
try:
    graph = Neo4jGraph(
        url=os.getenv("NEO4J_URL"),
        username=os.getenv("NEO4J_USERNAME"),
        password=os.getenv("NEO4J_PASSWORD")
    )
    print("Connected to Neo4j database successfully.")
except ValueError as e:
    print(f"Could not connect to Neo4j database: {e}")
    # Stop here: every statement below needs `graph`. Swallowing the error
    # would either raise a confusing NameError on the next line or, worse,
    # silently reuse a stale `graph` left in the kernel by an earlier run.
    raise

# Check if the vector index exists; create it only when it is missing.
index_name = "entities"
query = "SHOW INDEXES YIELD name, type WHERE type = 'VECTOR' AND name = $index_name"

result = graph.query(query, params={"index_name": index_name})
if result:
    print(f"The '{index_name}' index already exists.")
else:
    # Use the same name that was just looked up so the check and the
    # creation can never drift apart.
    create_vector_index(graph, index_name)

Connected to Neo4j database successfully.
The 'entities' index already exists.


## Step 1: Load Questions from Google Sheet

In [32]:
# Open the project spreadsheet via the local `conn` helper.
spreadsheet = connect2Googlesheet()

# Select the worksheet: relevance
# NOTE(review): the tab is picked by position (index 2), not by title, so
# reordering tabs in the sheet would silently load a different one — confirm
# the index still points at the "relevance" tab.
worksheet = spreadsheet.get_worksheet(2)  

# Get all records as a list of dictionaries
data = worksheet.get_all_records()

# Convert to Pandas DataFrame; later cells read its 'docs' and 'Question'
# columns.
df_Paper = pd.DataFrame(data)
df_Paper.head()

Unnamed: 0,condition,number,docs,Question,Mahmud's Note,status,comments,Unnamed: 8
0,ARDS,1,ACURASYS,Does early administration of neuromuscular blocking agents increases the ventilator free days?,Like,,,
1,ARDS,2,ACURASYS,Do patients with severe ARDS being treated with neuromuscular blocking agents have increased muscle weakness?,Replace,fixed,,
2,ARDS,3,ROSE,"In patients with moderate to severe ARDS, does early use of continuous neuromuscular blockade improve mortality?","Maybe this question: In patients with moderate to severe ARDS, does early use of continuous neuromuscular blockade improve mortality?",fixed,,
3,ARDS,4,ROSE,Do patients with moderate-to-severe ARDS have a significance difference in mortality rate beween patients who recieved an early and continous cisatracurium infusion than those with usual care approach with lighter sedation targets?,Local question (not sure if this is the aim of your project) It will be nice as second step after proving the general summarization is working but focusing in general summarization would be priority in my opinion so you can have meanigful tool.,fixed,Wrong concept since PEEP by itself is mandatory component in ventilator.,"Does the use of neuromuscular blockers in patients with moderate-to-severe ARDS impact cardiovascular stability, particularly in terms of vasopressor requirements and hemodynamic effects, compared to sedation strategy without routine neuromuscular blockade?"
4,ARDS,5,FACTT,"Among patients with ALI/ARDS, does a conservative fluid management strategy improves lung function and decrease ventilator days compared to liberal strategy?","Local question (not sure if this is the aim of your project) consider (WikiJournal): In patients with ALI/ARDS that are intubated and receiving positive pressure ventilation, how does the conservative compare to the liberal fluid management strategy in reducing mortality?",fixed,Check if studies defined conservative by CVP < 4 or elese just dont mention how much the CVP (i prefer the last approach),


## Step 3: Get the chunks from each paper

In [54]:
def get_all_chunks_per_paper(
    graph,
    paperNames: List[str],
    output_dir: str = "./chunks_of_paper",
) -> Dict[str, pd.DataFrame]:
    """
    Fetch all chunks of each paper from Neo4j and save them as one CSV per paper.

    Args:
        graph: Neo4j graph connection; only ``graph.query(cypher, params=...)``
            is used, and each returned record is read like a mapping.
        paperNames: List of paper names without .pdf extension
        output_dir: Directory that receives ``chunks_of_<paper>.csv``. It is
            created on demand, the first time a paper actually has chunks.

    Returns:
        Dictionary mapping every requested paper name to a DataFrame with the
        columns paper_num, paper_name, position, chunk_text, chunk_embedding.
        A paper without chunks maps to an empty DataFrame that still carries
        these columns; no CSV is written for it.
    """
    columns = ["paper_num", "paper_name", "position", "chunk_text", "chunk_embedding"]

    # ORDER BY is attached to RETURN: that placement is valid in every Cypher
    # version, whereas a standalone ORDER BY clause before RETURN is only
    # accepted by recent servers.
    query = """
        MATCH (c:Chunk)
        WHERE c.fileName = $paperName + ".pdf"
        RETURN c.fileName AS paper_name,
               c.position AS position,
               c.text AS chunk_text,
               c.embedding AS chunk_embedding
        ORDER BY c.position
    """

    paper_dataframes = {}  # paper name -> DataFrame of its chunks

    # paper_num is the 1-based position of the paper in `paperNames`.
    for paper_num, paper in enumerate(paperNames, start=1):
        records = graph.query(query, params={"paperName": paper})
        print(f"Found {len(records)} chunks in paper {paper}")

        # Passing `columns` keeps the schema stable even when `records` is
        # empty, so callers can rely on the column names unconditionally.
        df_paper = pd.DataFrame(
            [
                {
                    "paper_num": paper_num,
                    "paper_name": record["paper_name"],
                    "position": record["position"],
                    "chunk_text": record["chunk_text"],
                    "chunk_embedding": record["chunk_embedding"],
                }
                for record in records
            ],
            columns=columns,
        )

        # Save to CSV only if we have results
        if not df_paper.empty:
            os.makedirs(output_dir, exist_ok=True)
            csv_path = os.path.join(output_dir, f"chunks_of_{paper}.csv")
            df_paper.to_csv(csv_path, index=False)
            print(f"Saved {csv_path}")

        paper_dataframes[paper] = df_paper

    return paper_dataframes


In [55]:
# Usage example: de-duplicate the whitespace-trimmed names in the 'docs'
# column, sort them, and fetch chunks for the first four papers only.
papers = sorted({doc.strip() for doc in df_Paper['docs'].tolist()})
paper_dfs = get_all_chunks_per_paper(graph, papers[:4])

# Access individual paper DataFrames
# for paper, df in paper_dfs.items():
#     print(f"\nPreview of {paper}:")
#     display(df.head())


Found 100 chunks in paper ACURASYS
Saved ./chunks_of_paper/chunks_of_ACURASYS.csv
Found 100 chunks in paper ADRENAL
Saved ./chunks_of_paper/chunks_of_ADRENAL.csv
Found 100 chunks in paper AID-ICU
Saved ./chunks_of_paper/chunks_of_AID-ICU.csv
Found 100 chunks in paper ALBIOS
Saved ./chunks_of_paper/chunks_of_ALBIOS.csv


## Step 4: Compare Question Embedding and Paper Chunk Embeddings

In [52]:
import numpy as np
import os
import ast

In [56]:
def compare_embeddings(
    question: str,
    paper: str,
    top_k: int = 5,
    chunks_dir: str = "./chunks_of_paper",
) -> pd.DataFrame:
    """
    Rank a paper's chunks by cosine similarity to a question.

    Chunk embeddings are read from ``<chunks_dir>/chunks_of_<paper>.csv``;
    the question is embedded with ``embed_entity``.

    Args:
        question: Question text to compare against
        paper: Paper name without .pdf extension
        top_k: Number of top matches to return
        chunks_dir: Directory holding the per-paper chunk CSV files

    Returns:
        DataFrame with the columns paper_name, position, chunk_text and
        similarity_score (rounded to 4 decimals), holding the ``top_k``
        highest-scoring chunks in descending order. The frame is empty, but
        still has these columns, when the CSV is missing or has no rows.
        A chunk (or question) whose embedding has zero norm scores 0.0,
        since cosine similarity is undefined there.
    """
    result_columns = ['paper_name', 'position', 'chunk_text', 'similarity_score']

    # Load chunk embeddings from CSV
    csv_path = os.path.join(chunks_dir, f"chunks_of_{paper}.csv")
    if not os.path.exists(csv_path):
        print(f"No chunk data found for paper: {paper}")
        return pd.DataFrame(columns=result_columns)

    chunks_df = pd.read_csv(csv_path)
    if chunks_df.empty:
        # Nothing to score; skip the embedding call entirely.
        return pd.DataFrame(columns=result_columns)

    question_vec = np.asarray(embed_entity(question), dtype=float)

    # The CSV stores each embedding as the text form of a Python list, so it
    # has to be parsed back; literal_eval only accepts literals, never code.
    chunk_matrix = np.array(
        [ast.literal_eval(cell) for cell in chunks_df['chunk_embedding']],
        dtype=float,
    )

    # Cosine similarity for all chunks at once: one matrix-vector product
    # divided by the product of the norms.
    dots = chunk_matrix @ question_vec
    norms = np.linalg.norm(chunk_matrix, axis=1) * np.linalg.norm(question_vec)
    # Guard the division: rows with a zero norm keep the 0.0 from `out`
    # instead of becoming NaN and triggering a runtime warning.
    similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    scored = chunks_df.assign(similarity_score=np.round(similarities, 4))

    # Sort and get top k results
    return scored.nlargest(top_k, 'similarity_score')[result_columns]

# Example usage
# import numpy as np
# import os
# import ast

# Try the ranking on the first question in the sheet against one paper.
test_question = df_Paper['Question'].iat[0]
paper_name = "ACURASYS"  # without .pdf extension
results = compare_embeddings(test_question, paper_name, top_k=5)
display(results)

Unnamed: 0,paper_name,position,chunk_text,similarity_score
63,ACURASYS.pdf,64,"Neuromuscular Blocking Agents in ARDS n engl j med 363;12 nejm.org september 16, 2010 1111 without the aid of a ventilator, for a period of at least 48 hours for weaning from the ventilator to be considered successful. The number of ventilator- free days was considered to be zero for patients who were weaned from mechanical ventilation but who died before day 28 or",0.7262
34,ACURASYS.pdf,35,"Neuromuscular Blocking Agents in ARDS n engl j med 363;12 nejm.org september 16, 2010 1109 sure were adjusted as in the Prospective, Random- ized, Multi-Center Trial of 12 ml/kg Tidal Volume Positive Pressure Ventilation for Treatment of Acute Lung Injury and Acute Respiratory Distress Syndrome (ARMA).14 An open-label, rapid, intravenous injection",0.6263
10,ACURASYS.pdf,11,"mechanical ventilation for the acute respiratory distress syn- drome (ARDS), neuromuscular blocking agents may improve oxygenation and de- crease ventilator-induced lung injury but may also cause muscle weakness. We evalu- ated clinical outcomes after 2 days of therapy with neuromuscular blocking agents in patients with early, severe ARDS. Methods In this multicenter, double-blind trial, 340 patients presenting to the intensive care unit (ICU) with an onset",0.6136
22,ACURASYS.pdf,23,"did not receive a blocking agent; P = 0.06). However, this study was not designed or powered to evaluate mortality. Thus, the benefits and risks of adjunctive therapy with neuromuscu- lar blocking agents in patients with ARDS who were receiving lung-protective mechanical venti- lation14 require further evaluation. We conducted a multicenter, randomized, pla- cebo-controlled, double-blind trial to determine",0.612
16,ACURASYS.pdf,17,"�= 0.05). The rate of ICU-acquired paresis did not differ significantly between the two groups. Conclusions In patients with severe ARDS, early administration of a neuromuscular blocking agent improved the adjusted 90-day survival and increased the time off the ventilator without increasing muscle weakness. (Funded by Assistance Publique–Hôpitaux de Marseille and the Programme Hospitalier de Recherche Clinique Rég",0.5906
