# Get the Top 5 Chunks to Check Each Paper's Stance

## Step 0: Environment Setup

In [None]:
from dotenv import load_dotenv # type: ignore
import os
from langchain_neo4j import Neo4jGraph # type: ignore
from libs import create_vector_index
import pandas as pd # type: ignore
from conn import connect2Googlesheet
from annolibs import get_all_chunks_per_paper, compare_embeddings
load_dotenv()

In [None]:
# Connect to the Neo4j database. Credentials come from the environment
# variables NEO4J_URL, NEO4J_USERNAME and NEO4J_PASSWORD (load_dotenv() above
# populates them from a .env file when one is present).
try:
    graph = Neo4jGraph(
        url=os.getenv("NEO4J_URL"),
        username=os.getenv("NEO4J_USERNAME"),
        password=os.getenv("NEO4J_PASSWORD"),
    )
except ValueError as e:
    print(f"Could not connect to Neo4j database: {e}")
    # Stop here: every statement below needs `graph`. Swallowing the error
    # would leave `graph` unbound and surface later as an unrelated NameError.
    raise
else:
    print("Connected to Neo4j database successfully.")

# Make sure the vector index exists; create it only when it is absent.
index_name = "entities"
# The name is passed as a query parameter ($index_name), not interpolated.
query = "SHOW INDEXES YIELD name, type WHERE type = 'VECTOR' AND name = $index_name"

result = graph.query(query, params={"index_name": index_name})
if result:
    # A non-empty result means a VECTOR index with this name was found.
    print(f"The '{index_name}' index already exists.")
else:
    create_vector_index(graph, index_name)

## Step 1: Load Questions from Google Sheet

In [None]:
# Open the project spreadsheet through the local `conn` helper.
# NOTE(review): presumably returns a gspread Spreadsheet — confirm in conn.py.
spreadsheet = connect2Googlesheet()

# Select the worksheet: relevance
# NOTE(review): the sheet is picked by position (2), not by title, so
# reordering the tabs would silently load a different sheet — confirm that
# index 2 is still the "relevance" tab.
worksheet = spreadsheet.get_worksheet(2)

# Get all records as a list of dictionaries
data = worksheet.get_all_records()

# Convert to Pandas DataFrame.
# Later cells read the 'condition', 'docs' and 'Question' columns from it.
df_Paper = pd.DataFrame(data)
# Preview the first rows; as the last expression of the cell this is what the
# notebook renders as output.
df_Paper.head()

## Step 3: Get the chunks from each paper

In [None]:
# Alternative kept for reference (all papers instead of only Delirium ones):
# papers = sorted(list(set([doc.strip() for doc in df_Paper['docs'].tolist()])))

# Build a boolean mask for rows whose 'condition' mentions "Delirium",
# ignoring case; rows with a missing condition count as non-matching.
is_delirium = df_Paper['condition'].str.contains('Delirium', case=False, na=False)
delirium_papers = df_Paper[is_delirium]

# Report how many rows matched and show them.
print(f"Found {len(delirium_papers)} papers related to Delirium:")
display(delirium_papers)

# Distinct paper names with surrounding whitespace trimmed.
stripped_docs = delirium_papers['docs'].str.strip()
delirium_paper_names = stripped_docs.unique()
# print(f"Number of unique paper names: {len(delirium_paper_names)}")

# Hand the names, together with the graph connection, to the chunk helper.
delirium_papers_chunks = get_all_chunks_per_paper(graph, delirium_paper_names)

In [None]:
# Check that every expected paper has a chunk CSV on disk and list the ones
# that do not.

# Expected papers: the unique Delirium paper names.
expected_papers = delirium_paper_names.tolist()

# Actual papers: derived from the CSV file names in ./chunks_of_paper.
chunks_dir = './chunks_of_paper'
chunk_prefix = 'chunks_of_'
chunk_suffix = '.csv'

if os.path.isdir(chunks_dir):
    chunk_files = os.listdir(chunks_dir)
else:
    # No directory means no chunk files: report every paper as missing
    # instead of failing with FileNotFoundError.
    print(f"Directory not found: {chunks_dir}")
    chunk_files = []

actual_papers = []
for f in chunk_files:
    if not f.endswith(chunk_suffix):
        continue
    # Strip only the trailing extension and the leading prefix. A blanket
    # str.replace would also remove these substrings from the middle of a
    # paper name and produce false "missing" reports.
    paper = f[:-len(chunk_suffix)]
    if paper.startswith(chunk_prefix):
        paper = paper[len(chunk_prefix):]
    actual_papers.append(paper)

# Find missing papers
missing_papers = set(expected_papers) - set(actual_papers)

print("Missing papers:")
# Sort for a stable listing between runs; key=str keeps the sort safe if a
# non-string value ever ends up among the names.
for paper in sorted(missing_papers, key=str):
    print(f"- {paper}")
if not missing_papers:
    print("(none)")

## Step 4: Compare Question Embedding and Paper Chunk Embeddings

In [None]:
# Lift the pandas display limits (column width, row count, column count) so
# DataFrames are shown without truncation.
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Pick the question to test by its row position in the sheet.
idx = 0
test_question = df_Paper['Question'].iloc[idx]
print(f"Question {idx+1}: {test_question}")

# Loop over every Delirium paper (currently disabled in favour of the single
# paper below):
# for i, paper in enumerate(delirium_paper_names):
#     print(f"\nPaper {i+1}: {paper}")
#     paper_name = str(paper)  # without .pdf extension
#     top5chunks = compare_embeddings(
#         question=test_question,
#         paper=paper_name,
#         top_k=5
#     )

# Compare the question against a single paper, asking for the 5 best chunks.
top5chunks = compare_embeddings(question=test_question, paper='AID-ICU', top_k=5)