In [1]:

import os
from pathlib import Path

In [2]:

# --- Choose embedding & LLM backends ---
# Option A: OpenAI for embeddings + OpenAI LLM (requires OPENAI_API_KEY)
# Option B: sentence-transformers for embeddings + any LLM (or local LLM)

USE_OPENAI = False   # set False to use sentence-transformers embeddings


In [3]:
# --- Paths ---
PAPERS_DIR = Path("./papers")
VECTORSTORE_PATH = Path("./faiss_index")  # directory to save vectorstore


In [5]:

# --------------------------
# Imports (LangChain style)
# --------------------------
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

if USE_OPENAI:
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.llms import OpenAI
else:
    from langchain.embeddings import HuggingFaceEmbeddings  # sentence-transformers wrapper
    # for non-OpenAI LLM, you could use e.g., HuggingFaceHub or local LLM integration
    from langchain_groq import ChatGroq
 # placeholder - replace with a local LLM if needed

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


In [14]:
import os
os.environ["GROQ_API_KEY"] = "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"

In [6]:
# --------------------------
# 1) Load PDFs from ./papers
# --------------------------
print("Loading PDFs from", PAPERS_DIR)
loader = PyPDFDirectoryLoader(str(PAPERS_DIR))  # loads each pdf as Document with metadata['source']
raw_docs = loader.load()

print(f"Loaded {len(raw_docs)} source documents (PDF files).")
# Each raw_doc typically contains the whole PDF text in one Document object.
# We'll split into chunks.

Loading PDFs from papers
Loaded 292 source documents (PDF files).


In [7]:
# --------------------------
# 2) Split documents into chunks
# --------------------------
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
texts = splitter.split_documents(raw_docs)
print(f"Split into {len(texts)} chunks.")

# Each chunk keeps metadata including 'source' (filepath). That's important for citations.


Split into 887 chunks.


In [10]:
# --------------------------
# 3) Embed chunks & create VectorStore
# --------------------------
if USE_OPENAI:
    # Requires OPENAI_API_KEY in env
    embeddings = OpenAIEmbeddings()
else:
    # This uses sentence-transformers models under the hood.
    # Common choices: "all-MiniLM-L6-v2" (fast), "all-mpnet-base-v2" (better)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# If there's an existing FAISS index saved, load it.
if VECTORSTORE_PATH.exists():
    print("Loading existing FAISS index from disk...")
    vectordb = FAISS.load_local(
    str(VECTORSTORE_PATH),
    embeddings,
    allow_dangerous_deserialization=True
)
else:
    print("Creating FAISS index (this may take a while for many chunks)...")
    vectordb = FAISS.from_documents(texts, embeddings)
    # save it
    vectordb.save_local(str(VECTORSTORE_PATH))
    print("Saved FAISS index to", VECTORSTORE_PATH)


Loading existing FAISS index from disk...


In [17]:
# --------------------------
# 4) Build a retriever and RAG chain
# --------------------------
# We want the retriever to return multiple docs (top_k) so LLM can synthesize across them.
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Prompt template that instructs LLM to synthesize and cite sources.
SYNTHESIS_PROMPT = """You are an expert research assistant. Use the retrieved document chunks to answer the user's question.
Important rules:
1. Synthesize information from multiple retrieved documents when necessary.
2. You are an academic summarizer. Use the following context to answer the question.
3. If a fact comes from a specific page, include the page number or chunk metadata if available.
4. If the retrieved documents conflict, present both views and say which papers support each view.
5. Be concise and produce a final summary paragraph labeled "Answer".
6. At the end, list the specific sources used (filename + short metadata) under "Sources:".

Context (retrieved chunks):
{context}

User question:
{question}

Answer:
"""

prompt = PromptTemplate(template=SYNTHESIS_PROMPT, input_variables=["context", "question"])

# LLM selection - example OpenAI LLM (gpt-4 style) -- replace with your LLM if needed.
if USE_OPENAI:
    llm = OpenAI(temperature=0.0, model_name="gpt-4o")  # or "gpt-4" if available; choose per access
else:
    # Placeholder: you should replace with your LLM implementation (HFHub, LlamaCpp, etc.)
    llm = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0.3)  # fallback placeholder

# Create a RetrievalQA chain but customized to use our prompt and to return source documents
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",   # "map_reduce" or "refine" are alternatives for large contexts
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)


In [18]:
# --------------------------
# 5) Test queries
# --------------------------
def run_query(q):
    print("\n\n=== QUERY ===")
    print(q)
    result = qa_chain({"query": q})
    answer = result["result"]
    source_docs = result.get("source_documents", [])
    print("\n--- ANSWER ---\n")
    print(answer)
    print("\n--- SOURCES (from retrieved chunks) ---")
    for i, doc in enumerate(source_docs, 1):
        src = doc.metadata.get("source", "unknown")
        # try to print some context about where the chunk came from
        print(f"{i}. {src}  (chunk id: {i}, length {len(doc.page_content)} chars)")
    return result

# Example tests (you will replace titles/names by real filenames/titles)
tests = [
    "Provide a concise summary of the main findings of the paper titled 'Introduction_sleeping.pdf'.",
    "What do the different authors agree on regarding memory consolidation?",
    "Compare the methodologies used in 'Elaine Tham (thesis)_corrections_final.pdf' and 'pone.0042191.pdf'.",
    "What open questions remain in this research area according to these papers?"
]

for t in tests:
    try:
        run_query(t)
    except Exception as e:
        print("Error running query:", e)

print("\nDone.")



=== QUERY ===
Provide a concise summary of the main findings of the paper titled 'Introduction_sleeping.pdf'.

--- ANSWER ---

Unfortunately, the provided context does not include the full paper titled 'Introduction_sleeping.pdf'. However, based on the retrieved document chunks, I can provide a summary of the related findings:

The role of sleep in memory consolidation is a subject of robust scientific inquiry. Research has shown that sleep serves to preserve memory from gradual decay (Jenkins & Dallenbach, 1924) and contributes to memory consolidation (Carskadon & Dement, 2000). Studies have also differentiated the effects of sleep on various types of learning, such as declarative vs. procedural (Plihal & Born, 1997).

The paper 'Introduction_sleeping.pdf' is not explicitly mentioned in the provided context. However, the chunks suggest that the paper may discuss the effects of sleep on memory, particularly the potential weakness of sleep inertia and its effects on post-sleep perform