# DataDoc (Research-only)

## 1. Search for MVP (research papers)

In [30]:
from gettext import install

import os
import json
import requests
import pandas as pd
import time
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt

from datetime import datetime

pd.set_option("display.max_colwidth", 120)

In [31]:
# Optional but recommended: reuse one session (faster + fewer TCP handshakes)
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "DataDoc/1.0 (research project; contact: alina.skultecka@gmail.com"})

#### PubMed Central search: get PMCIDs

In [32]:
def pmc_esearch(query: str, retmax: int = 300, sort: str = "relevance",
               mindate: str = "2015", maxdate: str = "2025"):
    """
    Search the PMC database and return a list of PMC numeric IDs (as strings).

    Args:
        query (str): Search query (PMC/PubMed syntax).
        retmax (int): Max number of IDs to return.
        sort (str): 'relevance' or 'date' (and other supported sorts).
        mindate (str): Minimum publication date/year (e.g. '2015').
        maxdate (str): Maximum publication date/year (e.g. '2025').

    Returns:
        list[str]: PMC IDs.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pmc",
        "term": query,
        "retmax": retmax,
        "retmode": "json",
        "sort": sort,
        "datetype": "pdat",
        "mindate": mindate,
        "maxdate": maxdate,
    }
    r = SESSION.get(url, params=params, timeout=60)
    r.raise_for_status()
    return r.json().get("esearchresult", {}).get("idlist", [])

#### PMC fetch details + abstracts (EFetch XML)

In [26]:
# Splits a big list into smaller pieces (batches).
def _chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

In [27]:
def pmc_esummary(pmc_ids, chunk_size: int = 200, sleep_s: float = 0.12):
    """
    Get basic metadata for PMC IDs using ESummary.
    Returns a DataFrame with: pmc_id, pmcid, title, pubdate, journal, authors
    """
    if not pmc_ids:
        return pd.DataFrame(columns=["pmc_id","pmcid","title","pubdate","journal","authors"])

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    rows = []

    # Fix: chunk IDs so the URL never gets too long (prevents HTTP 414)
    pmc_ids = [str(x) for x in pmc_ids]

    for batch in _chunks(pmc_ids, chunk_size):
        params = {
            "db": "pmc",
            "id": ",".join(batch),
            "retmode": "json",
        }
        r = SESSION.get(url, params=params, timeout=60)
        r.raise_for_status()
        data = r.json()
        result = data.get("result", {})

        for pmc_id in batch:
            item = result.get(str(pmc_id), {})
            if not item:
                continue

            title = item.get("title", "")
            pubdate = item.get("pubdate", "")
            journal = item.get("fulljournalname", "") or item.get("source", "")
            authors = ", ".join(
                [a.get("name", "") for a in item.get("authors", []) if a.get("name")]
            )

            # Many PMC records also have an "articleIDs" list (sometimes includes "pmcID")
            pmcid = ""
            for aid in item.get("articleids", []):
                if aid.get("idtype") == "pmcid":
                    pmcid = aid.get("value", "")
                    break

            # Fallback: if pmcid is missing, it is usually "PMC" + pmc_id for OA items
            if not pmcid:
                pmcid = f"PMC{pmc_id}"

            rows.append({
                "pmc_id": str(pmc_id),
                "pmcid": pmcid,
                "title": title,
                "pubdate": pubdate,
                "journal": journal,
                "authors": authors,
                "url": f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/",
            })

        # polite rate limiting (after each batch)
        if sleep_s:
            time.sleep(sleep_s)

    # return AFTER all batches are processed
    return pd.DataFrame(rows)

#### PMC PDF link finder

In [28]:
def oa_pdf_links_from_pmcid(pmcids, chunk_size: int = 50, sleep_s: float = 0.0):
    """
    Batch OA (Open Access) service lookup for direct PDF links.
    Returns:
      - if input is a single pmcid string -> pdf_url_or_None
      - if input is list-like -> dict {pmcid: pdf_url_or_None}
    """
    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi"
    out = {}

    # IMPORTANT: if a single "PMC123" is passed, wrap it in a list
    single = isinstance(pmcids, str)
    if single:
        pmcids = [pmcids]

    pmcids = [str(x) for x in pmcids if str(x).strip()]

    for batch in _chunks(pmcids, chunk_size):
        r = SESSION.get(url, params={"id": ",".join(batch)}, timeout=30)
        r.raise_for_status()
        root = ET.fromstring(r.text)

        # records look like <record id="PMCxxxx"> ... <link format="pdf" href="..."/>
        for rec in root.findall(".//record"):
            rid = rec.attrib.get("id", "")
            pdf = None
            for link in rec.findall(".//link"):
                if link.attrib.get("format") == "pdf":
                    pdf = link.attrib.get("href")
                    break
            if rid:
                out[rid] = pdf

        # Some IDs may have no record if not OA; set them explicitly
        for pmcid in batch:
            out.setdefault(pmcid, None)

        if sleep_s:
            time.sleep(sleep_s)

    # If user passed one pmcid, return one value (not dict)
    return out.get(pmcids[0]) if single else out

#### Run search + show results

In [29]:
query = (
'("Metabolic Syndrome"[MeSH Terms] OR "metabolic syndrome"[TIAB])'
'AND ("Alcohol Drinking"[MeSH Terms] OR alcohol[TIAB] OR "alcohol consumption"[TIAB] OR ethanol[TIAB]) '
'AND ('
'("Fatty Liver"[MeSH Terms] OR NAFLD[TIAB] OR "non-alcoholic fatty liver"[TIAB] OR fibrosis[TIAB] OR cirrhosis[TIAB] OR "liver disease"[TIAB]) '
'OR '
'("Cardiovascular Diseases"[MeSH Terms] OR CVD[TIAB] OR "myocardial infarction"[TIAB] OR stroke[TIAB] OR hypertension[TIAB])'
') '
'AND (Sweden[TIAB] OR Swedish[TIAB] OR Scandinavia[TIAB] OR Scandinavian[TIAB] OR Nordic[TIAB])'
)
pmc_ids = pmc_esearch(query, retmax=300)

df = pmc_esummary(pmc_ids)

# Add PDF links (OA only)
df["pdf_url"] = df["pmcid"].apply(oa_pdf_links_from_pmcid)

# Keep only rows that really have a PDF
df_with_pdf = df[df["pdf_url"].notna() & (df["pdf_url"] != "")].reset_index(drop=True)

df_with_pdf


KeyboardInterrupt



#### Save results to JSON


In [69]:
out_path = "results.json"
df_with_pdf.to_json(out_path, orient="records", force_ascii=False, indent=2)
print("Saved to:", out_path)


Saved to: results.json


## 2. Download research papers

#### Imports + load your results

- Loads results.json into a pandas DataFrame (df_final)
- head() just shows the first rows (preview)

In [33]:
import os
import re
import requests
import pandas as pd

df_final = pd.read_json("results.json")
df_final.head()

Unnamed: 0,pmc_id,pmcid,title,pubdate,journal,authors,url,pdf_url
0,8514420,PMC8514420,Non-alcoholic fatty liver disease: A patient guideline,2021 Sep 17,JHEP Reports,"Francque SM, Marchesini G, Kautz A, Walmsley M, Dorner R, Lazarus JV, Zelber-Sagi S, Hallsworth K, Busetto L, Frühbe...",https://pmc.ncbi.nlm.nih.gov/articles/PMC8514420/,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/35/cd/main.PMC8514420.pdf
1,11299976,PMC11299976,EASL-EASD-EASO Clinical Practice Guidelines on the Management of Metabolic Dysfunction-Associated Steatotic Liver Di...,2024 Jun 7,Obesity Facts,"European Association for the Study of the Liver (EASL)*, European Association for the Study of Diabetes (EASD), Euro...",https://pmc.ncbi.nlm.nih.gov/articles/PMC11299976/,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ca/74/ofa-2024-0017-0004-539371.PMC11299976.pdf
2,6796246,PMC6796246,UEG Week 2018 Poster Presentations,2018 Oct 21,United European Gastroenterology Journal,,https://pmc.ncbi.nlm.nih.gov/articles/PMC6796246/,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/c1/97/10.1177_2050640618792819.PMC6796246.pdf
3,10588738,PMC10588738,TASL Practice Guidance on the Clinical Assessment and Management of Patients with Nonalcoholic Fatty Liver Disease,2023 Mar 15,Hepatology Forum,"Yilmaz Y, Zeybel M, Adali G, Cosar AM, Sertesen E, Gokcan H, Bahcecioglu HI, Sahin M, Tulunay C, Ergun I, Turan I, I...",https://pmc.ncbi.nlm.nih.gov/articles/PMC10588738/,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/4d/4f/hf-4-s1.PMC10588738.pdf
4,8771615,PMC8771615,Non-alcoholic fatty liver disease and hepatocellular carcinoma: Clinical challenges of an intriguing link,2022 Jan 21,World Journal of Gastroenterology,"Chrysavgis L, Giannakodimos I, Diamantopoulou P, Cholongitas E",https://pmc.ncbi.nlm.nih.gov/articles/PMC8771615/,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/12/03/WJG-28-310.PMC8771615.pdf


#### Takes a text (usually pmcid - title) and cleans it so Windows accepts it as a filename

- Helper: make a Windows-safe filename
- Purpose: turn something like
PMC123456 - A Study: alcohol/NAFLD?
into a filename Windows allows.

In [71]:
def _safe_filename(text: str, max_len: int = 140) -> str:
    """Make a Windows-safe filename."""
    if text is None:
        return "paper"
    text = str(text).strip()
    text = re.sub(r'[\\/:*?"<>|]+', "_", text)   # Windows illegal chars
    text = re.sub(r"\s+", " ", text)
    return text[:max_len].strip()

#### Main function: download PDFs for all rows in my DataFrame

In [72]:
def download_pdfs(df: pd.DataFrame, download_folder: str = "files",
                  url_col: str = "pdf_url", id_col: str = "pmcid",
                  title_col: str = "title"):
    """
    Download PDFs from URLs in a DataFrame and save them to a folder.
    Adds a new column 'pdf_file_name' with the saved file path (or None if failed).

    Args:
        df (pd.DataFrame): Must contain a column with PDF URLs (default: 'pdf_url').
        download_folder (str): Folder where PDFs will be saved.
        url_col (str): Name of the column containing PDF URLs.
        id_col (str): Column to use for file naming (e.g. 'pmcid' or 'paper_id').
        title_col (str): Column to use for file naming (optional).

    Returns:
        pd.DataFrame: Same df with new column 'pdf_file_name'.
    """

    # Check that the DataFrame has the pdf column
    if url_col not in df.columns:
        raise ValueError(f"DataFrame must contain column '{url_col}'")

    os.makedirs(download_folder, exist_ok=True)

    pdf_file_names = []
    headers = {"User-Agent": "DataDoc/1.0 (research project)"}

    for _, row in df.iterrows():
        pdf_link = row.get(url_col)
        pdf_link = str(pdf_link).strip()
        if pdf_link.startswith("ftp://ftp.ncbi.nlm.nih.gov/"):
            pdf_link = pdf_link.replace("ftp://ftp.ncbi.nlm.nih.gov/", "https://ftp.ncbi.nlm.nih.gov/")

        # Skip rows without a link
        if pd.isna(pdf_link) or not str(pdf_link).strip():
            pdf_file_names.append(None)
            continue

        # Build a nice filename
        paper_id = row.get(id_col) or row.get("paper_id") or row.get("pmid") or "paper"
        title = row.get(title_col) or ""
        base = _safe_filename(f"{paper_id} - {title}" if title else str(paper_id))
        file_name = os.path.join(download_folder, base + ".pdf")

        try:
            # Download (stream so large PDFs don't eat RAM)
            r = requests.get(str(pdf_link), headers=headers, stream=True, timeout=60, allow_redirects=True)
            r.raise_for_status()

            # Optional check: confirm it's a PDF
            content_type = (r.headers.get("Content-Type") or "").lower()
            if "pdf" not in content_type:
                pdf_file_names.append(None)
                print(f"Skipped (not PDF content-type): {pdf_link}")
                continue

            with open(file_name, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 64):
                    if chunk:
                        f.write(chunk)

            pdf_file_names.append(file_name)
            print(f"Downloaded: {file_name}")

        except requests.exceptions.RequestException as e:
            print(f"Failed: {pdf_link} -> {e}")
            pdf_file_names.append(None)

    df = df.copy()
    df["pdf_file_name"] = pdf_file_names
    return df


In [180]:
df_final = download_pdfs(df_final, download_folder="data", url_col="pdf_url", id_col="pmcid", title_col="title")
df_final[["pmcid", "title", "pdf_url", "pdf_file_name"]].head()

NameError: name 'download_pdfs' is not defined

Saving metadata about our papers to be able to:
- resume work tomorrow without re-downloading,
- debugging (see which PDFs downloaded/failed),
- reproducibility (a frozen input to the chunking step).

In [35]:
df_final.to_json("data/results_with_files.json", orient="records", indent=2)
print("Saved:", os.path.abspath("data/results_with_files.json"))

Saved: C:\Users\grigo\Desktop\Python_Tasks\DataDoc\data\results_with_files.json


## 3. Loading and Splitting PDF Files into Chunks, Expanding the DataFrame

Load_and_chunk_pdf works on one PDF file and returns a list of text chunks
- Takes a path to one PDF.
- Loads the PDF using LangChain’s PyPDFLoader.
- Splits the PDF text into chunks using RecursiveCharacterTextSplitter.
- docs becomes a list of LangChain Document objects.

In [36]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_and_chunk_pdf(pdf_file_name: str, chunk_size: int = 512, chunk_overlap: int = 64):
    """
    Loads a PDF file and splits its content into chunks.

    Args:
        pdf_file_name (str): Path to the PDF file.
        chunk_size (int): Max characters per chunk.
        chunk_overlap (int): Overlap between chunks to preserve context.

    Returns:
        list: List of LangChain Document chunks.
    """
    print(f"Loading and splitting into chunks: {pdf_file_name}")

    loader = PyPDFLoader(pdf_file_name)
    docs = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(docs)
    return chunks

Expand_df works on the whole dataframe of many papers, calls load_and_chunk_pdf for each PDF, and turns all those chunks into a new dataframe where each chunk becomes its own row with metadata (pmcid/title/page/links to neighbor chunks).

In [37]:
def expand_df(df: pd.DataFrame,
              id_col: str = "pmcid",
              fallback_id_col: str = "paper_id",
              pdf_col: str = "pdf_file_name"):
    """
    Expand each paper row into many rows (one per text chunk from the PDF).

    Requires:
      - a PDF file path column (default: 'pdf_file_name')
      - an ID column for stable chunk ids (default: 'pmcid'; falls back to 'paper_id')
      - load_and_chunk_pdf(pdf_path) must already exist

    Output columns:
      - id:        unique chunk id like "PMC1234567#12"
      - title, authors, year, journal, abstract, url: copied from the paper row (if present)
      - chunk:     the chunk text
      - prechunk_id / postchunk_id: links to adjacent chunks
      - source_file: original PDF path
    """
    expanded_rows = []

    for _, row in df.iterrows():
        pdf_path = row.get(pdf_col)

        if pd.isna(pdf_path) or not str(pdf_path).strip():
            continue

        # normalize path
        pdf_path = os.path.normpath(str(pdf_path).strip())

        # make it absolute (based on current notebook folder)
        pdf_path = os.path.abspath(pdf_path)

        # verify it exists
        if not os.path.isfile(pdf_path):
            print(f"Missing file on disk: {pdf_path}")
            continue

        # Choose a stable id for chunk naming (PMC8514420#0, PMC8514420#1)
        doc_id = row.get(id_col)
        if pd.isna(doc_id) or not str(doc_id).strip():
            doc_id = row.get(fallback_id_col)
        doc_id = str(doc_id).strip() if doc_id else "paper"

        try:
            chunks = load_and_chunk_pdf(pdf_path)
        except Exception as e:
            print(f"Error processing file {pdf_path}: {e}")
            continue

        for i, chunk in enumerate(chunks):
            pre_id = f"{doc_id}#{i-1}" if i > 0 else ""
            post_id = f"{doc_id}#{i+1}" if i < len(chunks) - 1 else ""

            expanded_rows.append({
                "id": f"{doc_id}#{i}",
                "doc_id": doc_id,                 # helpful for grouping
                "paper_id": row.get("paper_id", None),
                "pmcid": row.get("pmcid", None),
                "title": row.get("title", None),
                "authors": row.get("authors", None),
                "year": row.get("year", None),
                "journal": row.get("journal", None),
                "abstract": row.get("abstract", None),
                "url": row.get("url", None),
                "chunk": chunk.page_content,
                "page": chunk.metadata.get("page", None),
                "prechunk_id": pre_id,
                "postchunk_id": post_id,
                "source_file": pdf_path,
            })

    return pd.DataFrame(expanded_rows)

In [38]:
df_final = pd.read_json("data/results_with_files.json")

df_chunks = expand_df(df_final, pdf_col="pdf_file_name")

print("Total chunks:", len(df_chunks))
df_chunks.head()

Total chunks: 0


## 4.Building a Knowledge Base for the RAG System Using Embedding

In [39]:
from dotenv import load_dotenv, find_dotenv

# Load the API keys from .env
load_dotenv(find_dotenv(), override=True)

True

In [40]:
import os
from getpass import getpass

from semantic_router.encoders import OpenAIEncoder

# Check if 'OPENAI_API_KEY' is set; prompt if not
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY') or getpass('OpenAI API key: ')

# Initialize the OpenAIEncoder with a specific model, does the embedding
encoder = OpenAIEncoder(name='text-embedding-3-small')

#### Creating a Pinecone Index

In [41]:
import os
from getpass import getpass

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec  # keep this if you create indexes elsewhere

# Get API key
api_key = os.getenv("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# Initialize Pinecone (gRPC data-plane client)
pc = Pinecone(api_key=api_key)

# If you still need spec for CREATE index calls, keep it:
spec = ServerlessSpec(cloud="aws", region="us-east-1")


In [42]:
import time

# Define the name of the index
index_name = 'langgraph-research-agent'
dims = 1536  # <- must match your embedding vector length

pc = Pinecone(api_key=api_key)

# List index names (helper exists in the SDK docs)
existing = pc.list_indexes().names()  # if these errors, see note below

if index_name not in existing:
    pc.create_index(
        name=index_name,
        dimension=dims,
        metric="cosine",
        spec=spec
    )
    # Wait until ready
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Connect to the index
index_desc = pc.describe_index(index_name)
index = pc.Index(host=index_desc.host)

In [43]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'pmc': {'vector_count': 52829}},
 'total_vector_count': 52829}

## 5. Populating the Knowledge Base and Uploading it to Pinecone

Sanitize metadata strings before upsert, because one of your metadata strings (or a “chunk” you stored in metadata) contains fancy Unicode punctuation like:

- en-dash – / em-dash —
- smart quotes “ ” ‘ ’
- ellipsis …

In [44]:
import unicodedata
import pandas as pd

LIGATURES = {
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\ufb05": "ft",
    "\ufb06": "st",
}

# Characters that commonly break latin-1 encoding or cause weird request bodies
PUNCT_MAP = {
    "\u2010": "-",  # hyphen
    "\u2011": "-",  # non-breaking hyphen
    "\u2012": "-",  # figure dash
    "\u2013": "-",  # en-dash  ← your error
    "\u2014": "-",  # em-dash
    "\u2212": "-",  # minus sign
    "\u2018": "'",  # left single quote
    "\u2019": "'",  # right single quote
    "\u201B": "'",  # single high-reversed-9 quote
    "\u201C": '"',  # left double quote
    "\u201D": '"',  # right double quote
    "\u201E": '"',  # double low-9 quote
    "\u00A0": " ",  # non-breaking space
    "\u2026": "...",# ellipsis
}

def normalize_text(s: object, *, latin1_safe: bool = True, max_len: int | None = None) -> str:
    """
    Normalize text for embeddings + Pinecone metadata.
    - Fix ligatures
    - Convert problematic punctuation to ASCII
    - Unicode normalize (NFKC)
    - Remove control characters (except \\n \\t)
    - Optionally force latin-1 safety (drop characters not representable)
    """
    if s is None:
        return ""

    s = str(s)

    # 1) Replace ligatures first (from PDFs)
    for k, v in LIGATURES.items():
        s = s.replace(k, v)

    # 2) Replace punctuation that commonly causes encoding issues
    # (do before NFKC so it doesn't re-introduce weird forms)
    for k, v in PUNCT_MAP.items():
        s = s.replace(k, v)

    # 3) Normalize unicode to a standard form
    s = unicodedata.normalize("NFKC", s)

    # 4) Remove control characters (keep \n and \t)
    s = "".join(ch for ch in s if ch in ("\n", "\t") or ord(ch) >= 32)

    # 5) If you want to guarantee no latin-1 crashes anywhere:
    if latin1_safe:
        # keep Swedish chars (åäö) fine; drop emoji/rare symbols that can break clients
        s = s.encode("latin-1", errors="ignore").decode("latin-1")

    # 6) Optional truncate (useful for metadata previews)
    if max_len is not None and len(s) > max_len:
        s = s[:max_len]

    return s

In [45]:
def clean_metadata(d: dict) -> dict:
    """
    Pinecone metadata values must be: string, number, boolean, or list of strings.
    Removes None/NaN and normalizes strings so upsert won't fail.
    """
    clean = {}
    for k, v in d.items():
        if v is None:
            continue

        # skip NaN
        try:
            if pd.isna(v):
                continue
        except Exception:
            pass

        if isinstance(v, str):
            clean[k] = normalize_text(v, latin1_safe=True)
        elif isinstance(v, list):
            # keep only strings, normalized; ignore None/non-string values
            clean[k] = [normalize_text(x, latin1_safe=True) for x in v if isinstance(x, str)]
        else:
            # numbers/bools OK
            clean[k] = v

    return clean

In [46]:
from tqdm.auto import tqdm
import numpy as np

data = df_chunks.reset_index(drop=True)
batch_size = 64

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end].to_dict(orient="records")

    ids = [str(r["id"]) for r in batch]

    # normalize chunk text before embedding
    texts = [normalize_text(r.get("chunk") or "", latin1_safe=False) for r in batch]
    # ^ latin1_safe=False here is fine because embeddings can handle unicode;
    # we only need latin1_safe=True for metadata going through Pinecone client.

    embeds = encoder(texts)

    if isinstance(embeds, np.ndarray):
        embeds = embeds.tolist()

    if len(embeds) != len(ids):
        raise ValueError(f"Embedding count mismatch: {len(embeds)} vs {len(ids)}")
    if len(embeds) > 0 and len(embeds[0]) != dims:
        raise ValueError(f"Embedding dim mismatch: got {len(embeds[0])}, expected {dims}")

    metadata = []
    for r, txt in zip(batch, texts):
        md = {
            "doc_id": r.get("doc_id"),
            "pmcid": r.get("pmcid"),
            "paper_id": r.get("paper_id"),
            "title": r.get("title"),
            "year": r.get("year"),
            "journal": r.get("journal"),
            "url": r.get("url"),
            "page": r.get("page"),
            # keep preview short + latin1-safe
            "chunk_preview": normalize_text(txt, latin1_safe=True, max_len=300),
        }
        metadata.append(clean_metadata(md))

    vectors = [{"id": ids[j], "values": embeds[j], "metadata": metadata[j]} for j in range(len(ids))]

    index.upsert(vectors=vectors, namespace="pmc")

index.describe_index_stats()

0it [00:00, ?it/s]

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'pmc': {'vector_count': 52829}},
 'total_vector_count': 52829}

## 6. Fetch a PMC article page

It takes information directly from NCBI / PubMed Central (PMC) over the internet.
- “Search/fetch from PMC servers” → parse XML → return abstract.

In [108]:
from langchain_core.tools import tool
import requests
import xml.etree.ElementTree as ET
import re

UA = {"User-Agent": "DataDoc/1.0 (research project)"}

def _clean_ws(s: str) -> str:
    # collapse repeated whitespace but keep newlines
    s = re.sub(r"[ \t\r\f\v]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

@tool("fetch_pmc_xml")
def fetch_pmc_xml(pmcid: str) -> str:
    """
    Fetch the FULL abstract from PMC using NCBI E-utilities (EFetch) XML.
    Returns a plain string (abstract) or an error message if not found.
    """
    pmcid = pmcid.strip()
    if pmcid.upper().startswith("PMC"):
        pmc_numeric = pmcid[3:]
    else:
        pmc_numeric = pmcid

    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pmc", "id": pmc_numeric, "retmode": "xml"}

    res = requests.get(efetch_url, params=params, headers=UA, timeout=30)
    res.raise_for_status()
    root = ET.fromstring(res.text)

    # 1) Try common JATS locations for abstracts
    abstract_nodes = []
    abstract_nodes.extend(root.findall(".//abstract"))
    abstract_nodes.extend(root.findall(".//trans-abstract"))  # sometimes translated abstracts

    parts: list[str] = []

    for abs_node in abstract_nodes:
        # Prefer <p> sections if they exist
        ps = abs_node.findall(".//p")
        if ps:
            for p in ps:
                txt = "".join(p.itertext())
                txt = _clean_ws(txt)
                if txt:
                    parts.append(txt)
        else:
            # No <p> tags: take all text under <abstract>
            txt = "".join(abs_node.itertext())
            txt = _clean_ws(txt)
            if txt:
                parts.append(txt)

    # Remove duplicates while preserving order (sometimes abstract repeats)
    seen = set()
    uniq_parts = []
    for t in parts:
        if t not in seen:
            seen.add(t)
            uniq_parts.append(t)

    abstract = "\n\n".join(uniq_parts).strip()
    return abstract if abstract else "Abstract not found in PMC XML."

In [109]:
print(fetch_pmc_xml.invoke({"pmcid": "PMC11231708"}))

Deteriorated sinusitis and increased adiposity relative to muscle mass may affect quality of life in patients with asthma. However, whether these effects are observed regardless of intrapulmonary pathology is unknown.

We evaluated the correlation of the cross-sectional ratio of abdominal visceral fat (VF) to erector spinae muscle (ESM) and sinus findings based on Lund-Mackey scoring system (LMS) on computed tomography (CT) with the impaired score of the Asthma Quality of Life Questionnaire (AQLQ), regardless of airway and parenchymal disease, in patients with asthma.

We recruited participants from the Hokkaido-based severe asthma cohort who had completed AQLQ and CT examination at the entry. The participants were divided into high (highest) and low (other quartiles) groups on the bases of the extrapulmonary indices. Multivariate analysis examined the association of VF/ESM for the adiposity-to-muscle ratio and LMS with AQLQ after adjusting for the airway fractal dimension for airway i

## 7. Integrating Google SerpAPI for Web Search

In [49]:
from dotenv import load_dotenv, find_dotenv

# Load the API keys from .env
load_dotenv(find_dotenv(), override=True)

True

In [134]:
from langchain_core.tools import tool
from serpapi import GoogleSearch
import os
from getpass import getpass

SERPAPI_KEY = os.getenv("SERPAPI_KEY") or getpass("SerpAPI key: ")

serpapi_params = {
    "engine": "google",
    "api_key": SERPAPI_KEY,
}

@tool("web_search")
def web_search(query: str) -> str:
    """
    Finds general information using a Google search via SerpAPI.

    Args:
        query (str): The search query string.

    Returns:
        str: Top results formatted as:
             Title
             Snippet
             Link
             ---
             ...
    """
    search = GoogleSearch({
        **serpapi_params,
        "q": query,
        "num": 5
    })

    results = search.get_dict().get("organic_results", [])

    if not results:
        return "No results found."

    formatted_results = "\n---\n".join(
        "\n".join([
            r.get("title", "").strip(),
            r.get("snippet", "").strip(),
            r.get("link", "").strip(),
        ]).strip()
        for r in results
    )

    return formatted_results

In [52]:
print(web_search.invoke({"query": "metabolic syndrome alcohol Sweden site:pmc.ncbi.nlm.nih.gov"}))

Comparison of associations between alcohol consumption ...
Adjusted for age and sex, medium-high alcohol consumption was associated with lower odds of MetS compared to low consumption, while no difference was observed ...
https://pmc.ncbi.nlm.nih.gov/articles/PMC11231701/
---
Consumption of Alcoholic Beverages and the Prevalence of ...
One prospective study found a linear increase in metabolic syndrome risk with an increase in alcohol consumption [8].
https://pmc.ncbi.nlm.nih.gov/articles/PMC6893759/
---
Alcohol use disorder and alcohol-related mortality after ...
Increased sensitivity to alcohol due to an altered alcohol metabolism has been observed, especially after gastric bypass (GBP) surgery.
https://pmc.ncbi.nlm.nih.gov/articles/PMC12509842/
---
Interactions between the metabolic syndrome and alcohol ...
People with an alcohol use disorder (AUD) have a higher risk of T2DM compared with people in the general population. ... This is most likely due to the ...
https://pmc.ncbi.nlm.n

## 8. Creating RAG Tools for Retrieval-Augmented Generation (RAG)

In [136]:
from langchain_core.tools import tool
import re
from typing import Any

In [135]:
# Helpers
def normalize_pmcid(pmcid: str) -> str:
    """Return canonical PMCID like 'PMC11231701'."""
    pmcid = (pmcid or "").strip()
    if not pmcid:
        return ""
    if not pmcid.upper().startswith("PMC"):
        pmcid = "PMC" + pmcid
    return "PMC" + pmcid[3:]  # normalize casing

def format_rag_contexts(matches: list[dict[str, Any]]) -> str:
    """
    Format Pinecone matches into readable context for LLM/user.
    Uses safe fallbacks for common metadata field names.
    """
    if not matches:
        return "No matches found."

    blocks: list[str] = []
    for i, m in enumerate(matches, start=1):
        md = m.get("metadata") or {}
        score = m.get("score", "")

        # Common metadata keys (use fallbacks because schemas differ)
        title = md.get("title", "")
        pmcid = md.get("pmcid", "")
        year = md.get("year", "") or md.get("pub_year", "") or md.get("pubdate", "")
        journal = md.get("journal", "")
        page = md.get("page", "") or md.get("page_number", "")
        url = md.get("url", "") or md.get("article_url", "") or md.get("pdf_url", "")
        chunk = md.get("chunk_preview") or md.get("chunk") or md.get("text") or ""

        # Make chunk readable
        chunk = " ".join(str(chunk).split())

        lines = [
            f"[Match {i}] Score: {score}",
            f"Title: {title}",
            f"PMCID: {pmcid}",
        ]
        if year:    lines.append(f"Year/PubDate: {year}")
        if journal: lines.append(f"Journal: {journal}")
        if page != "": lines.append(f"Page: {page}")
        if url:     lines.append(f"URL: {url}")
        if chunk:   lines.append(f"Chunk: {chunk}")

        blocks.append("\n".join(lines).strip())

    return "\n---\n".join(blocks)

## `rag_search` (global search)

Use this when the user **hasn’t chosen an article** yet.

- Searches **all chunks in Pinecone** (all papers you indexed)
- Best for questions like:
  - “Find papers about metabolic syndrome and alcohol in Sweden”
  - “What does research say about NAFLD and alcohol?”
- It helps you **discover** which paper(s) are relevant.

## `rag_search_filter_pmc` (search inside one specific paper)

Use this when the user **already knows the paper** (or you already picked it).

- Searches **only within one PMCID** using `filter={"pmcid": ...}`
- Best for questions like:
  - “In PMC11231701, what did they conclude?”
  - “What are the methods in this paper?”
- It prevents the model from mixing info from other papers and keeps answers **consistent**.

## Why filtering matters (real RAG reason)

If you ask: *“What’s the conclusion?”* and you use global `rag_search`,
Pinecone might return chunks from **different papers** that all mention “conclusion”.
Then the LLM may combine them into one answer → **wrong or mixed**.

With `rag_search_filter_pmc`, you guarantee:
- all chunks come from the same article
- answers match that one paper
- easier citations (“this is from PMC11231701”)

## Typical best workflow in the app

1. User asks a topic → call `rag_search(query)` to find relevant papers
2. Show top candidates (pmcid / title)
3. User picks one (or your agent picks the best)
4. All follow-up questions use `rag_search_filter_pmc(query, pmcid)`

In [143]:
# Tools
@tool("rag_search")
def rag_search(query: str) -> str:
    """
    Search ALL PMC chunks in Pinecone (no PMCID filter).
    """
    query = (query or "").strip()
    if not query:
        return "Please provide a non-empty query."

    # encoder returns list of vectors; take first
    xq = encoder([query])[0]

    xc = index.query(
        vector=xq,
        top_k=6,
        include_metadata=True,
        namespace="pmc"
    )

    return format_rag_contexts(xc.get("matches", []))

@tool("rag_search_filter_pmc")
def rag_search_filter_pmc(query: str, pmcid: str) -> str:
    """
    Search Pinecone but restrict results to ONE PMC article (PMCID filter).
    """
    query = (query or "").strip()
    if not query:
        return "Please provide a non-empty query."

    pmcid_norm = normalize_pmcid(pmcid)
    if not pmcid_norm:
        return "Please provide a PMCID like 'PMC11231701'."

    xq = encoder([query])[0]

    xc = index.query(
        vector=xq,
        top_k=6,
        include_metadata=True,
        namespace="pmc",
        filter={"pmcid": pmcid_norm}
    )

    return format_rag_contexts(xc.get("matches", []))

In [137]:
print(rag_search.invoke({"query": "association between alcohol consumption and metabolic syndrome"}))

[Match 1] Score: 0.7077445983886719
Title: Non-alcoholic fatty liver disease: A patient guideline
PMCID: PMC8514420
Journal: JHEP Reports
Page: 31.0
URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC8514420/
Chunk: https://doi.org/10.1016/j.jhepr.2019.08.002. [33] Blomdahl J, Nasr P, Ekstedt M, Kechagias S. Moderate alcohol con- sumption is associated with advanced fibrosis in non-alcoholic fatty liver disease and shows a synergistic effect with type 2 diabetes melli- tus. Metab Clin Exp 2021:115. https://doi.
---
[Match 2] Score: 0.6968680024147034
Title: Serum keratin-18 detects hepatic inflammation and predicts progression in compensated alcohol-associated liver disease
PMCID: PMC9701478
Journal: Hepatology Communications
Page: 11.0
URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC9701478/
Chunk: alcohol consumption and the metabolic syndrome: cofactors for progressive fatty liver disease. J Hepatol. 2018;68:251- 67. 22. Saunders JB, Aasland OG, Babor TF, de la Fuente JR, Grant M. Developm

In [139]:
print(rag_search_filter_pmc.invoke({
    "query": "association between alcohol consumption and metabolic syndrom",
    "pmcid": "PMC10954435"
}))

No matches found.


## 9. Implementing the Final Answer Generation Tool

In [229]:
from langchain_core.tools import tool
from pydantic import BaseModel, Field
from typing import List, Union

class FinalAnswerArgs(BaseModel):
    question: str = Field(..., description="The user's question")
    introduction: str = Field(..., description="Intro paragraph")
    research_steps: Union[str, List[str]] = Field(..., description="Steps taken")
    main_body: str = Field(..., description="Main answer text")
    conclusion: str = Field(..., description="Short conclusion")
    sources: Union[str, List[str]] = Field(default_factory=list, description="Sources list")

@tool("final_answer", args_schema=FinalAnswerArgs)
def final_answer(
    question: str,
    introduction: str,
    research_steps: Union[str, List[str]],
    main_body: str,
    conclusion: str,
    sources: Union[str, List[str]],
) -> str:
    """
    Returns a formatted research report string (DataDoc style).
    """

    # bullets for steps
    if isinstance(research_steps, list):
        research_steps = "\n".join([f"- {r}" for r in research_steps])
    else:
        research_steps = str(research_steps)

    # bullets for sources
    if isinstance(sources, list):
        sources = "\n".join([f"- {s}" for s in sources])
    else:
        sources = str(sources)

    return (
        f"**Question:** {question}\n\n"
        f"{introduction}\n\n"
        f"## Research steps\n{research_steps}\n\n"
        f"## Main body\n{main_body}\n\n"
        f"## Conclusion\n{conclusion}\n\n"
        f"## Sources\n{sources if sources.strip() else '- (No sources)'}"
    )

In [230]:
question = "What does this paper say about alcohol and metabolic syndrome?"

report = final_answer.invoke({
    "question": question,
    "introduction": "This answer is generated for study/research purposes (not medical advice).",
    "research_steps": [
        "Embedded the query with text-embedding-3-small.",
        "Queried Pinecone (namespace: pmc) for the top matching chunks.",
        "Summarized findings based on retrieved contexts."
    ],
    "main_body": "Write your answer here (or generate it with an LLM).",
    "conclusion": "Short summary here.",
    "sources": [
        "Paper title (PMCxxxx) p. 3 https://pmc.ncbi.nlm.nih.gov/articles/PMCxxxx/",
        "Another paper title (PMCyyyy) p. 1 https://pmc.ncbi.nlm.nih.gov/articles/PMCyyyy/"
    ]
})

print(report)

**Question:** What does this paper say about alcohol and metabolic syndrome?

This answer is generated for study/research purposes (not medical advice).

## Research steps
- Embedded the query with text-embedding-3-small.
- Queried Pinecone (namespace: pmc) for the top matching chunks.
- Summarized findings based on retrieved contexts.

## Main body
Write your answer here (or generate it with an LLM).

## Conclusion
Short summary here.

## Sources
- Paper title (PMCxxxx) p. 3 https://pmc.ncbi.nlm.nih.gov/articles/PMCxxxx/
- Another paper title (PMCyyyy) p. 1 https://pmc.ncbi.nlm.nih.gov/articles/PMCyyyy/


## 10. Initializing the "Oracle" LLM

In [231]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

system_prompt = """You are the oracle, the great AI decision-maker.
Given the user's query, decide what to do using the tools available.

Hard rules:
- If the scratchpad already contains the same tool + same input, do NOT repeat it.
- Do NOT use any tool more than twice total.
- Prefer diverse sources (RAG + web) when appropriate.
- When you have enough information, call final_answer and stop.
- If rag_search_filter_pmc returns 'No matches found' do not use this tool again.

Tool input schemas (IMPORTANT):
- rag_search expects: {{\"query\": \"...\"}}
- rag_search_filter_pmc expects: {{\"query\": \"...\", \"pmcid\": \"PMC...\"}}
- fetch_pmc_xml expects: {{\"pmcid\": \"PMC...\"}}
- web_search expects: {{\"query\": \"...\"}}
- final_answer expects the structured fields in its schema.

Strategy:
1) Use rag_search once.
2) Optionally use rag_search_filter_pmc or fetch_pmc_xml.
3) Optionally use web_search once.
4) Then call final_answer.
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
    ("assistant", "scratchpad:\n{scratchpad}"),
])


In [232]:
from langchain_openai import ChatOpenAI
import os

# Initialize the OpenAI language model with specific settings.
llm = ChatOpenAI(
    model="gpt-4o",
    api_key=os.environ["OPENAI_API_KEY"],  # in newer LangChain this is api_key
    temperature=0,
)

# Define the list of tools available to the oracle.
tools = [
    rag_search_filter_pmc,
    rag_search,
    fetch_pmc_xml,
    web_search,
    final_answer,
]

# Function to create the scratchpad from the intermediate tool calls.
from typing import Any

def create_scratchpad(intermediate_steps: list[tuple[Any, Any]]) -> str:
    research_steps: list[str] = []

    for step_i, (action, observation) in enumerate(intermediate_steps, start=1):
        # action usually has: tool, tool_input, log
        tool_name = getattr(action, "tool", "unknown_tool")
        tool_input = getattr(action, "tool_input", None)
        log = getattr(action, "log", "")

        # Some templates store placeholder logs like "TBD"
        if log == "TBD":
            continue

        research_steps.append(
            f"Step {step_i}\n"
            f"Tool: {tool_name}\n"
            f"Input: {tool_input}\n"
            f"Output: {observation}"
        )

    return "\n---\n".join(research_steps)

# Define the oracle's decision-making pipeline.
from langchain_core.runnables import RunnableLambda

oracle = (
    {
        "input": lambda x: x["input"],
        "chat_history": lambda x: x.get("chat_history", []),
        "scratchpad": lambda x: create_scratchpad(x.get("intermediate_steps", [])),
    }
    | prompt
    | llm.bind_tools(tools, tool_choice="any")
)

result = oracle.invoke({
    "input": "Your question here",
    "chat_history": [],
    "intermediate_steps": [],   # will fill up as tools run (or keep empty at start)
})

## 11. Testing the Oracle and the Tools

**rag_search**
user_input = "What are common risk factors for non-alcoholic fatty liver disease?"

**rag_search_filter_pmc**
user_input = "In PMC5897854, what limitations do the authors list and what future research do they suggest? Use only this paper."

**web_search**
user_input = 'Who won the Super Bowl 2024?'

**fetch_pmc_xml**
user_input = "Summarize the main findings of PMC5808635."

In [235]:
#rag_search
#user_input = "What are common risk factors for non-alcoholic fatty liver disease?"

#rag_search_filter_pmc
#user_input = "In PMC10029939, what limitations do the authors list and what future research do they suggest? Use only this paper."

#web_search
#user_input = 'Who won the Super Bowl 2024?'

#fetch_pmc_xml
user_input = "Summarize the main findings of PMC5808635."

inputs = {
    "input": user_input,
    "chat_history": [],
    "intermediate_steps": [],   # starts empty
}

out = oracle.invoke(inputs)
print(out)

content='' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 509, 'total_tokens': 536, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_deacdd5f6f', 'id': 'chatcmpl-CqlVaxf84QU0ZpwHTzSDUPFscUpFP', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='lc_run--019b5708-3602-7881-94d6-dd172d44b894-0' tool_calls=[{'name': 'rag_search_filter_pmc', 'args': {'query': 'main findings', 'pmcid': 'PMC5808635'}, 'id': 'call_N3SocHAwKuqLQXphW7xmSaZM', 'type': 'tool_call'}] usage_metadata={'input_tokens': 509, 'output_tokens': 27, 'total_tokens': 536, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}


In [236]:
# Display the name of the tool
out. tool_calls[0]['name']

'rag_search_filter_pmc'

## 12. Building a Decision-Making Pipeline

In [237]:
# Helpers
def _tool_called(steps, tool_name: str) -> bool:
    return any(getattr(a, "tool", None) == tool_name for a, _ in steps)

def _last_observation_for(steps, tool_name: str) -> str:
    for a, obs in reversed(steps):
        if getattr(a, "tool", None) == tool_name:
            return obs or ""
    return ""

def _rag_no_matches_for_same_pmc(steps, pmcid: str) -> bool:
    # did we already get "No matches found." for this PMCID?
    for a, obs in steps:
        if getattr(a, "tool", None) == "rag_search_filter_pmc":
            args = getattr(a, "tool_input", {}) or {}
            if args.get("pmcid") == pmcid and (obs or "").strip() == "No matches found.":
                return True
    return False

In [238]:
# set flag when fetch_pmc_xml succeeds
def run_tool(state: dict) -> dict:
    steps = state.get("intermediate_steps", [])
    last_action, _ = steps[-1]

    tool_name = last_action.tool
    tool_args = last_action.tool_input or {}

    observation = tool_str_to_func[tool_name].invoke(tool_args)

    # update log too
    updated_action = AgentAction(
        tool=last_action.tool,
        tool_input=last_action.tool_input,
        log=str(observation),
    )

    new_steps = steps[:-1] + [(updated_action, str(observation))]

    # STOP FLAG: if we got a real abstract, store it in state
    updates = {"intermediate_steps": new_steps}

    if tool_name == "fetch_pmc_xml":
        obs_text = (observation or "").strip()
        if obs_text and "Abstract not found" not in obs_text:
            updates["have_abstract"] = True
            updates["abstract_text"] = obs_text

    return {**state, **updates}

In [239]:
from langchain_core.agents import AgentAction

MAX_TOOL_STEPS = 8

def run_oracle(state: dict) -> dict:
    steps = state.get("intermediate_steps", [])
    question = state.get("input", "")

    # ✅ HARD STOP: if abstract exists, jump straight to final_answer
    if state.get("have_abstract") and state.get("abstract_text"):
        abs_text = state["abstract_text"]

        action = AgentAction(
            tool="final_answer",
            tool_input={
                "question": question,
                "introduction": "This answer is generated for study/research purposes (not medical advice).",
                "research_steps": [
                    "Fetched abstract from PMC via NCBI EFetch (XML).",
                    "Summarized the abstract."
                ],
                "main_body": (
                    "### Abstract (retrieved)\n"
                    f"{abs_text}\n\n"
                    "### What this paper is about (summary)\n"
                    "- Describes the REAPPEAR study on preventing acute recurrent pancreatitis (ARP) related to alcohol and smoking.\n"
                    "- International multicentre randomized controlled trial (REAPPEAR-T) + prospective cohort (REAPPEAR-C).\n"
                    "- Enrolls daily smokers hospitalized with alcohol-induced acute pancreatitis.\n"
                    "- Tests a combined cessation program with structured follow-ups.\n"
                    "- Primary endpoint: 2-year recurrence of AP and/or 2-year all-cause mortality; also evaluates cost-effectiveness.\n"
                ),
                "conclusion": "Summary is based on the retrieved abstract text.",
                "sources": [f"PMC {question} (abstract via NCBI EFetch)"]
            },
            log="TBD"
        )
        return {**state, "intermediate_steps": steps + [(action, "")]}

    # safety stop
    if len(steps) >= MAX_TOOL_STEPS:
        action = AgentAction(
            tool="final_answer",
            tool_input={
                "question": question,
                "introduction": "This answer is generated for study/research purposes (not medical advice).",
                "research_steps": [f"Stopped after {MAX_TOOL_STEPS} tool steps to prevent looping."],
                "main_body": "Stopped due to loop prevention.",
                "conclusion": "Stopped early due to recursion/loop prevention.",
                "sources": []
            },
            log="TBD"
        )
        return {**state, "intermediate_steps": steps + [(action, "")]}

    # normal oracle decision
    out = oracle.invoke(state)

    if not getattr(out, "tool_calls", None):
        action = AgentAction(tool="final_answer", tool_input={
            "question": question,
            "introduction": "This answer is generated for study/research purposes (not medical advice).",
            "research_steps": ["Model did not request a tool; returning final answer."],
            "main_body": "No tool call was produced by the oracle.",
            "conclusion": "Stopped.",
            "sources": []
        }, log="TBD")
        return {**state, "intermediate_steps": steps + [(action, "")]}

    tool_name = out.tool_calls[0]["name"]
    tool_args = out.tool_calls[0].get("args", {}) or {}

    action = AgentAction(tool=tool_name, tool_input=tool_args, log="TBD")
    return {**state, "intermediate_steps": steps + [(action, "")]}

In [240]:
report = run_loop("What is the association between alcohol consumption and metabolic syndrome in Swedish studies?")
print(report)

**Question:** What is the association between alcohol consumption and metabolic syndrome in Swedish studies?

The relationship between alcohol consumption and metabolic syndrome has been a subject of interest in various studies, including those conducted in Sweden. Metabolic syndrome is a cluster of conditions that increase the risk of heart disease, stroke, and diabetes. Understanding how alcohol consumption influences this syndrome is crucial for public health.

## Research steps
- Conducted a RAG search to find relevant studies on the association between alcohol consumption and metabolic syndrome in Swedish populations.
- Used RAG search with a PMCID filter to focus on specific articles related to the query.
- Performed a web search to gather additional information from Swedish studies on the topic.

## Main body
The interaction between alcohol consumption and metabolic syndrome has been explored in several studies, including those from Sweden. A notable study by Åberg et al. (2018)

## 13. Defining the Agent State

In [241]:
from typing import TypedDict, Annotated, List
from langchain_core.agents import AgentAction
from langchain_core.messages import BaseMessage
import operator

class AgentState(TypedDict, total=False):
    input: str
    chat_history: List[BaseMessage]
    intermediate_steps: List[tuple[AgentAction, str]]

    # new flags we control (stop condition)
    have_abstract: bool
    abstract_text: str

## 14. Defining the Graph for Decision-Making

In [242]:
from langgraph.graph import StateGraph, END

graph = StateGraph(AgentState)
graph.add_node("oracle", run_oracle)
graph.add_node("rag_search", run_tool)
graph.add_node("rag_search_filter_pmc", run_tool)
graph.add_node("fetch_pmc_xml", run_tool)
graph.add_node("web_search", run_tool)
graph.add_node("final_answer", run_tool)

graph.set_entry_point("oracle")

graph.add_conditional_edges(
    "oracle",
    router,
    {
        "rag_search": "rag_search",
        "rag_search_filter_pmc": "rag_search_filter_pmc",
        "fetch_pmc_xml": "fetch_pmc_xml",
        "web_search": "web_search",
        "final_answer": "final_answer",
    },
)

graph.add_edge("rag_search", "oracle")
graph.add_edge("rag_search_filter_pmc", "oracle")
graph.add_edge("fetch_pmc_xml", "oracle")
graph.add_edge("web_search", "oracle")
graph.add_edge("final_answer", END)

app = graph.compile()

In [243]:
from IPython.display import Image, display
from langchain_core.runnables.graph import MermaidDrawMethod

print(app.get_graph().draw_mermaid())

---
config:
  flowchart:
    curve: linear
---
graph TD;
	__start__([<p>__start__</p>]):::first
	oracle(oracle)
	rag_search(rag_search)
	rag_search_filter_pmc(rag_search_filter_pmc)
	fetch_pmc_xml(fetch_pmc_xml)
	web_search(web_search)
	final_answer(final_answer)
	__end__([<p>__end__</p>]):::last
	__start__ --> oracle;
	fetch_pmc_xml --> oracle;
	oracle -.-> fetch_pmc_xml;
	oracle -.-> final_answer;
	oracle -.-> rag_search;
	oracle -.-> rag_search_filter_pmc;
	oracle -.-> web_search;
	rag_search --> oracle;
	rag_search_filter_pmc --> oracle;
	web_search --> oracle;
	final_answer --> __end__;
	classDef default fill:#f2f0ff,line-height:1.2
	classDef first fill-opacity:0
	classDef last fill:#bfb6fc



In [244]:
from mermaid import Mermaid

mermaid_text = app.get_graph().draw_mermaid()

Mermaid(mermaid_text)

In [245]:
# Run the graph with input.
output = app.invoke({
    "input": "tell me something about PMC8728420?",
    "chat_history": [],
    "intermediate_steps": [],
})

print(output)

{'input': 'tell me something about PMC8728420?', 'chat_history': [], 'intermediate_steps': [(AgentAction(tool='rag_search', tool_input={'query': 'PMC8728420'}, log='[Match 1] Score: 0.4663238525390625\nTitle: Association of Fasting C-Peptide to High Density Lipoprotein Cholesterol Ratio with Non-Alcoholic Fatty Liver Disease in Chinese Type 2 Diabetes Mellitus Patients: A Cross-Sectional Study\nPMCID: PMC12704183\nJournal: Diabetes, Metabolic Syndrome and Obesity\nPage: 10.0\nURL: https://pmc.ncbi.nlm.nih.gov/articles/PMC12704183/\nChunk: 4517\n---\n[Match 2] Score: 0.443820983171463\nTitle: UEG Week 2017 Poster Presentations\nPMCID: PMC7672678\nJournal: United European Gastroenterology Journal\nPage: 177.0\nURL: https://pmc.ncbi.nlm.nih.gov/articles/PMC7672678/\nChunk: muscular synthetic pro-fibrotic switch. Disclosure of Interest:All authors have declared no conflicts of interest. MONDAY, OCTOBER 30, 2017 09:00-17:00 OESOPHAGEAL, GASTRIC AND DUODENAL DISORDERS I - HALL 7_____________

## 15. Building a Formatted Final Report

In [246]:
def build_report(output) -> str:
    # If final_answer returns a formatted string, just return it
    if isinstance(output, str):
        return output

    # Otherwise treat it as dict (teacher style)
    research_steps = output.get("research_steps", [])
    if isinstance(research_steps, list):
        research_steps = "\n".join([f"- {r}" for r in research_steps])
    else:
        research_steps = str(research_steps)

    sources = output.get("sources", [])
    if isinstance(sources, list):
        sources = "\n".join([f"- {s}" for s in sources])
    else:
        sources = str(sources)

    return f"""**Question:** {output.get("question","")}

INTRODUCTION

{output.get("introduction","")}

RESEARCH STEPS
{research_steps}

REPORT

{output.get("main_body","")}

CONCLUSION

{output.get("conclusion","")}

SOURCES
{sources}
"""

In [247]:
action, observation = output["intermediate_steps"][-1]  # observation is dict now
print(build_report(observation))

**Question:** tell me something about PMC8728420?

PMC8728420 is a research article related to age-related macular degeneration (AMD), a common eye condition that affects the central part of the retina and can lead to vision impairment.

## Research steps
- Performed a RAG search to find information related to PMC8728420.
- Conducted a web search to gather more details about the article.

## Main body
The article associated with PMC8728420 focuses on the incidence, progression, and risk factors of age-related macular degeneration (AMD). AMD is a significant cause of vision impairment, particularly in older adults. The study aims to estimate the incidence and progression of AMD across a wide age range, providing valuable insights into how this condition develops and progresses over time. The research is crucial for understanding the factors that contribute to AMD and for developing strategies to manage and treat this condition effectively.

## Conclusion
PMC8728420 provides important in

In [248]:
from langchain_core.messages import HumanMessage

def ask_datadoc(question: str, show_outputs: bool = False):
    """
    Run the graph for one user research question and return:
      - final_report (str)
      - tool_trace (list of dicts with tool, input, output_preview)
    """
    # Initial state must match AgentState keys
    state = {
        "input": question,
        "chat_history": [HumanMessage(content=question)],  # optional but useful
        "intermediate_steps": [],
    }

    out_state = app.invoke(state)

    steps = out_state.get("intermediate_steps", [])
    if not steps:
        return "No output.", []

    # Trace: which tools were used
    trace = []
    for action, observation in steps:
        trace.append({
            "tool": getattr(action, "tool", "unknown"),
            "input": getattr(action, "tool_input", {}),
            "output_preview": (observation or "")[:300].replace("\n", " ")
        })

    # Final report is the last observation (because final_answer ends the graph)
    final_report = steps[-1][1]

    # Print nicely
    print("\n=== Tools used (in order) ===")
    for i, t in enumerate(trace, start=1):
        print(f"{i}. {t['tool']} | input={t['input']}")
        if show_outputs:
            print(f"   output: {t['output_preview']}...")
    print("============================\n")

    return final_report, trace


In [221]:
report, trace = ask_datadoc("Tell me something about PMC8728419", show_outputs=False)
print(report)


=== Tools used (in order) ===
1. rag_search_filter_pmc | input={'query': 'PMC8728419', 'pmcid': 'PMC8728419'}
2. fetch_pmc_xml | input={'pmcid': 'PMC8728419'}
3. final_answer | input={'question': 'Tell me something about PMC8728419', 'introduction': 'This answer is generated for study/research purposes (not medical advice).', 'research_steps': ['Fetched abstract from PMC via NCBI EFetch (XML).', 'Summarized the abstract.'], 'main_body': '### Abstract (retrieved)\nAcute recurrent pancreatitis (ARP) due to alcohol and/or tobacco abuse is a preventable disease which lowers quality of life and can lead to chronic pancreatitis. The REAPPEAR study aims to investigate whether a combined patient education and cessation programme for smoking and alcohol prevents ARP.\n\nThe REAPPEAR study consists of an international multicentre randomised controlled trial (REAPPEAR-T) testing the efficacy of a cessation programme on alcohol and smoking and a prospective cohort study (REAPPEAR-C) assessing the

In [249]:
report, trace = ask_datadoc("is that good to eat kiwi before sleep", show_outputs=False)
print(report)


=== Tools used (in order) ===
1. rag_search | input={'query': 'eating kiwi before sleep benefits'}
2. rag_search_filter_pmc | input={'query': 'kiwi fruit sleep benefits', 'pmcid': 'PMC6796246'}
3. web_search | input={'query': 'benefits of eating kiwi before sleep'}
4. final_answer | input={'question': 'is that good to eat kiwi before sleep', 'introduction': 'Eating kiwi before sleep has been suggested to have potential benefits for improving sleep quality. This is due to certain compounds found in the fruit that may positively affect sleep patterns.', 'research_steps': ['Conducted a RAG search to find relevant studies on the benefits of eating kiwi before sleep.', 'Filtered the search results to focus on a specific PMC article for more detailed information.', 'Performed a web search to gather additional insights from various sources.'], 'main_body': 'Kiwi fruit is known to contain several compounds that may contribute to better sleep. These include serotonin, melatonin, and antioxidan