In [19]:
# Standard library
import json
import os
import re
import time
from pathlib import Path
from typing import Optional

# Third-party (original relative order preserved)
import pandas as pd
from llama_index.core import Document, SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.settings import Settings
from llama_index.core.chat_engine import CondenseQuestionChatEngine
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.llms.llama_cpp import LlamaCPP
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import faiss
from Bio import Entrez
import xmltodict

In [None]:
# --- Notebook configuration ---------------------------------------------------
# Machine-specific values can be overridden through environment variables so the
# notebook runs on another machine without editing this cell. The defaults are
# the values this notebook was originally written with.

# Path of the BioASQ training JSON that the loading cell opens.
training_dataset_path = os.environ.get(
    "BIOASQ_TRAINING_JSON",
    r"E:\RAG_Models\BioASQ-training13b\training13b.json",
)
# Directory holding one cached "<pmid>.json" file per fetched abstract.
abstract_cache = Path(r".\Abstracts")
# Contact address that is assigned to Entrez.email further down.
email = os.environ.get("ENTREZ_EMAIL", "an80@illinois.edu")

In [21]:
# Create the cache directory up front. parents=True also creates any missing
# parent directories, and exist_ok=True makes re-running this cell a no-op.
abstract_cache.mkdir(parents=True, exist_ok=True)

In [14]:
# Set Biopython's Entrez contact address before any Entrez request is issued
# (batch_fetch_pmids below calls Entrez.efetch).
Entrez.email = email

In [6]:
# Load the training dataset. The encoding is given explicitly because, without
# it, open() falls back to the platform's locale encoding, which is not UTF-8
# on a default Windows setup and can fail on or corrupt non-ASCII text.
with open(training_dataset_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)

In [7]:
def _as_items(value):
    """Normalise a JSON field to a list: None -> [], a lone string -> [string], else list(value)."""
    if value is None:
        return []
    if isinstance(value, str):
        # A string must be kept whole; list.extend() on a str adds one element per character.
        return [value]
    return list(value)


# Gather document URLs, question texts and ideal answers from the first 100 questions.
document_urls = []
questions = []
ground_truth = []
for data in json_data['questions'][:100]:
    # Missing keys yield None from .get(); _as_items turns that into an empty list
    # instead of letting extend(None) raise TypeError.
    document_urls.extend(_as_items(data.get('documents')))
    questions.extend(_as_items(data.get("body")))
    # NOTE(review): ground_truth stays a flat list, so it is not index-aligned with
    # `questions` whenever a question carries more than one ideal answer.
    ground_truth.extend(_as_items(data.get("ideal_answer")))


In [9]:
print(f'Checking for duplicate documents. Original number of documents: {len(document_urls)}')
# dict.fromkeys() drops repeats while keeping first-seen order, so the resulting
# list (and therefore the fetch batches built from it) is the same on every run.
# list(set(...)) orders strings by a per-process randomised hash.
document_urls = list(dict.fromkeys(document_urls))
print(f'Number of documents after filtering: {len(document_urls)}')

Checking for duplicate documents. Original number of documents: 1142
Number of documents after filtering: 1136


In [10]:
# Matches "/pubmed/" followed by one or more digits; group 1 captures the digits (the PMID).
PMID_RE = re.compile(r"/pubmed/(\d+)")

In [11]:
def pmid_from_url(url: str) -> Optional[str]:
    """Extract the PubMed ID from a document URL.

    Args:
        url: URL expected to contain ``/pubmed/<digits>``.

    Returns:
        The digit string captured by ``PMID_RE``, or ``None`` when the URL
        does not contain that pattern.
    """
    match = PMID_RE.search(url)
    if match is None:
        return None
    return match.group(1)

In [12]:
# Keep only URLs that actually yielded a PMID: pmid_from_url returns None for a
# URL without "/pubmed/<digits>", and a None entry would break the comma-joined
# id list sent to efetch. dict.fromkeys() also collapses URLs that map to the
# same PMID while keeping first-seen order.
pmids = list(dict.fromkeys(
    pmid for pmid in (pmid_from_url(doc) for doc in document_urls) if pmid
))

In [None]:
def _as_list(node):
    """Normalise an xmltodict child to a list (a lone element is parsed as a bare value, not a list)."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _node_text(node):
    """Return the text of an xmltodict node: a str, a dict carrying '#text', a list of those, or None."""
    if node is None:
        return ""
    if isinstance(node, dict):
        # Elements with attributes (e.g. a labelled AbstractText) keep their text under "#text".
        return str(node.get("#text") or "")
    if isinstance(node, (list, tuple)):
        return " ".join(part for part in (_node_text(item) for item in node) if part)
    return str(node)


def _parse_pubmed_article(art):
    """Return (pmid, {'title','abstract','year'}) for one PubmedArticle dict, or None if it has no PMID or no abstract."""
    if not isinstance(art, dict):
        return None
    citation = art.get("MedlineCitation") or {}
    art_info = citation.get("Article") or {}

    pmid = _node_text(citation.get("PMID")).strip()
    abstract = _node_text((art_info.get("Abstract") or {}).get("AbstractText")).strip()
    if not pmid or not abstract:
        return None

    pub_date = ((art_info.get("Journal") or {}).get("JournalIssue") or {}).get("PubDate") or {}
    year = _node_text(pub_date.get("Year")).strip()
    if not year:
        # Some records carry a free-form MedlineDate instead of Year; use its first 4-digit run.
        year_match = re.search(r"\d{4}", _node_text(pub_date.get("MedlineDate")))
        year = year_match.group(0) if year_match else "Unknown"

    title = _node_text(art_info.get("ArticleTitle"))
    return pmid, {"title": title, "abstract": abstract, "year": year}


def batch_fetch_pmids(pmids, batch=200):
    """Return {pmid: {'title','abstract','year'}}; caches and skips empty abstracts.

    PMIDs that already have a ``<pmid>.json`` file in ``abstract_cache`` are read
    from disk; the rest are requested from PubMed through ``Entrez.efetch`` in
    groups of at most ``batch`` ids, and every article that comes back with a
    non-empty abstract is written to the cache. Articles without an abstract
    are not cached, so they are requested again on the next call.

    Args:
        pmids: Iterable of PMIDs. Falsy entries (e.g. ``None``) are ignored and
            repeated ids are processed once.
        batch: Maximum number of ids handled per chunk; must be >= 1.

    Returns:
        Dict mapping each PMID that has a cached, non-blank abstract to its
        ``{'title', 'abstract', 'year'}`` record.

    Raises:
        ValueError: If ``batch`` is smaller than 1.
    """
    if batch < 1:
        raise ValueError("batch must be a positive integer")

    ids = list(dict.fromkeys(p for p in pmids if p))
    out = {}
    for start in range(0, len(ids), batch):
        chunk = ids[start:start + batch]
        need = [p for p in chunk if not (abstract_cache / f"{p}.json").exists()]
        if need:
            handle = Entrez.efetch(
                db="pubmed",
                id=",".join(map(str, need)),
                rettype="abstract",
                retmode="xml"
            )
            try:
                raw_xml = handle.read()
            finally:
                handle.close()  # release the HTTP response even if read() fails

            article_set = (xmltodict.parse(raw_xml) or {}).get("PubmedArticleSet") or {}
            # _as_list covers both the single-article (dict) and the no-article (missing key) replies.
            for art in _as_list(article_set.get("PubmedArticle")):
                parsed = _parse_pubmed_article(art)
                if parsed is None:   # no PMID or empty abstract: nothing to cache
                    continue
                pmid, meta = parsed
                (abstract_cache / f"{pmid}.json").write_text(json.dumps(meta), encoding="utf-8")

            time.sleep(0.4)  # politeness: pause only after an actual request

        # load all cached (existing + newly saved)
        for pmid in chunk:
            fp = abstract_cache / f"{pmid}.json"
            if fp.exists():
                meta = json.loads(fp.read_text(encoding="utf-8"))
                if meta.get("abstract", "").strip():
                    out[pmid] = meta
    return out

In [24]:
# Fetch (or load from the on-disk cache) the title/abstract/year record for every collected PMID.
data_map = batch_fetch_pmids(pmids)

In [30]:
# Read the abstracts back from the directory the fetch step caches into.
# Deriving it from abstract_cache (instead of repeating the literal) keeps the
# two locations from drifting apart.
source_directory = str(abstract_cache)
# File name of the CSV export written to the current working directory.
out_csv = "pubmed_abstracts.csv"

In [42]:
# Start from an empty frame whose columns are the three fields stored in each cached abstract file.
abstracts_df = pd.DataFrame(columns=["title", "abstract", "year"])

In [43]:
# Build the abstracts table from the cached "<pmid>.json" files.
# - Rows are collected in a list and turned into a DataFrame once, instead of
#   growing the frame one .loc assignment at a time.
# - The frame is rebuilt from scratch, so re-running this cell does not append
#   duplicate rows.
# - The parsed file goes into a local name (`cached`) so the BioASQ dataset held
#   in `json_data` is not overwritten.
# - Files are visited in sorted order so the row order is reproducible.
records = []
for cache_file in sorted(Path(source_directory).glob("*.json")):
    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            cached = json.load(f)
        records.append({
            "title": cached.get("title"),
            "abstract": cached.get("abstract"),
            "year": cached.get("year"),
        })
    except (OSError, ValueError, AttributeError) as e:
        # Best effort: report the unreadable or malformed file and continue with the rest.
        print(f"Error occurred while processing file {cache_file.name}: {e}")

abstracts_df = pd.DataFrame(records, columns=["title", "abstract", "year"])

In [45]:
def dataframe_to_documents(df):
    """Turn each row of ``df`` into a ``Document``.

    The row's ``"abstract"`` value becomes the document text; ``"title"`` and
    ``"year"`` are copied into the metadata, falling back to ``""`` when the
    frame has no such column.

    Args:
        df: DataFrame with an ``"abstract"`` column and, optionally,
            ``"title"`` and ``"year"`` columns.

    Returns:
        List of ``Document`` objects, one per row, in row order.
    """
    return [
        Document(
            text=record["abstract"],
            metadata={
                "title": record.get("title", ""),
                "year": record.get("year", ""),
            },
        )
        for _index, record in df.iterrows()
    ]

In [48]:
# Write the abstracts table to <current working directory>/<out_csv>, without the index column.
# NOTE: despite its name, target_directory holds the CSV file path, not a directory.
target_directory = os.path.join(os.getcwd(), out_csv)
abstracts_df.to_csv(target_directory, index=False)

In [46]:
# Convert every row of the abstracts table into a Document (text = abstract, metadata = title/year).
documents = dataframe_to_documents(abstracts_df)

In [47]:
# Show the count and a small sample only; echoing the whole list prints the
# full text of every abstract into the notebook output.
print(f"{len(documents)} documents")
documents[:3]

[Document(id_='0dd7c636-6788-428f-8252-6deaa07c427e', embedding=None, metadata={'title': 'A single-blind, placebo-controlled trial of a simple acupuncture treatment in the cessation of smoking.', 'year': '1998'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Tobacco smoking is a major cause of preventable disease and premature death. Physicians should play an active role in the control of smoking by encouraging cessation and helping the smoker to choose the most suitable aid to cessation. To evaluate a simple, ear acupuncture treatment for the cessation of smoking. Randomized, single-blind, placebo-controlled trial of 78 currently smoking volunteers from the general public. Volunteers attended an acupuncture clinic in a general practice setting and were given a single treatment of electroacupuncture using two needles at either an