In [1]:
from Bio import Entrez
# Contact address set on the Bio.Entrez module; presumably Biopython attaches
# it to every Entrez request made by the cells below — confirm against the
# Biopython documentation.
# NOTE(review): this is a personal address stored in the notebook itself;
# consider loading it from configuration before sharing the notebook.
Entrez.email = "basemal.alqusaimi@gmail.com"

# Publication-year windows, each an inclusive (first_year, last_year) pair of
# strings. One search is run per window so that every period contributes to
# the corpus. Note that the last window spans six years, the others five.
time_blocks = [
    ("2000", "2004"), ("2005", "2009"), ("2010", "2014"),
    ("2015", "2019"), ("2020", "2025"),
]

# Boolean search expression shared by every window; the date restriction is
# appended per window where the search is issued.
base_query = "sepsis AND (biomarker OR machine learning OR diagnosis)"


In [2]:
# Collect PubMed IDs window by window so each period in `time_blocks`
# contributes at most `max_per_block` papers to the corpus.
all_pmids = []
max_per_block = 400  # passed as esearch `retmax`: cap on PMIDs kept per window

for start, end in time_blocks:
    # Restrict the shared query to this window's publication dates.
    query = f'{base_query} AND ("{start}"[PDAT] : "{end}"[PDAT])'

    handle = Entrez.esearch(
        db="pubmed",
        term=query,
        retmax=max_per_block,
    )
    try:
        results = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    pmids = results["IdList"]
    all_pmids.extend(pmids)

    print(f"{start}-{end}: retrieved {len(pmids)} papers")

    # "Count" is the number of matches for the whole query; when it exceeds
    # what was returned, the window was truncated by `retmax` and the kept
    # PMIDs are only the head of the result list, not a random sample.
    total_matches = int(results["Count"])
    if total_matches > len(pmids):
        print(
            f"  note: {total_matches} papers matched; "
            f"only the first {len(pmids)} were kept"
        )

print("\nTotal PMIDs collected:", len(all_pmids))


2000-2004: retrieved 400 papers
2005-2009: retrieved 400 papers
2010-2014: retrieved 400 papers
2015-2019: retrieved 400 papers
2020-2025: retrieved 400 papers

Total PMIDs collected: 2000


In [3]:
# Drop repeated PMIDs while keeping first-seen order. A set would also remove
# duplicates, but its iteration order for strings can differ between kernel
# sessions, which would change batch composition and row order downstream.
all_pmids = list(dict.fromkeys(all_pmids))
print("Unique PMIDs:", len(all_pmids))



Unique PMIDs: 2000


In [4]:
from Bio import Medline
import time

def fetch_batch(id_list):
    """Download MEDLINE-format records for a batch of PubMed IDs.

    Parameters
    ----------
    id_list : sequence
        PubMed IDs to fetch. Items are converted with ``str`` before being
        joined into the comma-separated ``id`` argument.

    Returns
    -------
    list
        The records produced by ``Medline.parse`` for the batch, or an empty
        list when ``id_list`` is empty (no request is made in that case).
    """
    ids = [str(pmid) for pmid in id_list]
    if not ids:
        # Nothing to ask for; avoid sending a request with an empty id.
        return []

    handle = Entrez.efetch(
        db="pubmed",
        id=",".join(ids),
        rettype="medline",
        retmode="text"
    )
    try:
        # Materialise the parser output before the handle is closed.
        return list(Medline.parse(handle))
    finally:
        # Close the handle even if parsing fails part-way through.
        handle.close()

# Download every collected PMID in fixed-size batches, pausing between
# requests, and accumulate the parsed records in `all_records`.
batch_size = 200
pmid_list = all_pmids
all_records = []

for offset in range(0, len(pmid_list), batch_size):
    stop = offset + batch_size
    print(f"Fetching {offset}–{stop}...")

    # Slicing past the end of the list is safe: the last batch is just shorter.
    all_records.extend(fetch_batch(pmid_list[offset:stop]))

    # Short pause between consecutive requests.
    time.sleep(0.5)

print("Total records downloaded:", len(all_records))


Fetching 0–200...
Fetching 200–400...
Fetching 400–600...
Fetching 600–800...
Fetching 800–1000...
Fetching 1000–1200...
Fetching 1200–1400...
Fetching 1400–1600...
Fetching 1600–1800...
Fetching 1800–2000...
Total records downloaded: 2000


In [5]:
import pandas as pd

# Flatten each downloaded record into the four fields kept for analysis.
# Missing fields default to "" so the string operations below never see None.
papers = [
    {
        "pmid": r.get("PMID", ""),
        "title": r.get("TI", ""),
        "year": r.get("DP", "")[:4],  # first four characters of the DP field
        "abstract": r.get("AB", ""),
    }
    for r in all_records
]

# Naming the columns explicitly keeps the frame well-formed (and the filters
# below working) even when no records were downloaded.
df = pd.DataFrame(papers, columns=["pmid", "title", "year", "abstract"])

df = df.dropna(subset=["abstract"])

# Keep rows that have a non-blank abstract and an all-digit year.
keep = (df["abstract"].str.strip() != "") & df["year"].str.isdigit()

# `.astype` returns a new frame, so the year conversion never writes into a
# filtered slice (which is what triggers pandas' SettingWithCopyWarning).
df = (
    df.loc[keep]
    .astype({"year": int})
    .drop_duplicates(subset=["pmid"])
)

# NOTE(review): the saved output of this cell reported years outside the
# searched 2000-2025 windows (1993 - 2026); the DP-derived year evidently does
# not always fall inside the [PDAT] window — confirm whether such rows should
# be filtered out.
print("Final dataset shape:", df.shape)
print("Year range:", df["year"].min(), "-", df["year"].max())


Final dataset shape: (1891, 4)
Year range: 1993 - 2026


In [6]:
# Persist the cleaned dataset. The target directory is created first so the
# write does not fail on a fresh checkout where ../data does not exist yet.
from pathlib import Path

Path("../data").mkdir(parents=True, exist_ok=True)
df.to_csv("../data/sepsis_master_balanced.csv", index=False)
print(" Saved balanced dataset")


 Saved balanced dataset


SyntaxError: invalid character '–' (U+2013) (317239005.py, line 1)