In [1]:
from Bio import Entrez 

Entrez.email = "basemal.alqusaimi@gmail.com"

query = "sepsis AND (biomarker OR machine learning OR diagonosis)"

handle = Entrez.esearch(db="pubmed", term=query, retmax=1000)

results = Entrez.read(handle) 
handle.close()

pmid_list = results["IdList"]

print("total papers found:", results["Count"])
print("PMIDs retrieved:", len(pmid_list))
print("First 5 PMIDs:", pmid_list[:5])

total papers found: 17156
PMIDs retrieved: 1000
First 5 PMIDs: ['41676147', '41676093', '41676092', '41676054', '41675927']


In [2]:
from Bio import Medline
import time

def fetch_batch(id_list):
    """Download a batch of PubMed records and return parsed MEDLINE entries."""
    handle = Entrez.efetch(
        db="pubmed",
        id=",".join(id_list),
        rettype="medline",
        retmode="text"
    )
    records = list(Medline.parse(handle))
    handle.close()
    return records


all_records = []

batch_size = 200

for start in range(0, len(pmid_list), batch_size):
    end = start + batch_size
    batch_pmids = pmid_list[start:end]

    print(f"Fetching records {start} to {end}...")

    batch_records = fetch_batch(batch_pmids)
    all_records.extend(batch_records)

    time.sleep(0.5)  # polite pause

print("\n✅ Total records downloaded:", len(all_records))


Fetching records 0 to 200...
Fetching records 200 to 400...
Fetching records 400 to 600...
Fetching records 600 to 800...
Fetching records 800 to 1000...

✅ Total records downloaded: 1000


In [3]:
import pandas as pd

papers = []

for r in all_records:
    papers.append({
        "pmid": r.get("PMID", ""),
        "title": r.get("TI", ""),
        "year": r.get("DP", "")[:4],
        "abstract": r.get("AB", "")
    })

df = pd.DataFrame(papers)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (1000, 4)


Unnamed: 0,pmid,title,year,abstract
0,41676147,Role of cytokine levels in pathogen classifica...,2026,BACKGROUND: The pathogenic role of inflammator...
1,41676093,Early risk stratification of sepsis-related li...,2026,BACKGROUND: Sepsis-related liver injury (SRLI)...
2,41676092,The endothelial activation and stress index as...,2026,BACKGROUND: The Endothelial Activation and Str...
3,41676054,Use of LVAD HeartMate 3 and Impella RP in the ...,2026,Heart failure (HF) is a leading cause of morbi...
4,41675927,Analysis of prognostic risk factors and risk m...,2025,OBJECTIVE: Through the machine learning Least ...


In [4]:
print("Missing abstracts:", df["abstract"].isna().sum())

df = df.dropna(subset=["abstract"])
df = df[df["abstract"].str.strip() != ""]

print("After removing missing abstracts:", df.shape)


Missing abstracts: 0
After removing missing abstracts: (971, 4)


In [5]:
df = df[df["year"].str.isdigit()]
df["year"] = df["year"].astype(int)

print("Year range:", df["year"].min(), "-", df["year"].max())


Year range: 2025 - 2026


In [6]:
df = df.drop_duplicates(subset=["pmid"])

print("After removing duplicates:", df.shape)


After removing duplicates: (971, 4)


In [7]:
df.to_csv("../data/sepsis_master.csv", index=False)

print(" Saved master dataset to data/sepsis_master.csv")


 Saved master dataset to data/sepsis_master.csv


In [8]:
print("Total papers:", len(df))
print("Papers per year (top 5):")
print(df["year"].value_counts().head())


Total papers: 971
Papers per year (top 5):
year
2025    713
2026    258
Name: count, dtype: int64
