In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import time,re
import pandas as pd

In [2]:
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [5]:
# search_categories = {
#     "Medical Research": ["clinical trial", "medical research", "peer-reviewed", "medical study", "cohort analysis"],
#     "Treatments": ["drug treatment", "therapy", "vaccine","treatment plan"],
#     "Disease Outbreaks": ["pandemic", "epidemic", "infectious disease"],
#     "Technology Advancements": ["AI in healthcare", "telemedicine", "robotic surgery", "medical devices"],
#     "Healthcare Investments": ["healthcare startup", "health investment", "biotech funding"]
# }

search_categories = {
    "Medical Research": [
        "clinical trial", "systematic review", "medical study", "observational study", "case-control study"
    ],
    "Treatments": [
        "drug therapy", "immunotherapy", "gene therapy", "precision medicine", "vaccine", "chemotherapy"
    ],
    "Disease Outbreaks": [
        "pandemic", "epidemic", "emerging infectious diseases", "zoonotic diseases", "public health emergency"
    ],
    "Technology Advancements": [
        "AI in healthcare", "robotic surgery", "telemedicine", "wearable medical devices", "digital health", "virtual care"
    ],
    "Healthcare Investments": [
        "healthcare startup", "biotech investment", "health innovation funding", "venture capital health", "digital health investment"
    ],
    "Chronic Diseases": [
        "diabetes management", "chronic heart failure", "chronic kidney disease", "COPD", "arthritis"
    ],
    "Mental Health": [
        "depression treatment", "mental health crisis", "anxiety disorder", "bipolar disorder", "teletherapy"
    ],
    "Genomics & Precision Medicine": [
        "CRISPR", "genome sequencing", "pharmacogenomics", "cancer genomics", "personalized medicine"
    ],
    "Public Health & Policy": [
        "health equity", "social determinants of health", "healthcare access", "universal health coverage", "health policy reform"
    ],
    "Environmental & Climate Health": [
        "climate change and health", "air pollution exposure", "environmental toxins", "heatwaves", "vector-borne disease"
    ]
}


def parse_citation(citation):
    journal = re.match(r"^([^.]+)\.", citation)
    journal = journal.group(1).strip() if journal else ""

    date = re.search(r"(\d{4})\s+([A-Za-z]{3})", citation)
    year = date.group(1) if date else ""
    month = date.group(2) if date else ""

    volume_pages = re.search(r";(\d+):([\d\-–]+)", citation)
    volume = volume_pages.group(1) if volume_pages else ""
    
    doi = re.search(r"doi:\s*([\w./-]+)", citation)
    doi = doi.group(1) if doi else ""

    return journal, year, month, volume, doi



max_pages = 3
all_data = []

for category, keywords in search_categories.items():
    for keyword in keywords:
        print(f"\n🔍 Scraping '{keyword}' under category '{category}'")
        base_url = f"https://pubmed.ncbi.nlm.nih.gov/?term={keyword.replace(' ', '+')}"
        driver.get(base_url)
        time.sleep(2)

        current_page = 1
        while current_page <= max_pages:
            print(f"  📄 Page {current_page}")
            time.sleep(2)

            articles = driver.find_elements(By.CLASS_NAME, 'docsum-content')
            for article in articles:
                try:
                    title = article.find_element(By.CLASS_NAME, 'docsum-title').text
                    citation = article.find_element(By.CLASS_NAME, 'docsum-journal-citation').text
                    journal, year, month, volume, doi = parse_citation(citation)
                    all_data.append({
                        "Category": category,
                        "Keyword": keyword,
                        "Title": title,
                        "Citation": citation,
                        "Journal": journal,
                        "Year": year,
                        "Month": month,
                        "Volume": volume,
                        "DOI": doi
                    })
                except Exception as e:
                    print("    ❌ Error extracting article:", e)

            try:
                # Find and click the next-page button
                next_button = driver.find_element(By.CLASS_NAME, 'next-page-btn')
                driver.execute_script("arguments[0].click();", next_button)
                current_page += 1
                time.sleep(2)
            except NoSuchElementException:
                print("    🚫 No more pages.")
                break




🔍 Scraping 'clinical trial' under category 'Medical Research'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'systematic review' under category 'Medical Research'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'medical study' under category 'Medical Research'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'observational study' under category 'Medical Research'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'case-control study' under category 'Medical Research'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'drug therapy' under category 'Treatments'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'immunotherapy' under category 'Treatments'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'gene therapy' under category 'Treatments'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'precision medicine' under category 'Treatments'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'vaccine' under category 'Treatments'
  📄 Page 1
  📄 Page 2
  📄 Page 3

🔍 Scraping 'chemotherapy' under category 'Treatments'
  📄 Page 1
 

In [13]:
driver.quit()


In [14]:
df = pd.DataFrame(all_data)

In [17]:
df.shape

(1560, 9)

In [19]:
df.head()

Unnamed: 0,Category,Keyword,Title,Citation,Journal,Year,Month,Volume,DOI
0,Medical Research,clinical trial,An introduction to clinical trial design.,Paediatr Respir Rev. 2019 Nov;32:30-35. doi: 1...,Paediatr Respir Rev,2019,Nov,32.0,10.1016/j.prrv.2019.06.002.
1,Medical Research,clinical trial,Clinical Trial Considerations in Neuro-oncology.,Curr Treat Options Oncol. 2021 Jul 2;22(9):78....,Curr Treat Options Oncol,2021,Jul,,10.1007/s11864-021-00875-8.
2,Medical Research,clinical trial,Key concepts of clinical trials: a narrative r...,Postgrad Med. 2011 Sep;123(5):194-204. doi: 10...,Postgrad Med,2011,Sep,,10.3810/pgm.2011.09.2475.
3,Medical Research,clinical trial,Organizing a clinical trial for the new invest...,Urol Oncol. 2019 May;37(5):336-339. doi: 10.10...,Urol Oncol,2019,May,,10.1016/j.urolonc.2017.12.017.
4,Medical Research,clinical trial,SPIRIT 2013 statement: defining standard proto...,Ann Intern Med. 2013 Feb 5;158(3):200-7. doi: ...,Ann Intern Med,2013,Feb,,10.7326/0003-4819-158-3-201302050-00583.


In [21]:
df.to_csv("pubmed_categorized_data.csv", index=False)
print(f"\n✅ Done. {len(df)} articles scraped and saved.")


✅ Done. 1560 articles scraped and saved.
