In [1]:
!pip install requests pandas beautifulsoup4



In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
from typing import List, Dict

In [3]:
def is_non_academic(affiliation: str) -> bool:
    academic_keywords = ["university", "college", "institute", "school", "hospital", "lab", "centre", "center"]
    return not any(word in affiliation.lower() for word in academic_keywords)

In [4]:
def is_company_affiliation(affiliation: str) -> bool:
    company_keywords = ["pharma", "therapeutics", "biotech", "inc", "ltd", "corp", "gmbh", "llc"]
    return any(word in affiliation.lower() for word in company_keywords)

In [5]:
def fetch_pubmed_ids(query: str, max_results: int = 20) -> List[str]:
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json"
    }
    api_response_obj = requests.get(url, params=params)
    api_response_obj.raise_for_status()
    paper_ids = api_response_obj.json()['esearchresult']['idlist']
    return paper_ids

In [6]:
def fetch_paper_details(paper_ids: List[str]) -> List[Dict]:
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    ids_str = ",".join(paper_ids)
    params = {
        "db": "pubmed",
        "id": ids_str,
        "retmode": "xml"
    }

    time.sleep(0.5)

    api_response_obj = requests.get(base_url, params=params)
    api_response_obj.raise_for_status()

    xml_data = BeautifulSoup(api_response_obj.text,"xml")
    paper_list = []

    for article in xml_data.find_all("PubmedArticle"):
        paper_entry = {}
        paper_entry["PubmedID"] = article.find("PMID").text if article.find("PMID") else "N/A"
        paper_entry["Title"] = article.ArticleTitle.text if article.ArticleTitle else "N/A"

        pub_date = article.find("PubDate")
        if pub_date and pub_date.find("Year"):
            paper_entry["Publication Date"] = pub_date.find("Year").text
        else:
            paper_entry["Publication Date"] = "N/A"

        non_academic_authors = []
        company_affiliations = []
        author_email = None

        for author in article.find_all("Author"):
            aff_info = author.find("AffiliationInfo")
            if aff_info and aff_info.Affiliation:
                aff_line = aff_info.Affiliation.text

                if is_non_academic(aff_line):
                    last_name = author.find("LastName")
                    if last_name:
                        non_academic_authors.append(last_name.text)

                if is_company_affiliation(aff_line):
                    company_affiliations.append(aff_line)

                if not author_email:
                    email_matched = re.search(r'[\w\.-]+@[\w\.-]+', aff_line)
                    if email_matched:
                        author_email = email_matched.group(0)

        paper_entry["Non-academic Author(s)"] = ", ".join(non_academic_authors) if non_academic_authors else "None"
        paper_entry["Company Affiliation(s)"] = ", ".join(company_affiliations) if company_affiliations else "None"
        paper_entry["Corresponding Author Email"] = author_email or "Not found"
        paper_list.append(paper_entry)

    return paper_list

def search_and_save(query: str, output_file: str = "pubmed_results.csv", max_results: int = 20):
    paper_ids = fetch_pubmed_ids(query, max_results)
    paper_list = fetch_paper_details(paper_ids)
    df = pd.DataFrame(paper_list)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} results to {output_file}")
    return df

In [7]:
#example run you can change the query value, file name and number of results
query = "thyroid"
df = search_and_save(query,"thyroid_papers.csv", max_results=25)
df

Saved 25 results to thyroid_papers.csv


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Author(s),Company Affiliation(s),Corresponding Author Email
0,40684066,Thermal ablation for radioactive iodine refrac...,2025,,"Department of Medical Ultrasound, Yantai Hospi...",liangping301@126.com.
1,40683960,Prediction of birthweight with early and mid-p...,2025,,,krishnaraj.chadaga@manipal.edu.
2,40683927,Integrative bioinformatics analysis identifies...,2025,,,Xiaobinlin006@163.com.
3,40683921,Impact of nitrosative stress and endothelial d...,2025,,,angelika.buczynska@umb.edu.pl.
4,40683612,Ubiquitin-specific peptidase 53 suppresses the...,2025,,,aihanaiheng@163.com.
5,40683229,Thyroid disruption and the association with mu...,2025,Zhang,"Hangzhou Hanlang Environmental Technology Co.,...",chaoxu@zjut.edu.cn.
6,40683206,Annotating risk stratification of thyroid nodu...,2025,,,ilker.sengul.52@gmail.com.
7,40682942,A sweat sensor for simultaneous detection of m...,2025,,State Key Laboratory of Electroanalytical Chem...,qiang.zhang@ciac.ac.cn.
8,40682758,Large language models: unlocking new potential...,2025,,,Cookies_white@outlook.com.
9,40682735,Mediating factors between autoimmune thyroidit...,2025,,,jiangnan6000@126.com.
