In [21]:
import requests
import csv
import re
import argparse
import sys
from typing import List, Dict, Optional

# Constants
PUBMED_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
PHARMA_KEYWORDS = ["pharma", "biotech", "therapeutics", "laboratories", "biosciences", "inc", "corp", "ltd"]

def fetch_pubmed_papers(query: str) -> List[str]:
    """Fetch PubMed IDs based on the query."""
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": 10  # Limit to 10 for testing; increase as needed
    }
    response = requests.get(PUBMED_API_URL, params=params)
    response.raise_for_status()
    data = response.json()
    return data.get("esearchresult", {}).get("idlist", [])

def fetch_paper_details(pubmed_id: str) -> str:
    """Fetch details of a paper by PubMed ID."""
    params = {
        "db": "pubmed",
        "id": pubmed_id,
        "retmode": "xml"
    }
    response = requests.get(PUBMED_FETCH_URL, params=params)
    response.raise_for_status()
    return response.text  # Raw XML response

def extract_relevant_data(xml_data: str) -> Optional[Dict]:
    """Extract relevant fields from the paper's XML data."""
    try:
        pubmed_id = re.search(r'<PMID.*?>(\d+)</PMID>', xml_data)
        title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', xml_data)
        pub_date = re.search(r'<PubDate>(.*?)</PubDate>', xml_data)

        pubmed_id = pubmed_id.group(1) if pubmed_id else "N/A"
        title = title.group(1) if title else "N/A"
        pub_date = pub_date.group(1) if pub_date else "N/A"

        affiliations = re.findall(r'<Affiliation.*?>(.*?)</Affiliation>', xml_data, re.DOTALL)
        emails = re.findall(r'<EAddress.*?>(.*?)</EAddress>', xml_data)

        corresponding_email = emails[0] if emails else "N/A"
        
        non_academic_authors = []
        company_affiliations = []

        for affiliation in affiliations:
            if any(keyword in affiliation.lower() for keyword in PHARMA_KEYWORDS):
                company_affiliations.append(affiliation)

        if company_affiliations:
            return {
                "PubmedID": pubmed_id,
                "Title": title,
                "Publication Date": pub_date,
                "Company Affiliation(s)": ", ".join(company_affiliations),
                "Corresponding Author Email": corresponding_email
            }
    except Exception as e:
        print(f"Error processing paper: {e}")
        return None

def save_to_csv(papers: List[Dict], filename: str):
    """Save extracted paper data to a CSV file."""
    if not papers:
        print("No relevant papers found.")
        return
    
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=papers[0].keys())
        writer.writeheader()
        writer.writerows(papers)
    
    print(f"Results saved to {filename}")

def main():
    """Main function to fetch PubMed papers."""
    parser = argparse.ArgumentParser(description="Fetch PubMed papers with non-academic authors")
    parser.add_argument("query", type=str, nargs="?", help="Search query for PubMed")  # Makes query optional
    parser.add_argument("-d", "--debug", action="store_true", help="Enable debug mode")
    parser.add_argument("-f", "--file", type=str, help="Output CSV filename")
    
    # Handle command-line vs. interactive mode
    if "ipykernel" in sys.modules:  # Running in Jupyter
        user_query = input("Enter your PubMed search query: ").strip()
        if not user_query:
            print("No query provided. Exiting.")
            return
        args = parser.parse_args([user_query])  # Simulate command-line input
    else:
        args = parser.parse_args()  # Normal CLI execution

    pubmed_ids = fetch_pubmed_papers(args.query)

    results = []
    for pubmed_id in pubmed_ids:
        xml_data = fetch_paper_details(pubmed_id)
        paper_data = extract_relevant_data(xml_data)
        if paper_data:
            results.append(paper_data)

    if args.file:
        save_to_csv(results, args.file)
        print(f"Results saved to {args.file}")
    else:
        for result in results:
            print(result)

if __name__ == "__main__":
    main()




Enter your PubMed search query:  Cancer research


{'PubmedID': '40053902', 'Title': 'Prospective Evaluation of the Pathologic Diagnosis and Treatment Outcomes of Pediatric Burkitt Lymphoma in the Central American Pediatric Hematology-Oncology Association Consortium.', 'Publication Date': '<Year>2025</Year><Month>Mar</Month>', 'Company Affiliation(s)': '<Affiliation>Department of Pediatric Oncology, Hospital Saint Damien, Port-au-Prince, Haiti.', 'Corresponding Author Email': 'N/A'}
{'PubmedID': '40053901', 'Title': 'Germline Genetic Testing in Breast Cancer: Utilization and Disparities in a Middle-Income Country.', 'Publication Date': '<Year>2025</Year><Month>Mar</Month>', 'Company Affiliation(s)': '<Affiliation>Instituto Nacional do C&#xe2;ncer (INCA), Rio de Janeiro, Brazil.', 'Corresponding Author Email': 'N/A'}
{'PubmedID': '40053899', 'Title': 'Real-World Analysis Evaluating Treatment Eligibility and Outcomes in Patients With AML Receiving Intensive Chemotherapy: Insights From an Underrepresented Population.', 'Publication Date':

In [25]:
user_query = input("Enter your PubMed search query: ").strip()
if user_query:
    pubmed_ids = fetch_pubmed_papers(user_query)

    results = []
    for pubmed_id in pubmed_ids:
        xml_data = fetch_paper_details(pubmed_id)
        paper_data = extract_relevant_data(xml_data)
        if paper_data:
            results.append(paper_data)

    output_filename = "output.csv"  # Change as needed
    save_to_csv(results, output_filename)
    print(f"Results saved to {output_filename}")
else:
    print("No query provided. Exiting.")


Enter your PubMed search query:  Cancer research 


Results saved to output.csv
Results saved to output.csv
