In [1]:
import requests
import xml.etree.ElementTree as ET
import time
import os
from datetime import datetime
import csv
from collections import Counter
import matplotlib.pyplot as plt

def search_pubmed(query, retmax=100, retstart=0, timeout=30):
    """Run one PubMed ESearch request and return a page of matching PMIDs.

    Args:
        query: Search term, passed to E-utilities as the ``term`` parameter.
        retmax: Maximum number of IDs to return in this page.
        retstart: Zero-based offset of the first ID to return.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A tuple ``(ids, count)`` where ``ids`` is the list of PMID strings in
        this page and ``count`` is the total number of matches reported by
        the server.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the response carries no usable ``<Count>`` element.
    """
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Passing the query through ``params`` lets requests percent-encode it, so
    # terms containing '&', '#', '+' or quotes no longer corrupt the URL.
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": retmax,
        "retstart": retstart,
        "retmode": "xml",
    }
    response = requests.get(search_url, params=params, timeout=timeout)
    response.raise_for_status()
    root = ET.fromstring(response.content)

    count_elem = root.find(".//Count")
    if count_elem is None or count_elem.text is None:
        # NOTE(review): E-utilities appears to report failures in an <ERROR>
        # element; fall back to a generic message if it is absent.
        detail = root.findtext(".//ERROR") or "response contained no <Count> element"
        raise ValueError(f"PubMed search failed: {detail}")

    count = int(count_elem.text)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]

    return ids, count

def fetch_full_text(pmid, timeout=30):
    """Download the PubMed XML record for a single PMID via EFetch.

    Despite the name, this requests the ``pubmed`` database record as XML,
    not the article's full text.

    Args:
        pmid: PubMed identifier of the record to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status (previously an error page would have been
            returned as if it were record XML).
    """
    fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": pmid, "rettype": "xml", "retmode": "xml"}
    response = requests.get(fetch_url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.text

def get_all_articles(keyword):
    """Collect every PMID matching *keyword* by paging through ESearch.

    Pages of up to 4000 IDs are requested until the number collected reaches
    the total reported by the server, or until a page comes back empty.

    Args:
        keyword: Search term forwarded to ``search_pubmed``.

    Returns:
        A list of PMID strings in the order the server returned them.
    """
    retmax = 4000
    retstart = 0
    all_ids = []

    while True:
        ids, total_count = search_pubmed(keyword, retmax, retstart)
        all_ids.extend(ids)

        print(f"Retrieved {len(all_ids)} of {total_count} articles")

        if len(all_ids) >= total_count:
            break

        if not ids:
            # An empty page means further requests cannot make progress;
            # without this guard the loop would never terminate.
            # NOTE(review): NCBI documents a cap on how many PubMed records
            # ESearch will return for one query -- confirm against the docs.
            print("No further IDs returned by the server; stopping early.")
            break

        retstart += retmax
        time.sleep(0.34)  # To respect NCBI's rate limit of 3 requests per second

    return all_ids

def create_folder(keyword):
    """Create a timestamped output directory for *keyword* and return its name.

    The directory is named ``pubmed_<keyword>_<YYYYmmdd_HHMMSS>`` with every
    space in the keyword replaced by an underscore, and is created relative
    to the current working directory.
    """
    # Splitting on a literal space and re-joining swaps each space for '_'.
    slug = "_".join(keyword.split(" "))
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    folder_name = "pubmed_" + slug + "_" + stamp
    os.makedirs(folder_name, exist_ok=True)
    return folder_name

def _flatten_text(element):
    """Return all text inside *element*, including text within nested tags."""
    if element is None:
        return ""
    # itertext() also yields text inside inline markup (<i>, <sub>, ...) that
    # ``.text`` alone would drop; split/join collapses runs of whitespace.
    return " ".join("".join(element.itertext()).split())


def parse_xml(xml_string):
    """Extract summary fields from a PubMed EFetch XML document.

    Args:
        xml_string: XML text containing (at least) one article record.

    Returns:
        A dict with the keys ``"PMID"``, ``"Title"``, ``"Abstract"``,
        ``"Journal"`` and ``"Year"``. Fields missing from the record are
        returned as ``""``, except ``"Year"`` which falls back to ``"N/A"``.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_string* is not valid XML.
    """
    import re  # local import: only needed for the MedlineDate fallback

    root = ET.fromstring(xml_string)
    article = root.find(".//PubmedArticle")
    if article is None:
        # No <PubmedArticle> wrapper: search the whole document instead of
        # failing with AttributeError on the first lookup.
        article = root

    pmid = _flatten_text(article.find(".//PMID"))
    title = _flatten_text(article.find(".//ArticleTitle"))

    # Structured abstracts have several <AbstractText> sections; keep all of
    # them rather than only the first.
    sections = (_flatten_text(part) for part in article.findall(".//Abstract/AbstractText"))
    abstract = " ".join(section for section in sections if section)

    journal = _flatten_text(article.find(".//Journal/Title"))

    year = "N/A"
    pub_date = article.find(".//PubDate")
    if pub_date is not None:
        year_text = _flatten_text(pub_date.find("Year"))
        if year_text:
            year = year_text
        else:
            # Some records give a free-form <MedlineDate> instead of <Year>;
            # take the first four-digit year found in it.
            match = re.search(r"\b(?:18|19|20)\d{2}\b", _flatten_text(pub_date.find("MedlineDate")))
            if match:
                year = match.group(0)

    return {
        "PMID": pmid,
        "Title": title,
        "Abstract": abstract,
        "Journal": journal,
        "Year": year
    }

def categorize_study(text):
    """Classify a study from its text by case-insensitive substring matching.

    Args:
        text: Text to scan (callers pass the title and abstract combined).

    Returns:
        A tuple ``(methods, study_type)``. ``methods`` lists every method
        label with at least one matching term, in the order the labels are
        declared below. ``study_type`` is ``'Empirical'`` when more empirical
        than theoretical terms occur in the text, otherwise ``'Theoretical'``
        (ties, including zero matches on both sides, are ``'Theoretical'``).
    """
    lowered = text.lower()

    def hits(terms):
        # Number of distinct terms that occur anywhere in the lowered text.
        return sum(1 for term in terms if term in lowered)

    terms_by_method = {
        'Survey': ['survey', 'questionnaire', 'interview'],
        'Observational': ['cohort', 'case-control', 'cross-sectional', 'observational'],
        'Experimental': ['randomized', 'controlled trial', 'intervention'],
        'Review': ['systematic review', 'meta-analysis', 'literature review'],
        'Qualitative': ['qualitative', 'focus group', 'ethnography', 'phenomenological'],
        'Mixed Methods': ['mixed method', 'multi-method'],
    }

    matched_methods = []
    for label, terms in terms_by_method.items():
        if hits(terms):
            matched_methods.append(label)

    empirical_hits = hits(['data', 'analysis', 'sample', 'participant', 'result', 'finding'])
    theoretical_hits = hits(['framework', 'concept', 'theory', 'philosophical', 'ethics', 'moral'])

    if empirical_hits > theoretical_hits:
        study_type = 'Empirical'
    else:
        study_type = 'Theoretical'

    return matched_methods, study_type

def analyze_articles(articles):
    """Tally study methods and study types over a collection of articles.

    Args:
        articles: Iterable of dicts that each provide ``'Title'`` and
            ``'Abstract'`` entries.

    Returns:
        A tuple ``(method_counts, type_counts)`` of ``Counter`` objects.
        An article matching no method is counted under ``'Unclassified'``.
    """
    method_counts, type_counts = Counter(), Counter()

    for entry in articles:
        combined = f"{entry['Title']} {entry['Abstract']}"
        found_methods, kind = categorize_study(combined)

        # An empty method list falls back to the single 'Unclassified' label.
        method_counts.update(found_methods or ['Unclassified'])
        type_counts.update([kind])

    return method_counts, type_counts

def plot_results(method_counts, type_counts, save_folder):
    """Save a bar chart of method counts and a pie chart of study types.

    Args:
        method_counts: Mapping of method label to number of studies.
        type_counts: Mapping of study type to number of studies.
        save_folder: Directory that receives ``study_methods.png`` and
            ``study_types.png``.
    """
    methods_path = os.path.join(save_folder, 'study_methods.png')
    types_path = os.path.join(save_folder, 'study_types.png')

    # Bar chart: one bar per method label.
    bar_fig = plt.figure(figsize=(12, 6))
    plt.bar(list(method_counts.keys()), list(method_counts.values()))
    plt.title('Study Methods in Assisted Dying Research')
    plt.xlabel('Method')
    plt.ylabel('Number of Studies')
    plt.xticks(rotation=45, ha='right')
    bar_fig.tight_layout()
    bar_fig.savefig(methods_path)
    plt.close(bar_fig)

    # Pie chart: share of each study type, labelled with percentages.
    pie_fig = plt.figure(figsize=(8, 6))
    plt.pie(list(type_counts.values()), labels=list(type_counts.keys()), autopct='%1.1f%%')
    plt.title('Empirical vs Theoretical Studies in Assisted Dying Research')
    pie_fig.savefig(types_path)
    plt.close(pie_fig)

def main():
    """Interactively search PubMed, save a CSV summary, and plot the results.

    Prompts for a keyword, collects all matching PMIDs, fetches and parses
    each record, writes one CSV row per article into a new timestamped
    folder, and finally saves the summary plots. A record that cannot be
    fetched or parsed is reported and skipped so that a single bad record
    does not abort a long run.
    """
    keyword = input("Enter the keyword to search for (e.g., 'assisted dying'): ")
    all_pmids = get_all_articles(keyword)

    print(f"\nRetrieved {len(all_pmids)} PMIDs for the keyword '{keyword}'")

    if not all_pmids:
        # Nothing to fetch: avoid creating an empty folder and empty plots.
        print("No articles found; nothing to save.")
        return

    save_folder = create_folder(keyword)
    print(f"\nCreated folder: {save_folder}")

    csv_file_path = os.path.join(save_folder, "articles_summary.csv")
    articles = []
    skipped = []
    total = len(all_pmids)

    with open(csv_file_path, "w", newline='', encoding='utf-8') as csvfile:
        fieldnames = ["PMID", "Title", "Abstract", "Journal", "Year", "Methods", "Study Type"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for i, pmid in enumerate(all_pmids, 1):
            try:
                xml_content = fetch_full_text(pmid)
                article_info = parse_xml(xml_content)
            except (requests.RequestException, ET.ParseError, AttributeError) as err:
                # AttributeError covers records lacking an element the parser
                # dereferences; network and XML errors are likewise per-record.
                skipped.append(pmid)
                print(f"Skipped article {i} of {total} (PMID: {pmid}): {err}")
                continue
            finally:
                time.sleep(0.34)  # To respect NCBI's rate limit

            # ``or ''`` keeps a missing title/abstract from becoming the
            # literal text "None" in the classifier input.
            text = f"{article_info['Title'] or ''} {article_info['Abstract'] or ''}"
            methods, study_type = categorize_study(text)

            article_info['Methods'] = ', '.join(methods) if methods else 'Unclassified'
            article_info['Study Type'] = study_type

            writer.writerow(article_info)
            articles.append(article_info)

            print(f"Processed article {i} of {total} (PMID: {pmid})")

    if skipped:
        print(f"\nSkipped {len(skipped)} of {total} articles due to errors.")

    print(f"\nProcess completed. Results saved in folder: {save_folder}")
    print(f"CSV file: {csv_file_path}")

    if articles:
        method_counts, type_counts = analyze_articles(articles)
        plot_results(method_counts, type_counts, save_folder)
        print("Summary plots: study_methods.png and study_types.png")
    else:
        print("No articles could be processed; summary plots were not generated.")

# Run the interactive pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Retrieved 3880 of 3880 articles

Retrieved 3880 PMIDs for the keyword 'assisted dying'

Created folder: pubmed_assisted_dying_20240822_121408
Processed article 1 of 3880 (PMID: 39168589)
Processed article 2 of 3880 (PMID: 39167528)
Processed article 3 of 3880 (PMID: 39160544)
Processed article 4 of 3880 (PMID: 39157533)
Processed article 5 of 3880 (PMID: 39157418)
Processed article 6 of 3880 (PMID: 39152645)
Processed article 7 of 3880 (PMID: 39144136)
Processed article 8 of 3880 (PMID: 39143961)
Processed article 9 of 3880 (PMID: 39126283)
Processed article 10 of 3880 (PMID: 39122437)
Processed article 11 of 3880 (PMID: 39122386)
Processed article 12 of 3880 (PMID: 39121499)
Processed article 13 of 3880 (PMID: 39119216)
Processed article 14 of 3880 (PMID: 39117361)
Processed article 15 of 3880 (PMID: 39095146)
Processed article 16 of 3880 (PMID: 39093520)
Processed article 17 of 3880 (PMID: 39087246)
Processed article 18 of 3880 (PMID: 39083816)
Processed article 19 of 3880 (PMID: 390

KeyboardInterrupt: 