In [1]:
from Bio import Entrez
import pandas as pd
import os
import time
import json
from datetime import datetime
import xml.etree.ElementTree as ET

In [13]:
# Contact address handed to Bio.Entrez for the NCBI requests made by the cells below.
# NOTE(review): this is a personal address committed in the notebook; consider
# loading it from configuration or an environment variable before sharing.
Entrez.email = "olandechris@gmail.com"

In [None]:
def search_pubmed(query, max_results = 1, date_from = None, date_to = None, sort_order = "relevance", publication_types = None):
    """Search PubMed through Entrez ESearch and return the matching PMIDs.

    Parameters
    ----------
    query : str
        Base search expression; the optional filters below are appended to it
        with ``AND``.
    max_results : int
        Passed to ESearch as ``retmax`` (maximum number of IDs returned).
    date_from, date_to : str or int, optional
        Bounds of a ``[PDAT]`` range. A missing lower bound becomes ``1900``
        and a missing upper bound becomes ``3000``.
    sort_order : str
        Passed to ESearch as ``sort``.
    publication_types : str or iterable of str, optional
        Publication types OR-ed together into one filter. A single string is
        treated as one publication type.

    Returns
    -------
    list
        The ``IdList`` from the ESearch result, or an empty list if the
        request or the parsing of its response fails (the error is printed).
    """
    search_term = query

    # Add date filters; an open-ended side falls back to a wide sentinel year.
    if date_from or date_to:
        range_start = date_from or "1900"
        range_end = date_to or "3000"
        search_term += f' AND {range_start}[PDAT]:{range_end}[PDAT]'

    # Add publication type filters
    if publication_types:
        # A bare string would otherwise be iterated character by character.
        if isinstance(publication_types, str):
            publication_types = [publication_types]
        pub_filter = ' OR '.join(f'"{pt}"[Publication Type]' for pt in publication_types)
        search_term += f' AND ({pub_filter})'

    print(f"Searching PubMed with query: {search_term}")

    try:
        # Perform the search
        handle = Entrez.esearch(
            db = "pubmed",
            term=search_term,
            retmax=max_results,
            sort=sort_order
        )
        try:
            search_results = Entrez.read(handle)
        finally:
            # Release the connection even when the response cannot be parsed.
            handle.close()

        pmids = search_results["IdList"]
        count = int(search_results["Count"])

        print(f"Found {count} total articles, retrieving {len(pmids)} IDs")
        return pmids

    except Exception as e:
        # Best-effort lookup: report the problem and hand back no results.
        print(f"Search error: {e}")
        return []

        

In [6]:
# Example search; max_results defaults to 1, so at most one PMID comes back.
search_pubmed("Kurt Cobain")

Searching PubMed with query: Kurt Cobain
Found 4 total articles, retrieving 1 IDs


['7988166']

In [None]:
# PubMed page for the PMID returned by the search above:
# https://pubmed.ncbi.nlm.nih.gov/7988166

In [9]:
def parse_article(summary, record):
    """Parse article summary and record into structured data.

    Parameters
    ----------
    summary : mapping
        One ESummary entry. The keys 'Id', 'Title', 'Source', 'PubDate' and
        'AuthorList' are read; each is optional.
    record : mapping
        One EFetch article entry. ``record['MedlineCitation']['Article']``
        is required; ``record['PubmedData']['ArticleIdList']`` is used when
        present.

    Returns
    -------
    dict or None
        Flat dictionary of article fields, or ``None`` when anything goes
        wrong while parsing (the error is printed).
    """
    # Bound before the try block so the error message can always refer to it,
    # even when reading the summary itself is what fails.
    pmid = ''
    try:
        # Basic info from summary
        pmid = str(summary.get('Id', ''))
        title = summary.get('Title', '').strip()

        # Journal info
        journal = summary.get('Source', '')
        pub_date = summary.get('PubDate', '')

        # Authors from summary
        authors_list = summary.get('AuthorList', [])
        authors = '; '.join(authors_list) if authors_list else ''

        # Extract more details from full record
        citation = record['MedlineCitation']
        article = citation['Article']

        # Abstract: every AbstractText section is joined with a single space.
        abstract_sections = article.get('Abstract', {}).get('AbstractText', [])
        abstract = ' '.join(str(section) for section in abstract_sections)

        # Publication details
        journal_info = article.get('Journal', {})
        journal_title = journal_info.get('Title', journal)

        # Volume and issue
        journal_issue = journal_info.get('JournalIssue', {})
        volume = journal_issue.get('Volume', '')
        issue = journal_issue.get('Issue', '')

        # Publication date details
        pub_date_info = journal_issue.get('PubDate', {})
        year = pub_date_info.get('Year', '')
        month = pub_date_info.get('Month', '')
        day = pub_date_info.get('Day', '')

        # DOI and other identifiers
        doi = ''
        pmc_id = ''

        # Preferred source: identifiers attached to the article itself.
        for eloc in article.get('ELocationID', []):
            id_type = getattr(eloc, 'attributes', {}).get('EIdType')
            if id_type == 'doi':
                doi = str(eloc)
            elif id_type == 'pmc':
                pmc_id = str(eloc)

        # Fallback: the record-level ArticleIdList. This is where the PMC id
        # is listed, and the DOI too when the article has no ELocationID.
        for article_id in record.get('PubmedData', {}).get('ArticleIdList', []):
            id_type = getattr(article_id, 'attributes', {}).get('IdType')
            if id_type == 'doi' and not doi:
                doi = str(article_id)
            elif id_type == 'pmc' and not pmc_id:
                pmc_id = str(article_id)

        # Keywords/MeSH terms
        mesh_terms = [
            str(mesh['DescriptorName'])
            for mesh in citation.get('MeshHeadingList', [])
        ]

        # Publication types
        pub_types = [str(pt) for pt in article.get('PublicationTypeList', [])]

        return {
            'pmid': pmid,
            'title': title,
            'abstract': abstract,
            'authors': authors,
            'journal': journal_title,
            'volume': volume,
            'issue': issue,
            'year': year,
            'month': month,
            'day': day,
            'pub_date': pub_date,
            'doi': doi,
            'pmc_id': pmc_id,
            'mesh_terms': '; '.join(mesh_terms),
            'publication_types': '; '.join(pub_types),
            'pubmed_url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            'doi_url': f"https://doi.org/{doi}" if doi else ''
        }

    except Exception as e:
        print(f"Error parsing article {pmid}: {e}")
        return None

In [10]:
def fetch_article_details(pmids, batch_size = 100):
    """Download and parse PubMed records for a list of PMIDs.

    The PMIDs are processed in slices of ``batch_size``. For every slice the
    ESummary entries and the EFetch XML records are requested, paired by PMID
    and handed to ``parse_article``.

    Parameters
    ----------
    pmids : sequence
        PubMed identifiers; each one is converted with ``str`` before being
        sent.
    batch_size : int
        Number of PMIDs requested per ESummary/EFetch call.

    Returns
    -------
    list of dict
        One parsed article per record that ``parse_article`` accepted.
        Batches that raise are reported with ``print`` and skipped.
    """
    articles = []

    # Process in batches
    for i in range(0, len(pmids), batch_size):
        batch_pmids = pmids[i:i + batch_size]
        # str() lets callers pass integer PMIDs without losing the batch.
        id_param = ",".join(str(pmid) for pmid in batch_pmids)

        print(f"Fetching batch {i//batch_size + 1}/{(len(pmids)-1)//batch_size + 1} "
                  f"({len(batch_pmids)} articles)...")

        try:
            # Fetch article summaries first
            handle = Entrez.esummary(db = "pubmed", id = id_param)
            try:
                summaries = Entrez.read(handle)
            finally:
                # Close the handle even if the response cannot be parsed.
                handle.close()

            # Fetch full abstract records
            handle = Entrez.efetch(
                db = "pubmed",
                id = id_param,
                rettype = "medline",
                retmode = "xml"
            )
            try:
                records = Entrez.read(handle)
            finally:
                handle.close()

            # Pair summaries and records by PMID instead of by position: the
            # two lists can differ in length, and a positional zip would then
            # attach a summary to the wrong record.
            summaries_by_pmid = {str(entry.get('Id', '')): entry for entry in summaries}

            # Parse articles
            for record in records['PubmedArticle']:
                record_pmid = str(record.get('MedlineCitation', {}).get('PMID', ''))
                summary = summaries_by_pmid.get(record_pmid)
                if summary is None:
                    print(f"No summary found for PMID {record_pmid}; skipping record")
                    continue

                article_data = parse_article(summary, record)
                if article_data:
                    articles.append(article_data)

            # Simple rate limiter
            #TODO: Implement the exponential backoff if need be
            time.sleep(0.34) # 3 requests per second

        except Exception as e:
            print(f"Error fetching batch {i//batch_size + 1}: {e}")
            continue

    return articles

In [15]:
# Example: fetch and parse the single PMID returned by the search cell above.
fetch_article_details(['7988166'])

Fetching batch 1/1 (1 articles)...


[{'pmid': '7988166',
  'title': 'Kurt Cobain.',
  'abstract': '',
  'authors': 'Kienhorst I',
  'journal': 'Crisis',
  'volume': '15',
  'issue': '2',
  'year': '1994',
  'month': '',
  'day': '',
  'pub_date': '1994',
  'doi': '',
  'pmc_id': '',
  'mesh_terms': 'Adolescent; Adult; Europe; Famous Persons; Female; History, 20th Century; Humans; Imitative Behavior; Male; Music; Social Conformity; Suicide; United States; Wounds, Gunshot',
  'publication_types': 'Biography; Historical Article; Journal Article',
  'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/7988166/',
  'doi_url': ''}]