In [1]:
from Bio import Entrez
import pandas as pd
import os
import time
import json
from datetime import datetime
import xml.etree.ElementTree as ET

In [13]:
# Contact address that Biopython sends to NCBI with every E-utilities request.
# Set the ENTREZ_EMAIL environment variable to use a different address without
# editing the notebook; the original address remains the fallback.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "olandechris@gmail.com")

In [None]:
def search_pubmed(query, max_results = 1, date_from = None, date_to = None, sort_order = "relevance", publication_types = None):
    """Search PubMed and return the PMIDs of the matching articles.

    Args:
        query: Main search expression. May be empty or None when a date or
            publication-type filter is supplied.
        max_results: Maximum number of PMIDs to retrieve (sent as ``retmax``).
        date_from: Start of the publication-date range; 1900 is used when only
            ``date_to`` is given.
        date_to: End of the publication-date range; 3000 is used when only
            ``date_from`` is given.
        sort_order: Sort order sent to esearch as ``sort``.
        publication_types: One publication type name, or an iterable of names.
            Multiple names are OR-ed together.

    Returns:
        The ``IdList`` of the search result, or an empty list when there is
        nothing to search for or the request fails.
    """
    # Collect the clauses first and join them once, so an empty query never
    # leaves a dangling leading " AND ".
    clauses = []

    base_query = str(query).strip() if query else ""
    if base_query:
        clauses.append(base_query)

    # Add date filters (an open end of the range falls back to 1900 / 3000)
    if date_from or date_to:
        clauses.append(f'{date_from or 1900}[PDAT]:{date_to or 3000}[PDAT]')

    # Add publication type filters
    if publication_types:
        if isinstance(publication_types, str):
            # A lone string would otherwise be iterated character by character
            publication_types = [publication_types]
        pub_filter = ' OR '.join(f'"{pt}"[Publication Type]' for pt in publication_types)
        clauses.append(f'({pub_filter})')

    search_term = ' AND '.join(clauses)
    if not search_term:
        print("Search error: empty search term")
        return []

    print(f"Searching PubMed with query: {search_term}")

    try:
        # Perform the search
        handle = Entrez.esearch(
            db = "pubmed",
            term=search_term,
            retmax=max_results,
            sort=sort_order
        )
        try:
            search_results = Entrez.read(handle)
        finally:
            # Release the connection even when the response cannot be parsed
            handle.close()

        pmids = search_results["IdList"]
        count = int(search_results["Count"])

        print(f"Found {count} total articles, retrieving {len(pmids)} IDs")
        return pmids

    except Exception as e:
        print(f"Search error: {e}")
        return []

        

In [6]:
# Smoke test: search with the default max_results=1, so a single PMID is returned.
search_pubmed("Kurt Cobain")

Searching PubMed with query: Kurt Cobain
Found 4 total articles, retrieving 1 IDs


['7988166']

In [None]:
# PubMed page for the PMID returned above: https://pubmed.ncbi.nlm.nih.gov/7988166

In [9]:
def parse_article(summary, record):
    """Parse article summary and record into structured data.

    Args:
        summary: Mapping for one article as returned by esummary; the keys
            'Id', 'Title', 'Source', 'PubDate' and 'AuthorList' are read.
        record: Mapping for the same article as returned by efetch; the
            'MedlineCitation' entry is required and 'PubmedData' is read
            when present.

    Returns:
        A dict with the keys pmid, title, abstract, authors, journal, volume,
        issue, year, month, day, pub_date, doi, pmc_id, mesh_terms,
        publication_types, pubmed_url and doi_url, or None when the inputs
        cannot be parsed.
    """
    # Bound before the try block so the error message can always be formatted,
    # even when reading the summary itself is what fails.
    pmid = ''
    try:
        # Basic info from summary
        pmid = str(summary.get('Id', ''))
        title = summary.get('Title', '').strip()

        # Journal info
        journal = summary.get('Source', '')
        pub_date = summary.get('PubDate', '')

        # Authors from summary
        authors_list = summary.get('AuthorList', [])
        authors = '; '.join(str(author) for author in authors_list) if authors_list else ''

        # Extract more details from full record
        medline_citation = record['MedlineCitation']
        article = medline_citation['Article']

        # Abstract: sections of a structured abstract are joined with spaces
        # (section labels are not included in the text)
        abstract = ''
        if 'Abstract' in article:
            abstract_texts = []
            if 'AbstractText' in article['Abstract']:
                abstract_texts = [str(abs_text) for abs_text in article['Abstract']['AbstractText']]
            abstract = ' '.join(abstract_texts)

        # Publication details
        journal_info = article.get('Journal', {})
        journal_title = journal_info.get('Title', journal)

        # Volume and issue
        journal_issue = journal_info.get('JournalIssue', {})
        volume = journal_issue.get('Volume', '')
        issue = journal_issue.get('Issue', '')

        # Publication date details
        pub_date_info = journal_issue.get('PubDate', {})
        year = pub_date_info.get('Year', '')
        month = pub_date_info.get('Month', '')
        day = pub_date_info.get('Day', '')
        if not year:
            # Some records carry a free-text MedlineDate (e.g. "1998 Dec-1999 Jan")
            # instead of Year/Month/Day; recover the leading year from it.
            medline_date = str(pub_date_info.get('MedlineDate', ''))
            if medline_date[:4].isdigit():
                year = medline_date[:4]

        # DOI and other identifiers
        doi = ''
        pmc_id = ''

        # getattr keeps a plain string (no .attributes) from aborting the parse
        for eloc in article.get('ELocationID', []):
            id_type = getattr(eloc, 'attributes', {}).get('EIdType')
            if id_type == 'doi':
                doi = str(eloc)
            elif id_type == 'pmc':
                pmc_id = str(eloc)

        # PubmedData/ArticleIdList also lists identifiers (this is where the
        # PMC ID is found); use it for anything ELocationID did not provide.
        for article_id in record.get('PubmedData', {}).get('ArticleIdList', []):
            id_type = getattr(article_id, 'attributes', {}).get('IdType')
            if id_type == 'doi' and not doi:
                doi = str(article_id)
            elif id_type == 'pmc' and not pmc_id:
                pmc_id = str(article_id)

        # Keywords/MeSH terms
        mesh_terms = [
            str(mesh['DescriptorName'])
            for mesh in medline_citation.get('MeshHeadingList', [])
        ]

        # Publication types
        pub_types = [str(pt) for pt in article.get('PublicationTypeList', [])]

        return {
            'pmid': pmid,
            'title': title,
            'abstract': abstract,
            'authors': authors,
            'journal': journal_title,
            'volume': volume,
            'issue': issue,
            'year': year,
            'month': month,
            'day': day,
            'pub_date': pub_date,
            'doi': doi,
            'pmc_id': pmc_id,
            'mesh_terms': '; '.join(mesh_terms),
            'publication_types': '; '.join(pub_types),
            'pubmed_url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            'doi_url': f"https://doi.org/{doi}" if doi else ''
        }

    except Exception as e:
        print(f"Error parsing article {pmid}: {e}")
        return None

In [10]:
def fetch_article_details(pmids, batch_size = 100):
    """Download and parse summary and full-record data for the given PMIDs.

    The PMIDs are requested in batches of ``batch_size``. Each batch issues
    one esummary and one efetch request; every returned ``PubmedArticle``
    record is passed to ``parse_article`` together with the summary that has
    the same PMID.

    Args:
        pmids: Sequence of PubMed IDs (strings or anything ``str()`` accepts).
        batch_size: Number of PMIDs sent per request.

    Returns:
        List of the dicts produced by ``parse_article``. Records that fail to
        parse, that have no matching summary, or that belong to a batch whose
        request failed are left out.
    """
    request_delay = 0.34 # 3 requests per second

    pmids = [str(pmid) for pmid in pmids]
    articles = []

    # Process in batches
    for i in range(0, len(pmids), batch_size):
        batch_pmids = pmids[i:i + batch_size]
        id_param = ",".join(batch_pmids)

        print(f"Fetching batch {i//batch_size + 1}/{(len(pmids)-1)//batch_size + 1} "
                  f"({len(batch_pmids)} articles)...")

        try:
            # Fetch article summaries first
            handle = Entrez.esummary(db = "pubmed", id = id_param)
            try:
                summaries = Entrez.read(handle)
            finally:
                handle.close()

            # The batch makes two requests, so pause between them as well
            time.sleep(request_delay)

            # Fetch full abstract records
            handle = Entrez.efetch(
                db = "pubmed",
                id = id_param,
                rettype = "medline",
                retmode = "xml"
            )
            try:
                records = Entrez.read(handle)
            finally:
                handle.close()

            # Pair each record with its summary by PMID rather than by position:
            # the two responses are not guaranteed to contain the same entries
            # in the same order.
            summary_by_pmid = {str(summary.get('Id', '')): summary for summary in summaries}

            # Parse articles
            for record in records['PubmedArticle']:
                record_pmid = str(record.get('MedlineCitation', {}).get('PMID', ''))
                summary = summary_by_pmid.get(record_pmid)
                if summary is None:
                    print(f"No summary found for PMID {record_pmid}; skipping")
                    continue
                article_data = parse_article(summary, record)
                if article_data:
                    articles.append(article_data)

        except Exception as e:
            print(f"Error fetching batch {i//batch_size + 1}: {e}")

        finally:
            # Simple rate limiter; runs after failed batches too
            #TODO: Implement the exponential backoff if need be
            time.sleep(request_delay)

    return articles

In [15]:
# Fetch and parse the details for the PMID found by the search above.
fetch_article_details(['7988166'])

Fetching batch 1/1 (1 articles)...


[{'pmid': '7988166',
  'title': 'Kurt Cobain.',
  'abstract': '',
  'authors': 'Kienhorst I',
  'journal': 'Crisis',
  'volume': '15',
  'issue': '2',
  'year': '1994',
  'month': '',
  'day': '',
  'pub_date': '1994',
  'doi': '',
  'pmc_id': '',
  'mesh_terms': 'Adolescent; Adult; Europe; Famous Persons; Female; History, 20th Century; Humans; Imitative Behavior; Male; Music; Social Conformity; Suicide; United States; Wounds, Gunshot',
  'publication_types': 'Biography; Historical Article; Journal Article',
  'pubmed_url': 'https://pubmed.ncbi.nlm.nih.gov/7988166/',
  'doi_url': ''}]

In [18]:
def advanced_search(**kwargs):
    """
    Perform advanced search with multiple parameters

    Available parameters:
    - query: Main search terms
    - author: Author name
    - journal: Journal name
    - mesh_terms: MeSH terms list (combined with OR)
    - title_words: Words that must appear in title (combined with AND)
    - abstract_words: Words that must appear in abstract (combined with AND)
    - date_from/date_to: Date range
    - publication_types: List of publication types
    - languages: List of languages (combined with OR)
    - max_results: Maximum results (default 4)
    - sort_order: Sort order passed to search_pubmed (default "relevance")

    Parameters that are missing, None or empty are ignored. A list parameter
    may also be given as a single string.

    Returns whatever search_pubmed returns for the combined query.
    """

    def _as_list(value):
        # A lone string would otherwise be split into single characters
        if isinstance(value, str):
            return [value]
        return list(value)

    search_parts = []

    # Main query
    query = kwargs.get("query")
    if query:
        search_parts.append(query)

    # Author
    author = kwargs.get("author")
    if author:
        search_parts.append(f'"{author}"[Author]')

    # Journal
    journal = kwargs.get("journal")
    if journal:
        search_parts.append(f'"{journal}"[Journal]')

    # Multi-valued fields: (parameter name, PubMed field tag, joining operator)
    multi_value_fields = [
        ("mesh_terms", "MeSH Terms", " OR "),
        ("title_words", "Title", " AND "),
        ("abstract_words", "Abstract", " AND "),
        ("languages", "Language", " OR "),
    ]
    for key, field_tag, operator in multi_value_fields:
        values = kwargs.get(key)
        if not values:
            # Skipping empty input avoids emitting an empty "()" clause
            continue
        tagged = [f'"{value}"[{field_tag}]' for value in _as_list(values)]
        search_parts.append(f'({operator.join(tagged)})')

    # Combine all parts
    full_query = ' AND '.join(search_parts)

    return search_pubmed(
            query=full_query,
            max_results=kwargs.get('max_results', 4),
            date_from=kwargs.get('date_from'),
            date_to=kwargs.get('date_to'),
            sort_order=kwargs.get('sort_order', 'relevance'),
            publication_types=kwargs.get('publication_types')
        )

In [19]:
# Only a query is given, so advanced_search uses its default max_results of 4.
advanced_search(query = "Kurt Cobain")

Searching PubMed with query: Kurt Cobain
Found 4 total articles, retrieving 4 IDs


['7988166', '8897665', '26445123', '16179336']

In [22]:
# Keep the parsed records in `articles` for the statistics and export cells below.
articles = fetch_article_details(['7988166', '8897665', '26445123', '16179336'])

Fetching batch 1/1 (4 articles)...


In [35]:
def get_article_statistics(articles):
    """Generate basic statistics about downloaded articles.

    Args:
        articles: List of article dicts containing at least the keys
            'abstract', 'year', 'journal' and 'publication_types'.

    Returns:
        An empty dict when ``articles`` is empty, otherwise a dict with
        total_articles, articles_with_abstracts, date_range (earliest/latest
        known year, or None when no article has a year), top_journals,
        publication_types and articles_per_year. Articles whose year is
        missing or empty are left out of the year-based statistics.
    """
    if not articles:
        return {}

    df = pd.DataFrame(articles)

    # An unknown year is stored as ''. It sorts before every real year, so it
    # must be excluded or it would be reported as the earliest year.
    years = df['year'].fillna('').astype(str).str.strip()
    known_years = years[years != '']
    has_years = not known_years.empty

    stats = {
        'total_articles': len(articles),
        'articles_with_abstracts': int((df['abstract'].str.len() > 0).sum()),
        'date_range': {
            'earliest': known_years.min() if has_years else None,
            'latest': known_years.max() if has_years else None
        },
        'top_journals': df['journal'].value_counts().head(10).to_dict(),
        'publication_types': df['publication_types'].value_counts().head(10).to_dict(),
        'articles_per_year': known_years.value_counts().sort_index().to_dict()
    }

    return stats

In [36]:
# Summarise the fetched articles (counts, year range, journals, publication types).
get_article_statistics(articles)

{'total_articles': 4,
 'articles_with_abstracts': 3,
 'date_range': {'earliest': '1994', 'latest': '2015'},
 'top_journals': {'Crisis': 1,
  'Suicide & life-threatening behavior': 1,
  'Arquivos de neuro-psiquiatria': 1,
  'Archives of suicide research : official journal of the International Academy for Suicide Research': 1},
 'publication_types': {'Biography; Historical Article; Journal Article': 2,
  'Historical Article; Journal Article; Portrait': 1,
  'Case Reports; Journal Article': 1},
 'articles_per_year': {'1994': 1, '1996': 1, '2005': 1, '2015': 1}}

In [21]:
def save_to_csv(articles, filename):
    """Write the article records to a UTF-8 CSV file via a pandas DataFrame.

    Prints a notice and writes nothing when ``articles`` is empty.
    """
    if articles:
        frame = pd.DataFrame(articles)
        frame.to_csv(filename, encoding='utf-8', index=False)
        print(f"Saved {len(articles)} articles to {filename}")
    else:
        print("No articles to save")

def save_to_excel(articles, filename):
    """Write the article records to an Excel workbook using the openpyxl engine.

    Prints a notice and writes nothing when ``articles`` is empty.
    """
    if articles:
        frame = pd.DataFrame(articles)
        frame.to_excel(filename, engine='openpyxl', index=False)
        print(f"Saved {len(articles)} articles to {filename}")
    else:
        print("No articles to save")

def save_to_json(articles, filename):
    """Save articles to JSON file.

    Like save_to_csv and save_to_excel, prints a notice and writes nothing
    when ``articles`` is empty. Non-ASCII characters are written as-is
    (UTF-8) rather than as escape sequences.
    """
    if not articles:
        print("No articles to save")
        return

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(articles)} articles to {filename}")

In [24]:
# Export the fetched articles to a CSV file.
save_to_csv(articles, "Cobain.csv")

Saved 4 articles to Cobain.csv


# Document-count mismatch reported by the vector store
The vector store reports 2451 documents in the graph, but we uploaded 4833, which suggests that some of the uploaded documents are duplicates.

In [1]:
import json
from langchain_core.documents import Document

documents_path = "output/pmc_chunks/pmc_semantic_chunks.json"

# Read the whole JSON export into memory
with open(documents_path, "r", encoding = "utf-8") as f:
    data = json.load(f)

# Entries stored under the "documents" key (empty list when the key is absent)
raw_chunks = data.get("documents", [])

loaded_documents_count = len(raw_chunks)
print(f"Number of documents (chunks) loaded from JSON: {loaded_documents_count}")

# Wrap each entry in a Document, carrying its content and metadata across
documents = [
    Document(page_content=chunk["content"], metadata=chunk["metadata"])
    for chunk in raw_chunks
]
print(f"Length of the 'documents' list after list comprehension: {len(documents)}")

Number of documents (chunks) loaded from JSON: 4833
Length of the 'documents' list after list comprehension: 4833


In [5]:

from collections import defaultdict, Counter

# Collect, per PMID, the sequence numbers of its chunks, and build one
# "<pmid>_<seq_num>" ID per document so repeated IDs can be counted later.
pmid_groups = defaultdict(list)
doc_ids = []

for doc in documents:
    metadata = doc.metadata
    # A single space stands in for a missing pmid / seq_num
    pmid = metadata.get("pmid", " ")
    seq_num = metadata.get("seq_num", " ")

    pmid_groups[pmid].append(seq_num)

    doc_id = f"{pmid}_{seq_num}"
    doc_ids.append(doc_id)

In [9]:
# Count how often each "<pmid>_<seq_num>" ID occurs and keep the IDs that
# occur at least twice. len(duplicates) is therefore the number of distinct
# repeated IDs, not the number of surplus documents.
id_counts = Counter(doc_ids)
duplicates = {
    doc_id: occurrences
    for doc_id, occurrences in id_counts.items()
    if occurrences >= 2
}
print(f"Total documents: {len(documents)}")
print(f"Unique PMIDs: {len(pmid_groups)}")
print(f"Duplicate pmid_seq_num combinations: {len(duplicates)}")

Total documents: 4833
Unique PMIDs: 2451
Duplicate pmid_seq_num combinations: 2366
