In [1]:
pip install biopython


Defaulting to user installation because normal site-packages is not writeable
Collecting biopython
  Downloading biopython-1.83-cp39-cp39-macosx_10_9_x86_64.whl (2.7 MB)
     |████████████████████████████████| 2.7 MB 3.1 MB/s eta 0:00:01
Collecting numpy
  Downloading numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl (20.6 MB)
     |████████████████████████████████| 20.6 MB 23.1 MB/s eta 0:00:01
Installing collected packages: numpy, biopython
Successfully installed biopython-1.83 numpy-1.26.4
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import os
import re

from Bio import Entrez

# Always tell NCBI who you are (your email address).
# Set the ENTREZ_EMAIL environment variable to supply your own contact address;
# the address below is only the fallback, so existing runs behave as before.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'abvishwakarma@ucdavis.edu')


In [9]:
def search_pubmed(diseases, keywords, retmax=50):
    """Search PubMed for every disease/keyword pair and collect article details.

    One ESearch query of the form ``(<disease>) AND <keyword>[Title/Abstract]``
    is issued per pair; the returned PMIDs are handed to ``fetch_details``.

    Args:
        diseases: Iterable of disease terms.
        keywords: Iterable of keywords restricted to the title/abstract field.
        retmax: Maximum number of PMIDs requested per query (default 50).

    Returns:
        A flat list with the items returned by ``fetch_details`` for each
        query, in query order. Results are not de-duplicated, so an article
        matching several queries appears once per matching query.
    """
    base_db = 'pubmed'
    results = []

    for disease in diseases:
        for keyword in keywords:
            # Optional narrowing (currently disabled):
            #   AND "animal model"[Title/Abstract]
            query = f'({disease}) AND {keyword}[Title/Abstract]'

            handle = Entrez.esearch(db=base_db, term=query, retmax=retmax)
            try:
                record = Entrez.read(handle)
            finally:
                # Entrez.read does not close the handle it is given.
                handle.close()

            id_list = record['IdList']
            if id_list:  # Only fetch when the search returned PMIDs
                results.extend(fetch_details(id_list))
            else:
                print(f"No results found for query: {query}")

    return results


In [7]:
def fetch_details(id_list):
    """Fetch the title and abstract of each PubMed article in ``id_list``.

    Note: a later cell in this notebook defines another ``fetch_details`` that
    also extracts study details; whichever definition was executed last is the
    one ``search_pubmed`` calls.

    Args:
        id_list: List of PMID strings.

    Returns:
        A list of dicts with keys ``'Title'`` and ``'Abstract'``, one per
        entry under ``'PubmedArticle'`` in the EFetch response. Returns an
        empty list without contacting NCBI when ``id_list`` is empty.
    """
    if not id_list:
        return []

    handle = Entrez.efetch(db='pubmed', id=','.join(id_list), retmode='xml')
    try:
        articles = Entrez.read(handle)
    finally:
        # Entrez.read does not close the handle it is given.
        handle.close()

    article_details = []
    for article in articles['PubmedArticle']:
        citation_article = article['MedlineCitation']['Article']
        title = citation_article['ArticleTitle']
        # Articles without an abstract fall back to an empty list -> "".
        abstract = citation_article.get('Abstract', {}).get('AbstractText', [])
        if isinstance(abstract, list):
            # Sectioned abstracts arrive as several parts; join them.
            abstract_text = " ".join(str(part) for part in abstract)
        else:
            abstract_text = str(abstract)

        article_details.append({
            'Title': title,
            'Abstract': abstract_text
        })

    return article_details


In [12]:
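# Advanced fetch_details: replaces the basic fetch_details defined in the earlier cell
# and additionally extracts study details (animal model, age, weight, ...) from each abstract.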

def fetch_details(id_list):
    """Fetch title, abstract and heuristically extracted details per article.

    Args:
        id_list: List of PMID strings.

    Returns:
        A list of dicts, one per entry under ``'PubmedArticle'`` in the EFetch
        response, each with keys ``'Title'``, ``'Abstract'`` and ``'Details'``.
        ``'Details'`` maps 'Animal Model', 'Age', 'Weight', 'Sex',
        'Disease Model' and 'Endpoint' to the result of ``extract_detail`` on
        the abstract. Returns an empty list without contacting NCBI when
        ``id_list`` is empty.
    """
    if not id_list:
        return []

    # Output label -> keyword searched for in the abstract.
    detail_keywords = {
        'Animal Model': 'animal model',
        'Age': 'age',
        'Weight': 'weight',
        'Sex': 'sex',
        'Disease Model': 'disease model',
        'Endpoint': 'endpoint',
    }

    handle = Entrez.efetch(db='pubmed', id=','.join(id_list), retmode='xml')
    try:
        articles = Entrez.read(handle)
    finally:
        # Entrez.read does not close the handle it is given.
        handle.close()

    article_details = []
    for article in articles['PubmedArticle']:
        citation_article = article['MedlineCitation']['Article']
        title = citation_article['ArticleTitle']
        # Articles without an abstract fall back to an empty list -> "".
        abstract_texts = citation_article.get('Abstract', {}).get('AbstractText', [])

        # Combine all parts of the abstract into one string if it's split into sections
        if isinstance(abstract_texts, list):
            abstract = " ".join(str(part) for part in abstract_texts)
        else:
            abstract = str(abstract_texts)

        extracted_info = {
            label: extract_detail(abstract, keyword)
            for label, keyword in detail_keywords.items()
        }

        article_details.append({
            'Title': title,
            'Abstract': abstract,
            'Details': extracted_info
        })

    return article_details

def extract_detail(abstract, keyword):
    """
    Heuristically extract the text that follows ``keyword`` in ``abstract``.

    The keyword is matched literally (regex metacharacters are escaped),
    case-insensitively and on word boundaries. After the keyword, optional
    whitespace and an optional colon are skipped, and the following run of
    word characters, whitespace and commas is captured, so the capture stops
    at the first other character (for example a period or parenthesis).

    Args:
        abstract: Abstract text to search; ``None`` or an empty string is
            treated as having no match.
        keyword: Literal phrase to look for, e.g. ``'animal model'``.

    Returns:
        The captured text with surrounding whitespace removed, taken from the
        first occurrence that yields a non-blank capture, or
        ``'Not specified'`` when there is none.
    """
    if not abstract:
        return 'Not specified'

    # re.escape keeps keywords such as "b.w" or "weight (g" from being
    # interpreted as regex syntax.
    pattern = re.compile(r'\b' + re.escape(keyword) + r'\b\s*[:]?([\w\s,]+)', re.IGNORECASE)

    for match in pattern.finditer(abstract):
        detail = match.group(1).strip()
        if detail:  # skip captures that are only whitespace
            return detail
    return 'Not specified'


In [13]:
# Search terms: every disease is paired with every keyword by search_pubmed.
diseases = [
    'IBS',
    "Crohn's Disease",
    'Celiac Disease',
]
keywords = [
    'preclinical',
    'rat',
    'rodent',
    'mice',
    'animal',
    'pig',
    'hamster',
]

# Run all disease/keyword queries and gather the article records.
articles = search_pubmed(diseases, keywords)

# Destination of the JSON export.
filename = 'pubmed_results.json'

# Serialize the collected records into the file, indented for readability.
with open(filename, 'w') as f:
    json.dump(obj=articles, fp=f, indent=4)

print(f"Data has been written to {filename}")



Data has been written to pubmed_results.json
