# Published Items for the Center for Health AI - Monthly

This notebook takes a list of authors, searches for any items they published in the given month, fetches the formatted citation for each item with manubot-cite, and writes the results to a markdown document and an MS Word document.

This isn't a replacement for a proper data pipeline. Just a demo to see if the selection logic is sound.

- 2021/12/20 First demo (ST)
- 2022/01/18 Fetch pubmed instead of PMC ids (ST)
- 2022/01/19 Added caching to help dev go faster (ST)
- 2022/06/24 Changes for monthly counts (DB)

In [None]:
import calendar
import json
import logging
import os
import subprocess
from datetime import date, datetime
from typing import Dict, List, Union

import manubot
import pandas as pd
import pandoc
import requests
from diskcache import Cache
from manubot.cite.citations import Citations
from ratelimit import RateLimitException, limits, sleep_and_retry

# Module-level logger.
# NOTE(review): the cells below log through `logging.info` / `logging.error`
# (the root logger) directly, so `log` is not referenced anywhere visible here.
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Folder that receives the generated chart, markdown and Word files.
BUILD_FOLDER = "_build"

# diskcache store rooted at BUILD_FOLDER; the lookup cells below use it to
# reuse NCBI / manubot results between runs.
cache = Cache(BUILD_FOLDER)

In [None]:
# Papermill Parameters Cell
# These can be used as arguments via papermill

# Year to search; combined with THIS_MONTH to build the search date range
THIS_YEAR = 2021

# Month number to search (used together with THIS_YEAR)
THIS_MONTH = 6

# Optional NCBI API key. When left empty, a later cell falls back to the
# NCBI_API_KEY environment variable.
API_KEY = ""

# Contact email sent to NCBI with every search request
API_EMAIL = "dave.bunten@cuanschutz.edu"

# For testing, set a sample size between 0 and 1.0.
# Only values strictly between 0 and 1 cause the author list to be sampled;
# anything else (including 1.0) keeps every author.
SAMPLE_FRACTION = 1.0

# Cache expiration in seconds
CACHE_EXPIRE_SECONDS = 60 * 60 * 24  # seconds * minutes * hours = 1 day

In [None]:
# File names (relative to BUILD_FOLDER) of the generated artifacts.
# The markdown file name is also the basis for the Word document name.
BUILD_MARKDOWN_FILENAME = "cites_monthly.md"
# Horizontal bar chart of per-author item counts, embedded in the markdown.
BUILD_AUTHOR_COUNT_HBAR_CHART = "author_items_monthly.png"

In [None]:
# No key supplied through the parameters? Then adopt a non-empty
# NCBI_API_KEY environment variable; otherwise API_KEY stays exactly as it was.
if not API_KEY:
    API_KEY = os.environ.get("NCBI_API_KEY") or API_KEY

# Calls allowed per rate-limit window: more generous when a key is present.
NCBI_RATE_LIMIT = 10 if API_KEY else 3

NCBI_RATE_LIMIT

In [None]:
# Build the display name and the first/last day of the requested month.
# Both bounds are rendered as yyyy/mm/dd strings for the NCBI date filter.
first_of_month = date(THIS_YEAR, THIS_MONTH, 1)
_, num_month_days = calendar.monthrange(THIS_YEAR, THIS_MONTH)
last_of_month = first_of_month.replace(day=num_month_days)

MONTH_NAME = first_of_month.strftime("%B")
MINDATE = first_of_month.strftime("%Y/%m/%d")
MAXDATE = last_of_month.strftime("%Y/%m/%d")
print(f"Month range for {MONTH_NAME} {THIS_YEAR}: {MINDATE} - {MAXDATE}")

In [None]:
# All generated files are written to BUILD_FOLDER.
# exist_ok=True makes this a single race-free call instead of a separate
# "does it exist?" check followed by the creation.
os.makedirs(BUILD_FOLDER, exist_ok=True)

In [None]:
# Read in the authors and their specific search terms.
# With orient="index" each top-level key of authors.json becomes a row label;
# later cells read the "author_type" and "search_term" fields of every row.
authors_df = pd.read_json("authors.json", orient="index")
authors_df

In [None]:
# For testing it's nice to work on a sample of the authors.
# Sampling only happens for fractions strictly between 0 and 1; any other
# value (such as the default 1.0) leaves the full author list in place.
if 0 < SAMPLE_FRACTION < 1.0:
    authors_df = authors_df.sample(frac=SAMPLE_FRACTION)

In [None]:
# Convert into a dict keyed by the DataFrame index, with one
# {column: value} record per author.
author_records = authors_df.to_dict("index")
author_records

In [None]:
# Adjust search terms by author type: contributors are limited to items
# affiliated with CU or CCPM. Records are edited in place, so running this
# cell a second time wraps an already-wrapped term again.
for record in author_records.values():
    if record["author_type"] != "contributor":
        continue

    # Restrict contributors to CU items
    record["search_term"] = f"({record['search_term']} AND ((\"University of Colorado\") OR (\"Colorado Center for Personalized Medicine\")))"

author_records

In [None]:
@sleep_and_retry
@limits(calls=NCBI_RATE_LIMIT, period=60)
def search_ncbi(
    term: str,
    mindate: str,
    maxdate: str,
    api_key: Union[str, None] = None,
    email: str = API_EMAIL,
) -> List[str]:
    """
    Look up the IDs that match a search term within a date range.

    Calls to this function are throttled to NCBI_RATE_LIMIT per 60 seconds.
    The individual page requests made inside one call are not throttled
    separately. NCBI's E-utilities usage guidelines express their limits per
    second (3 requests without an API key, 10 with one), so this throttle is
    considerably more conservative than required.

    Args:
        term: NCBI search expression.
        mindate: Start of the date range, formatted yyyy/mm/dd.
        maxdate: End of the date range, formatted yyyy/mm/dd.
        api_key: Optional NCBI API key; only sent when truthy.
        email: Contact address sent with every request.

    Returns:
        The IDs collected across all result pages. When NCBI answers with a
        non-200 status the error is logged and the IDs gathered so far
        (possibly none) are returned.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            a request exceeds the timeout.
    """
    # Seconds to wait on NCBI before giving up, so a stalled connection
    # cannot hang the whole run.
    request_timeout = 30

    # The original literal listed "format" twice; only the last value ("json")
    # ever took effect, so that is the one kept here.
    params = {
        "term": term,
        "tool": "CUAnschutz-Center_for_Health_AI-DEV",
        "email": email,
        "format": "json",
        "retmax": 100,
        "retstart": 0,
        # note: date format is in yyyy/mm/dd
        "mindate": mindate,
        "maxdate": maxdate,
    }

    if api_key:
        params["api_key"] = api_key

    ids: List[str] = []

    # page through the results until there are no more ids
    while True:
        r = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
            params=params,
            timeout=request_timeout,
        )
        if r.status_code != 200:
            logging.error(f"NCBI returned a status code of {r.status_code}.")
            break

        page_ids = r.json()["esearchresult"]["idlist"]
        if not page_ids:
            # an empty page means every ID has been collected
            break

        # extend in place rather than rebuilding the list for every page
        ids.extend(page_ids)
        # move the start offset up by the page size
        params["retstart"] += params["retmax"]

    return ids

In [None]:
# I would like to do this in parallel, but the deal with NCBI is we agree not to do that
# Maps each found ID to {"authors": [author, ...]} listing every author whose
# search returned it.
id_dict = {}

for author, v in author_records.items():

    cache_key = f"author-{author}"

    # Read the cache once. A get() followed by cache[cache_key] is a second
    # lookup that can raise KeyError if the entry expires in between.
    ids = cache.get(cache_key)
    if not ids:
        # Cache miss, or a cached empty result: empty lists are always
        # looked up again.
        logging.info(f"Looking up `{author}` using {v['search_term']}")
        ids = search_ncbi(
            term=v["search_term"], mindate=MINDATE, maxdate=MAXDATE, api_key=API_KEY
        )
        cache.set(cache_key, ids, expire=CACHE_EXPIRE_SECONDS)

    # `pmid` rather than `id`, so the builtin is not shadowed at module level
    for pmid in ids:
        # create the nested record on first sight, then add this author
        id_dict.setdefault(pmid, {"authors": []})["authors"].append(author)

How many items were found?

In [None]:
len(id_dict)

In [None]:
# Tally how many found items each author appears on; handy for getting a
# feel for the shape of the data.
doc_count = {}
for record in id_dict.values():
    for author in record["authors"]:
        doc_count[author] = doc_count.get(author, 0) + 1

authors_df["doc_count"] = pd.Series(doc_count)

# Horizontal bar chart ordered by count; authors without any items
# (NaN counts) are placed first.
ranked_authors = authors_df.sort_index().sort_values(
    by=["doc_count"], ascending=True, na_position="first"
)
axes = ranked_authors.plot(kind="barh", figsize=(14, 12), legend=False, fontsize=14)
fig = axes.get_figure()
fig.savefig(os.path.join(BUILD_FOLDER, BUILD_AUTHOR_COUNT_HBAR_CHART))

In [None]:
@sleep_and_retry
@limits(calls=NCBI_RATE_LIMIT, period=60)
def fetch_csljson(id: str) -> Dict:
    """
    Fetch the CSL JSON item for a single ID through manubot.

    Manubot in turn queries NCBI, hence the rate limiting on this function.

    Args:
        id: The ID to resolve.

    Returns:
        The first CSL item manubot produces for the ID.
    """
    logging.info(f"Fetching csljson for {id}")
    # NOTE(review): presumably manubot yields one item per resolvable ID; an
    # empty result would raise IndexError on the line below - confirm.
    csl_items = Citations([id]).get_csl_items()
    return csl_items[0]

In [None]:
# Retrieve the csljson from manubot
keys = list(id_dict)

# One CSL JSON record per found ID, in id_dict order.
cites = []
for key in keys:
    cache_key = f"csljson-{key}"

    # Read the cache once. A get() followed by cache[cache_key] is a second
    # lookup that can raise KeyError if the entry expires in between.
    csljson = cache.get(cache_key)
    if not csljson:
        csljson = fetch_csljson(key)
        cache.set(cache_key, csljson, expire=CACHE_EXPIRE_SECONDS)

    cites.append(csljson)

In [None]:
# I'm going to want to sort these later.
# Attach each CSL JSON record, its title and (when available) its issued date
# to the matching id_dict entry.
for rec in cites:
    key = rec["PMID"]

    id_dict[key]["csljson"] = rec
    id_dict[key]["title"] = rec["title"].strip()

    # "issued" holds "date-parts": a list whose first entry is
    # [year], [year, month] or [year, month, day].
    issued = rec.get("issued")
    if not issued:
        continue

    date_parts_list = issued.get("date-parts")
    if not date_parts_list or not date_parts_list[0]:
        # no usable date parts: leave issued_date unset instead of failing
        continue

    # Join whichever of year/month/day are present, e.g. "2021", "2021/6"
    # or "2021/6/15". Slicing replaces the former nested bare `except: pass`
    # blocks that probed for each part.
    id_dict[key]["issued_date"] = "/".join(
        str(part) for part in date_parts_list[0][:3]
    )

In [None]:
# Turn the per-ID records into a DataFrame (one row per ID), tag every row
# with the searched year and month, and order the rows by title.
# The filter date in the search isn't necessarily represented in the issue
# date in the CSLJSON, hence the explicit year/month columns.
df = (
    pd.DataFrame.from_dict(id_dict, orient="index")
    .assign(year=THIS_YEAR, month=THIS_MONTH)
    .sort_values(by="title")
)
df

In [None]:
@sleep_and_retry
@limits(calls=NCBI_RATE_LIMIT, period=60)
def get_markdown(id: str) -> Dict[str, str]:
    """
    Produce the markdown-formatted citation for a single ID.

    Runs `manubot cite` as a subprocess with the local
    manubot-style-title-case.csl style. Manubot is also calling on NCBI, so
    rate-limiting applies.

    Args:
        id: The ID to cite.

    Returns:
        A dict with the keys "id" (the ID that was passed in) and "cite"
        (the standard output of `manubot cite`).

    Raises:
        AssertionError: if `manubot cite` exits with a non-zero status; the
            exception carries the command's stderr.
    """
    # Absolute path to the CSL style, with forward slashes so Windows paths
    # are passed in the same form.
    csl_path = os.path.abspath("manubot-style-title-case.csl").replace("\\", "/")

    logging.info(f"Creating cite reference for {id}")

    # Use manubot-cite to fetch the formatted citation
    args = [
        "manubot",
        "cite",
        "--format=markdown",
        f"--csl={csl_path}",
        id,
    ]

    process = subprocess.run(
        args=args,
        encoding="utf-8",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if process.returncode != 0:
        # Raised explicitly rather than through `assert`, which is stripped
        # under `python -O`. The exception type and payload stay the same.
        raise AssertionError(process.stderr)

    return {
        "id": id,
        "cite": process.stdout,
    }

In [None]:
# Collect one formatted citation per ID, in the (title-sorted) order of df.
# The IDs are passed to get_markdown exactly as they appear in the index.
# The former "pubmed:"-prefixed `ids` list was never used and has been dropped.
my_list = []
for pmid in df.index.to_list():
    cache_key = f"md-{pmid}"

    # Read the cache once. A get() followed by cache[cache_key] is a second
    # lookup that can raise KeyError if the entry expires in between.
    md = cache.get(cache_key)
    if not md:
        md = get_markdown(pmid)
        cache.set(cache_key, md, expire=CACHE_EXPIRE_SECONDS)

    my_list.append(md)

Build up the markdown

In [None]:
# Absolute path of the author-count chart that the markdown embeds.
image_path = os.path.abspath(os.path.join(BUILD_FOLDER, BUILD_AUTHOR_COUNT_HBAR_CHART))

# Windows paths. Switch the slashes over if windows
image_path = image_path.replace("\\", "/")

# Write the report: title, the chart, then the numbered citation list.
with open(
    os.path.join(BUILD_FOLDER, BUILD_MARKDOWN_FILENAME), "w", encoding="utf-8"
) as f:
    f.write(f"# Center for Health AI, Published Items • _{MONTH_NAME} {THIS_YEAR}_\n\n")

    f.write("## Author Item Counts\n\n")
    f.write(
        f'![Horizontal bar chart showing author names and their citation count.]({image_path} "Author Citation Count")\n\n'
    )

    f.write("## Published Items Citations\n\n")

    # In the custom CSL, I don't include the citation number.
    # This is just a numbered list now.
    for number, rec in enumerate(my_list, start=1):
        f.write(f"{number}. {rec['cite']}\n")

In [None]:
# Convert the markdown report to Microsoft Word with pandoc.
markdown_path = os.path.join(BUILD_FOLDER, BUILD_MARKDOWN_FILENAME)
# Swap only the file extension. str.replace(".md", ".docx") would also
# rewrite a ".md" occurring elsewhere in the name.
docx_path = os.path.splitext(markdown_path)[0] + ".docx"

args = [
    "pandoc",
    "-s",
    markdown_path,
    "-o",
    docx_path,
]

process = subprocess.run(
    args=args,
    encoding="utf-8",
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# Fail loudly and include pandoc's own error output. A bare `assert` would
# hide stderr and is skipped entirely under `python -O`.
if process.returncode != 0:
    raise RuntimeError(
        f"pandoc exited with status {process.returncode}: {process.stderr}"
    )

In [None]:
# Finished with the disk cache opened at the top of the notebook; close it.
cache.close()