# Published Items for the Center for Health AI

This takes a list of authors and searches for any items published this year, grabs the proper citation from manubot-cite, and creates a markdown and MS Word document.

This isn't a replacement for a proper data pipeline. Just a demo to see if the selection logic is sound.

2021/12/20 stephen.taylor@cuanschutz.edu First demo

In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os
import json
import logging
import manubot
import requests
import pandas as pd
import pandoc
import datetime

from manubot.cite.citations import Citations

from datetime import date
from ratelimit import limits, RateLimitException, sleep_and_retry

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

<IPython.core.display.Javascript object>

In [22]:
# These can be used as arguments via papermill
THIS_YEAR = 2021
BUILD_FOLDER = "_build"
API_KEY = ""

<IPython.core.display.Javascript object>

In [23]:
# We get faster NCBI rates if we use a key.
if API_KEY:
    NCBI_RATE_LIMIT = 10
else:
    NCBI_RATE_LIMIT = 3

NCBI_RATE_LIMIT

3

<IPython.core.display.Javascript object>

In [3]:
# will write out to a folder
if not os.path.exists(BUILD_FOLDER):
    os.makedirs(BUILD_FOLDER)

<IPython.core.display.Javascript object>

In [4]:
# Read in the authors and their specific search terms
authors_df = pd.read_json("authors.json", orient="index")
authors_df

Unnamed: 0,author_type,search_term
Aquilante C,contributor,(Aquilante C[Author])
Barnes K,contributor,(Barnes K[Author])
Boorgula M,contributor,(Boorgula M[Author])
Brooks I,contributor,(Brooks I[Author])
Campbell M,contributor,(Campbell M[Author])
Casey Greene,pi,(Casey Greene)
Claw K,contributor,(Claw K[Author])
Cohen M,contributor,(Cohen M[Author])
Coors M,contributor,(Coors M[Author])
Crooks K,contributor,(Crooks K[Author])


<IPython.core.display.Javascript object>

In [5]:
# for testing it's nice to take a sample
# authors_df = authors_df.sample(frac=0.10)

<IPython.core.display.Javascript object>

In [6]:
# convert into a dict
author_records = authors_df.to_dict("index")
author_records

{'Aquilante C': {'author_type': 'contributor',
  'search_term': '(Aquilante C[Author])'},
 'Barnes K': {'author_type': 'contributor',
  'search_term': '(Barnes K[Author])'},
 'Boorgula M': {'author_type': 'contributor',
  'search_term': '(Boorgula M[Author])'},
 'Brooks I': {'author_type': 'contributor',
  'search_term': '(Brooks I[Author])'},
 'Campbell M': {'author_type': 'contributor',
  'search_term': '(Campbell M[Author])'},
 'Casey Greene': {'author_type': 'pi', 'search_term': '(Casey Greene)'},
 'Claw K': {'author_type': 'contributor', 'search_term': '(Claw K[Author])'},
 'Cohen M': {'author_type': 'contributor', 'search_term': '(Cohen M[Author])'},
 'Coors M': {'author_type': 'contributor', 'search_term': '(Coors M[Author])'},
 'Crooks K': {'author_type': 'contributor',
  'search_term': '(Crooks K[Author])'},
 'Fingerlin T': {'author_type': 'contributor',
  'search_term': '(Fingerlin T[Author])'},
 'Fishbein L': {'author_type': 'contributor',
  'search_term': '(Fishbein L[Autho

<IPython.core.display.Javascript object>

In [7]:
# add a date to restrict to this year
for k, v in author_records.items():
    # Restrict contributors to CU items
    if v["author_type"] == "contributor":
        v[
            "search_term"
        ] = f"({v['search_term']} AND ((\"University of Colorado\") OR (\"Colorado Center for Personalized Medicine\")))"

    # filter by date
    v[
        "search_term"
    ] = f"""("{THIS_YEAR}/01/01"[PubDate] : "3000"[PubDate]) AND {v["search_term"]})"""
author_records

{'Aquilante C': {'author_type': 'contributor',
  'search_term': '("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Aquilante C[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))'},
 'Barnes K': {'author_type': 'contributor',
  'search_term': '("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Barnes K[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))'},
 'Boorgula M': {'author_type': 'contributor',
  'search_term': '("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Boorgula M[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))'},
 'Brooks I': {'author_type': 'contributor',
  'search_term': '("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Brooks I[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))'},
 'Campbell M': {'author_type': 'contributor',
  'search_term': '("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Campbell M[Author]

<IPython.core.display.Javascript object>

In [8]:
@sleep_and_retry
@limits(calls=NCBI_RATE_LIMIT, period=60)
def lookup_ncbi(term, year, api_key=None):
    # look up IDs a chunk at a time
    # NCBI asks that we use an API key... I've used mine in the past, but
    # pulled this out for source.
    ids = []

    d = date(year, 1, 1)
    delta = date.today() - d

    params = {
        "term": term,
        "format": "pmid",
        "db": "pmc",
        "tool": "CUAnschutz-Center_for_Health_AI-DEV",
        "email": "Stephen.Taylor@cuanschutz.edu",
        "format": "json",
        "retmax": 100,
        "retstart": 0,
        "reldate": delta.days,
    }

    if api_key:
        params["api_key"] = api_key

    # page through the results until there are no more ids
    while True:
        r = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", params
        )
        if r.status_code == 200:
            result = r.json()["esearchresult"]
        else:
            logging.error(f"NCBI returned a status code of {r.status_code}.")
            break

        if len(result["idlist"]) == 0:
            # no more IDs
            break
        else:
            # append the IDs to the results...
            ids = ids + result["idlist"]
            # and move the start chunk up by the size of retmax
            params["retstart"] += params["retmax"]

    return ids


# quick test
# lookup_ncbi(author_records["Gregory Way"]["search_term"], THIS_YEAR)

<IPython.core.display.Javascript object>

In [9]:
# I would like to do this in parallel, but the deal with NCBI is we agree not to do that
id_dict = {}
for author, v in author_records.items():
    logging.info(f"Looking up `{author}` using {v['search_term']}")
    ids = lookup_ncbi(
        v["search_term"], THIS_YEAR, api_key="6f104848ae7ff47f67a69b6b0df250392608"
    )

    for id in ids:
        # adding the PMC here to make it more usable against the manubot cites later
        pmc_id = f"PMC{id}"
        if not id_dict.get(pmc_id):
            # create an empty nested dict
            id_dict[pmc_id] = {"authors": []}
        id_dict[pmc_id]["authors"].append(author)

id_dict

INFO:root:Looking up `Aquilante C` using ("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Aquilante C[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))
INFO:root:Looking up `Barnes K` using ("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Barnes K[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))
INFO:root:Looking up `Boorgula M` using ("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Boorgula M[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))
INFO:root:Looking up `Brooks I` using ("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Brooks I[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))
INFO:root:Looking up `Campbell M` using ("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Campbell M[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))
INFO:root:Looking up `Casey Greene` using ("20

INFO:root:Looking up `Yang I` using ("2021/01/01"[PubDate] : "3000"[PubDate]) AND ((Yang I[Author]) AND (("University of Colorado") OR ("Colorado Center for Personalized Medicine"))))


{'PMC8504805': {'authors': ['Aquilante C', 'Gignoux C', 'Shortt J']},
 'PMC8047196': {'authors': ['Aquilante C']},
 'PMC8485147': {'authors': ['Barnes K', 'Mathias R']},
 'PMC8095117': {'authors': ['Barnes K', 'Campbell M', 'Crooks K']},
 'PMC7875770': {'authors': ['Barnes K',
   'Fingerlin T',
   'Lange L',
   'Mathias R']},
 'PMC7817517': {'authors': ['Brooks I']},
 'PMC8082311': {'authors': ['Campbell M']},
 'PMC8613500': {'authors': ['Casey Greene',
   'Melissa Haendel',
   'Tellen Bennett']},
 'PMC8562484': {'authors': ['Casey Greene']},
 'PMC8547481': {'authors': ['Casey Greene']},
 'PMC8525744': {'authors': ['Casey Greene', 'Gregory Way']},
 'PMC8458035': {'authors': ['Casey Greene']},
 'PMC8452106': {'authors': ['Casey Greene']},
 'PMC8415599': {'authors': ['Casey Greene']},
 'PMC8385897': {'authors': ['Casey Greene']},
 'PMC8340473': {'authors': ['Casey Greene']},
 'PMC8269209': {'authors': ['Casey Greene']},
 'PMC8260469': {'authors': ['Casey Greene', 'Taylor M']},
 'PMC81680

<IPython.core.display.Javascript object>

How many items found?

In [10]:
len(id_dict)

150

<IPython.core.display.Javascript object>

In [11]:
# Retrive the references from manubot
pmc_keys = [f"pmc:{key}" for key in id_dict]
citations = Citations(pmc_keys)
cites = citations.get_csl_items()
cites

[{'source': 'PubMed',
  'accessed': {'date-parts': [[2021, 12, 22]]},
  'id': '98nO0bph',
  'title': 'The trans-ancestral genomic architecture of glycemic traits',
  'author': [{'family': 'Chen', 'given': 'Ji'},
   {'family': 'Spracklen', 'given': 'Cassandra N'},
   {'family': 'Marenne', 'given': 'Gaëlle'},
   {'family': 'Varshney', 'given': 'Arushi'},
   {'family': 'Corbin', 'given': 'Laura J'},
   {'family': 'Luan', 'given': "Jian'an"},
   {'family': 'Willems', 'given': 'Sara M'},
   {'family': 'Wu', 'given': 'Ying'},
   {'family': 'Zhang', 'given': 'Xiaoshuai'},
   {'family': 'Horikoshi', 'given': 'Momoko'},
   {'family': 'Boutin', 'given': 'Thibaud S'},
   {'family': 'Mägi', 'given': 'Reedik'},
   {'family': 'Waage', 'given': 'Johannes'},
   {'family': 'Li-Gao', 'given': 'Ruifang'},
   {'family': 'Chan', 'given': 'Kei Hang Katie'},
   {'family': 'Yao', 'given': 'Jie'},
   {'family': 'Anasanti', 'given': 'Mila D'},
   {'family': 'Chu', 'given': 'Audrey Y'},
   {'family': 'Claringbou

<IPython.core.display.Javascript object>

In [12]:
# I'm going to want to sort these later.
for rec in cites:
    key = rec["PMCID"]

    id_dict[key]["csljson"] = rec
    id_dict[key]["title"] = rec["title"]

    # all this for the date!
    issued_date_parts = rec["issued"]["date-parts"][0]
    date_str = str(issued_date_parts[0])
    try:
        date_str += f"/{issued_date_parts[1]}"
        try:
            date_str += f"/{issued_date_parts[2]}"
        except:
            pass
    except:
        pass

    id_dict[key]["issued_date"] = date_str

id_dict

{'PMC8504805': {'authors': ['Aquilante C', 'Gignoux C', 'Shortt J'],
  'csljson': {'source': 'PubMed',
   'accessed': {'date-parts': [[2021, 12, 22]]},
   'id': '17LkR5S2s',
   'title': 'Applicability of ancestral genotyping in pharmacogenomic research with hormonal contraception',
   'author': [{'family': 'Lazorwitz', 'given': 'Aaron'},
    {'family': 'Aquilante', 'given': 'Christina L'},
    {'family': 'Shortt', 'given': 'Jonathan A'},
    {'family': 'Sheeder', 'given': 'Jeanelle'},
    {'family': 'Teal', 'given': 'Stephanie'},
    {'family': 'Gignoux', 'given': 'Christopher R'}],
   'container-title-short': 'Clin Transl Sci',
   'container-title': 'Clinical and translational science',
   'publisher': 'John Wiley and Sons Inc.',
   'ISSN': '1752-8054',
   'issued': {'date-parts': [[2021, 9]]},
   'page': '1713-1718',
   'volume': '14',
   'issue': '5',
   'PMID': '33650294',
   'PMCID': 'PMC8504805',
   'DOI': '10.1111/cts.13014',
   'type': 'article-journal',
   'URL': 'https://www.

<IPython.core.display.Javascript object>

In [13]:
# sort the dictionary by title
df = pd.DataFrame.from_dict(id_dict, orient="index")
df.sort_values(by="title", inplace=True)
df

Unnamed: 0,authors,csljson,title,issued_date
PMC8399445,"[Johnson RK, Kechris K, Yang I]","{'source': 'PubMed', 'accessed': {'date-parts'...",A Mediation Approach to Discovering Causal Rel...,2021/8/14
PMC7859930,[Kechris K],"{'source': 'PubMed', 'accessed': {'date-parts'...",A Practical Guide to Metabolomics Software Dev...,2021/2/2
PMC8485147,"[Barnes K, Mathias R]","{'source': 'PubMed', 'accessed': {'date-parts'...",A System for Phenotype Harmonization in the Na...,2021/10/1
PMC8525744,"[Casey Greene, Gregory Way]","{'source': 'PubMed', 'accessed': {'date-parts'...",A field guide to cultivating computational bio...,2021/10/7
PMC8492620,[Ghosh D],"{'source': 'PubMed', 'accessed': {'date-parts'...",A novel approach to understanding Parkinsonian...,2021/10/5
...,...,...,...,...
PMC8206199,"[Lange L, Mathias R]","{'source': 'PubMed', 'accessed': {'date-parts'...",Whole-genome sequencing association analysis o...,2021/5/6
PMC8095117,"[Barnes K, Campbell M, Crooks K]","{'source': 'PubMed', 'accessed': {'date-parts'...",Zika Virus Congenital Syndrome and <i>MTOR</i>...,2021/4/22
PMC7822150,[Taylor M],"{'source': 'PubMed', 'accessed': {'date-parts'...",cAMP-Independent Activation of the Unfolded Pr...,2021/1/19
PMC8415599,[Casey Greene],"{'source': 'PubMed', 'accessed': {'date-parts'...",miQC: An adaptive probabilistic framework for ...,2021/8/24


<IPython.core.display.Javascript object>

In [14]:
# dump out the csljson for pandoc
csljson = []
for row, columns in df.iterrows():
    csljson.append(columns["csljson"])
csljson

[{'source': 'PubMed',
  'accessed': {'date-parts': [[2021, 12, 22]]},
  'id': 'prtwvW2e',
  'title': 'A Mediation Approach to Discovering Causal Relationships between the Metabolome and DNA Methylation in Type 1 Diabetes',
  'author': [{'family': 'Vigers', 'given': 'Tim'},
   {'family': 'Vanderlinden', 'given': 'Lauren A'},
   {'family': 'Johnson', 'given': 'Randi K'},
   {'family': 'Carry', 'given': 'Patrick M'},
   {'family': 'Yang', 'given': 'Ivana'},
   {'family': 'DeFelice', 'given': 'Brian C'},
   {'family': 'Kaizer', 'given': 'Alexander M'},
   {'family': 'Pyle', 'given': 'Laura'},
   {'family': 'Rewers', 'given': 'Marian'},
   {'family': 'Fiehn', 'given': 'Oliver'},
   {'family': 'Norris', 'given': 'Jill M'},
   {'family': 'Kechris', 'given': 'Katerina'}],
  'container-title-short': 'Metabolites',
  'container-title': 'Metabolites',
  'publisher': 'MDPI',
  'ISSN': '2218-1989',
  'issued': {'date-parts': [[2021, 8, 14]]},
  'page': '542',
  'volume': '11',
  'issue': '8',
  'PM

<IPython.core.display.Javascript object>

Build up the markdown

In [15]:
# read the csljson
current_path = os.getcwd()
doc = pandoc.read(
    source=json.dumps(csljson),
    format="csljson",
    options=[
        "--citeproc",
        "--csl=manubot-style.csl",
        f"--resource-path={current_path}",
    ],
)
doc

Pandoc(Meta({'csl': MetaString('manubot-style.csl'), 'nocite': MetaInlines([Cite([Citation('*', [], [], NormalCitation(), 0, 0)], [Str('[@*]')])]), 'references': MetaList([MetaMap({'DOI': MetaInlines([Str('10.3390/metabo11080542')]), 'ISSN': MetaInlines([Str('2218-1989')]), 'PMCID': MetaInlines([Str('PMC8399445')]), 'PMID': MetaInlines([Str('34436483')]), 'URL': MetaInlines([Str('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8399445/')]), 'accessed': MetaString('2021-12-22'), 'author': MetaList([MetaMap({'family': MetaString('Vigers'), 'given': MetaString('Tim')}), MetaMap({'family': MetaString('Vanderlinden'), 'given': MetaString('Lauren A')}), MetaMap({'family': MetaString('Johnson'), 'given': MetaString('Randi K')}), MetaMap({'family': MetaString('Carry'), 'given': MetaString('Patrick M')}), MetaMap({'family': MetaString('Yang'), 'given': MetaString('Ivana')}), MetaMap({'family': MetaString('DeFelice'), 'given': MetaString('Brian C')}), MetaMap({'family': MetaString('Kaizer'), 'given

<IPython.core.display.Javascript object>

Finally output the content to a markdown file

In [16]:
# write the markdown
with open(os.path.join(BUILD_FOLDER, "cites.md"), "w", encoding="utf-8") as f:
    _ = f.write(
        pandoc.write(
            doc,
            format="markdown",
            options=["--to=markdown_strict-raw_html", "--wrap=none", "--eol=lf"],
        )
    )

<IPython.core.display.Javascript object>

In [17]:
# Convert the markdown to docx
doc = pandoc.read(file=os.path.join(BUILD_FOLDER, "cites.md"), format="markdown")
with open(os.path.join(BUILD_FOLDER, "cites.docx"), "wb") as f:
    _ = f.write(pandoc.write(doc, format="docx"))

<IPython.core.display.Javascript object>