In [10]:
import pandas as pd
import numpy as np
import random
import json
import pprint

In [2]:
from Bio import Entrez

In [3]:
from pymongo import MongoClient
from pprint import pprint

### Get Abstracts from Entrez API

In [4]:
# Contact address set on Bio.Entrez before any request is made.
# NOTE(review): this is a personal address committed to the notebook — consider
# loading it from an environment variable instead.
Entrez.email = 'camelliahilker@gmail.com'

# Accumulators filled by the extraction loop further down:
#   abstracts        -> one dict per article that has an abstract
#   without_abstract -> PMIDs of articles that have none
abstracts, without_abstract = [], []

In [5]:
# Load the PMID list as a plain Python list.
# `sep='\n'` is rejected by current pandas versions (the line terminator cannot
# double as the field separator), so rely on the default separator instead.
# Assumes the file holds exactly one ID per line with no commas — TODO confirm.
pmids_raw = pd.read_csv('pubmed_result.txt', header=None)[0].tolist()

#### The file holds over 15,000 IDs; take a random sample of 2,500

In [6]:
# Fixed seed so the same subset of PMIDs is drawn on every run.
SAMPLE_SEED = 42
SAMPLE_SIZE = 2500

random.seed(SAMPLE_SEED)
pmids = random.sample(pmids_raw, k=SAMPLE_SIZE)

In [7]:
# Fetch the sampled articles from PubMed in one request and parse the XML.
# retmode="xml" is the documented way to request PubMed XML, which is the
# format Entrez.read() parses.
handle = Entrez.efetch(db="pubmed", id=','.join(map(str, pmids)),
                       retmode="xml")
try:
    records = Entrez.read(handle)
finally:
    # Always release the network handle, even if parsing fails.
    handle.close()

In [18]:
# The notebook imports both the `pprint` module and the `pprint` function under
# the same name, so `pprint.pprint(...)` fails whenever the function import ran
# last (as it does on Restart & Run All). Bind the function explicitly here.
from pprint import pprint
pprint(records['PubmedArticle'][0])

{'MedlineCitation': {'Article': {'Abstract': {'AbstractText': ['Colonic epithelial cells are covered by thick inner and outer mucus layers. The inner mucus layer is free of commensal microbiota, which contributes to the maintenance of gut homeostasis. In the small intestine, molecules critical for prevention of bacterial invasion into epithelia such as Paneth-cell-derived anti-microbial peptides and regenerating islet-derived 3 (RegIII) family proteins have been identified. Although there are mucus layers providing physical barriers against the large number of microbiota present in the large intestine, the mechanisms that separate bacteria and colonic epithelia are not fully elucidated. Here we show that Ly6/PLAUR domain containing 8 (Lypd8) protein prevents flagellated microbiota invading the colonic epithelia in mice. Lypd8, selectively expressed in epithelial cells at the uppermost layer of the large intestinal gland, was secreted into the lumen and bound flagellated bacteria includ

In [8]:
def extract_abstract_record(pubmed_article):
    """Flatten one parsed PubMed article into the dict stored in `abstracts`.

    Parameters
    ----------
    pubmed_article : dict-like
        One element of ``records['PubmedArticle']``.

    Returns
    -------
    (pmid, record) : tuple
        ``pmid`` is the article's PMID as an int. ``record`` is a dict with the
        keys 'PMID', 'Abstract', 'Year' and 'Journal', or ``None`` when the
        article has no 'Abstract' entry.
    """
    citation = pubmed_article['MedlineCitation']
    article = citation['Article']
    pmid = int(str(citation['PMID']))

    if 'Abstract' not in article:
        return pmid, None

    # AbstractText is a list; structured abstracts carry one entry per section,
    # so join all of them instead of keeping only the first.
    abstract = ' '.join(str(section) for section in article['Abstract']['AbstractText'])

    # NOTE(review): DateCompleted is a citation-processing date, not necessarily
    # the publication year — confirm this is the intended 'Year'.
    # It can be absent; store None rather than aborting the whole loop.
    date_completed = citation.get('DateCompleted')
    year = int(str(date_completed['Year'])) if date_completed else None

    journal = str(article['Journal']['Title'])
    return pmid, {'PMID': pmid, 'Abstract': abstract, 'Year': year, 'Journal': journal}


# Start from empty lists so re-running this cell does not append duplicates.
abstracts = []
without_abstract = []
for pubmed_article in records['PubmedArticle']:
    pmid, record = extract_abstract_record(pubmed_article)
    if record is None:
        without_abstract.append(pmid)
    else:
        abstracts.append(record)

In [53]:
# Number of fetched articles that had no 'Abstract' entry.
len(without_abstract)

0

In [54]:
# Number of articles for which an abstract record was stored.
len(abstracts)

2500

In [55]:
# Peek at the first eleven extracted records.
abstracts[:11]

[{'PMID': 27027293,
  'Abstract': 'Colonic epithelial cells are covered by thick inner and outer mucus layers. The inner mucus layer is free of commensal microbiota, which contributes to the maintenance of gut homeostasis. In the small intestine, molecules critical for prevention of bacterial invasion into epithelia such as Paneth-cell-derived anti-microbial peptides and regenerating islet-derived 3 (RegIII) family proteins have been identified. Although there are mucus layers providing physical barriers against the large number of microbiota present in the large intestine, the mechanisms that separate bacteria and colonic epithelia are not fully elucidated. Here we show that Ly6/PLAUR domain containing 8 (Lypd8) protein prevents flagellated microbiota invading the colonic epithelia in mice. Lypd8, selectively expressed in epithelial cells at the uppermost layer of the large intestinal gland, was secreted into the lumen and bound flagellated bacteria including Proteus mirabilis. In the

In [56]:
# Persist the extracted records for the later cell that reloads abstracts.json.
# Mode "x" creates the file only if it does not exist yet, so an existing
# abstracts.json is never overwritten by re-running this cell.
try:
    with open("abstracts.json", "x") as abstracts_file:
        json.dump(abstracts, abstracts_file)
except FileExistsError:
    print("abstracts.json already exists; leaving it untouched")

### Select Abstracts By Year

Years kept: 2015–2019. 2014 is left out because it has too few articles.

In [57]:
def select_by_year(records, year):
    """Return the records whose 'Year' field equals `year`, in original order.

    Only the 'Year' key is compared. Scanning every value of the dict would
    also match a PMID that happens to equal the year and could add the same
    record more than once.
    """
    return [record for record in records if record['Year'] == year]


a_15 = select_by_year(abstracts, 2015)
a_16 = select_by_year(abstracts, 2016)
a_17 = select_by_year(abstracts, 2017)
a_18 = select_by_year(abstracts, 2018)
a_19 = select_by_year(abstracts, 2019)

In [58]:
# Report the size of each yearly subset and write it to its own JSON file.
yearly_outputs = [
    (2015, a_15, "a_15.json"),
    (2016, a_16, "a_16.json"),
    (2017, a_17, "a_17.json"),
    (2018, a_18, "a_18.json"),
    (2019, a_19, "a_19.json"),
]
for year, subset, filename in yearly_outputs:
    print('There are', len(subset), 'articles from', year)
    # The context manager closes the file, so the JSON is flushed to disk
    # instead of lingering in an unclosed handle.
    with open(filename, "w") as out_file:
        json.dump(subset, out_file)

There are 220 articles from 2015
There are 463 articles from 2016
There are 622 articles from 2017
There are 631 articles from 2018
There are 547 articles from 2019


### The "Journal" key above stores the full journal title, which may be the wrong field. Re-fetch the records and collect each journal's MEDLINE abbreviation (MedlineTA) instead:

In [31]:
# Collect the PMIDs of the records stored in abstracts.json.
# The context manager closes the file once it has been parsed.
with open("abstracts.json", "r") as abstracts_file:
    abstracts = json.load(abstracts_file)
ids = [record.get('PMID') for record in abstracts]

In [32]:
# Re-fetch the same articles so the journal abbreviation can be read from them.
# retmode="xml" is the documented way to request PubMed XML, which is the
# format Entrez.read() parses.
handle = Entrez.efetch(db="pubmed", id=','.join(map(str, ids)),
                       retmode="xml")
try:
    records = Entrez.read(handle)
finally:
    # Always release the network handle, even if parsing fails.
    handle.close()

In [33]:
# Pair every PMID with the MedlineTA value found under MedlineJournalInfo.
journal_fix = [
    {
        'PMID': int(str(entry['MedlineCitation']['PMID'])),
        'Abbreviation': str(entry['MedlineCitation']['MedlineJournalInfo']['MedlineTA']),
    }
    for entry in records['PubmedArticle']
]

In [34]:
# Flat list of the journal abbreviations, one per fetched article.
abbr = [entry.get('Abbreviation') for entry in journal_fix]

In [36]:
# Count the distinct journal abbreviations collected in `abbr`.
len(set(abbr))

927