# BRCA Exchange Literature Search Export

Ingest artifacts from the literature-search pipeline, wrangle them, and generate `literature.json`.

In [2]:
import os
import sys
import datetime
import json
import sqlite3
import pprint
import pandas as pd

# Work from the crawl directory so the relative paths used by the cells below
# (text/articles.db, mentions-matched.tsv, pubs-date.txt, literature.json)
# resolve. Set PUBMUNCH_CRAWL_DIR to run outside the default container layout;
# when it is unset the original hard-coded location is used.
os.chdir(os.path.expanduser(
    os.environ.get("PUBMUNCH_CRAWL_DIR", "/home/jovyan/data/pubmunch/crawl/")))

## Ingest
Ingest artifacts from the various stages of the pipeline, keyed by normalized genomic HGVS and by PMID (PMIDs are cast to strings on load so the tables can be matched against each other).

In [3]:
# Open the articles database read-only (URI mode=ro) and load every row.
connection = sqlite3.connect("file:text/articles.db?mode=ro", uri=True)
try:
    articles = pd.read_sql_query("SELECT * FROM articles", connection)
finally:
    # Using a sqlite3 connection as a context manager only manages the
    # transaction, it does not close the handle, so close it explicitly.
    connection.close()

# Store pmid as str; the mentions table is cast the same way before the two
# are matched on pmid.
articles["pmid"] = articles["pmid"].astype(str)
print("{} articles loaded from the articles sqlite database".format(articles.shape[0]))
articles.head()

160 articles loaded from the articles sqlite database


Unnamed: 0,articleId,externalId,source,publisher,origFile,journal,printIssn,eIssn,journalUniqueId,year,...,issue,page,pmid,pmcId,doi,fulltextUrl,time,offset,size,chunkId
0,5018824701,PMID18824701,,download,,Journal of clinical oncology : official journa...,0732-183X,1527-7755,8309333,2008,...,33,5393,18824701,2651073.0,10.1200/JCO.2008.17.8228,,2019-03-19T20:57:21+0000,78,127342,0_00000
1,5012473589,PMID12473589,,download,,Clinical cancer research : an official journal...,1078-0432,1078-0432,9502500,2002,...,12,3776,12473589,,,,2019-03-19T20:57:51+0000,1155332769,73030,0_00000
2,5021673748,PMID21673748,,download,,European journal of human genetics : EJHG,1018-4813,1476-5438,9302235,2011,...,10,1052,21673748,3190263.0,10.1038/ejhg.2011.100,,2019-03-19T20:58:12+0000,4038201791,88784,0_00000
3,5017493881,PMID17493881,,download,,Biochimica et biophysica acta,0006-3002,0006-3002,217513,2007,...,6,772,17493881,,10.1016/j.bbapap.2007.03.018,,2019-03-19T20:58:19+0000,5509507500,4372,0_00000
4,5009971877,PMID9971877,,download,,Human molecular genetics,0964-6906,0964-6906,9208958,1999,...,3,413,9971877,,10.1002/(sici)1097-0215(19980729)77:3<354::aid...,,2019-03-19T20:58:25+0000,5509510743,60799,0_00000


In [4]:
# Load the matched mentions table and store pmid as str.
mentions = pd.read_csv("mentions-matched.tsv", sep="\t", encoding="utf-8")
mentions["pmid"] = mentions["pmid"].astype(str)
print("Total matched mentions: {}".format(len(mentions)))

# A row is a duplicate when variant, paper and snippet text all repeat.
duplicate_key = ["pyhgvs_Genomic_Coordinate_38", "pmid", "snippets"]
mentions = mentions.drop_duplicates(subset=duplicate_key)
print("After dropping duplicates: {}".format(len(mentions)))
mentions.head()

Total matched mentions: 10426
After dropping duplicates: 5671


Unnamed: 0,pyhgvs_Genomic_Coordinate_38,pmid,snippets,points
0,chr17:g.43092632:T>A,10373534,"1051-1863, using a Clontech transformer site-d...",10
1,chr17:g.43092632:T>A,10373534,"1051-1863, using a Clontech transformer site-...",10
2,chr17:g.43076488:C>A,10373534,A mutation in close proximity has been compile...,10
6,chr17:g.43074517:A>C,10373534,Mutants<<< S1497A>>> and S1497T were generated...,10
7,chr17:g.43074517:A>C,10373534,Mutants<<< S1497A>>> and S1497T were generated...,10


In [5]:
def top_papers(mentions, pyhgvs):
    """
    Rank the papers that mention a variant by their total points.

    Args:
        mentions: DataFrame with at least the columns
            pyhgvs_Genomic_Coordinate_38, pmid and points.
        pyhgvs: value of pyhgvs_Genomic_Coordinate_38 to select.

    Returns:
        2-D array with one [pmid, points] row per paper, where points is the
        sum over that paper's rows for this variant, sorted by points in
        descending order. Papers with equal points keep the pmid order
        produced by the groupby. The array has zero rows when nothing matches.
    """
    matching = mentions[mentions.pyhgvs_Genomic_Coordinate_38 == pyhgvs]
    # The frame is already restricted to a single variant, so grouping on
    # pmid alone is sufficient. The native .sum() avoids handing the builtin
    # `sum` to .agg(), which triggers a FutureWarning on recent pandas.
    totals = matching.groupby("pmid")["points"].sum()
    # A stable sort keeps the order of tied papers deterministic between runs.
    ranked = totals.sort_values(ascending=False, kind="mergesort")
    return ranked.reset_index()[["pmid", "points"]].values

# Spot check: ranked (pmid, points) rows for a single variant.
top_papers(mentions, "chr13:g.32363367:C>G")

array([['22962691', 40],
       ['18424508', 30],
       ['20215541', 30],
       ['18451181', 20],
       ['18607349', 20],
       ['20507642', 20],
       ['20522429', 20],
       ['12145750', 10],
       ['16792514', 10],
       ['17899372', 10],
       ['17924331', 10],
       ['12915465', 2]], dtype=object)

In [6]:
def top_snippets(mentions, pyhgvs, pmid, limit=3):
    """
    Return the top snippets for one variant in one paper, best rows first.

    Args:
        mentions: DataFrame with at least the columns
            pyhgvs_Genomic_Coordinate_38, pmid, snippets and points.
        pyhgvs: value of pyhgvs_Genomic_Coordinate_38 to select.
        pmid: paper id to select; compared with == against the pmid column.
        limit: maximum number of snippets to return (default 3).

    Returns:
        List of at most `limit` snippet strings. Rows are visited in
        descending points order and each row's snippets value is split on "|".
        Rows whose snippets value is missing are skipped.
    """
    paper = mentions[(mentions.pyhgvs_Genomic_Coordinate_38 == pyhgvs) & (mentions.pmid == pmid)]
    # Stable sort so rows with equal points keep their table order.
    ranked = paper.sort_values("points", ascending=False, kind="mergesort")
    # A missing snippets cell is a float NaN and has no .split(); drop it.
    cells = ranked.snippets.dropna()
    return [snippet
            for snippets in cells
            for snippet in snippets.split("|")][:limit]

# Spot check: top snippets for one variant/paper pair.
top_snippets(mentions, "chr13:g.32363367:C>G", '18424508')

['BRCA2<<< T2722R>>> is a deleterious allele that causes exon skipping.',
 'Of these, the exonic variant BRCA2 c.8162T→C in exon 18 affects a position only three nucleotides upstream of the mutation BRCA2 c.8165C→G (predicting<<< p.T2722R>>>), which is known to cause exon skipping by disrupting several ESE sites.20  Two variants, BRCA2 c.316+5G→C (IVS3+5G→C) and c.7805G→C, at the last base of exon 16, induced strong effects on splicing (figs 2A,B and 2C,D, respectively).',
 'Of these, the exonic variant BRCA2 c.8162TRC in exon 18 affects a position only three nucleotides upstream of the mutation BRCA2<<< c.8165CRG>>> (predicting p.T2722R), which is known to cause exon skipping by disrupting several ESE sites.20 Two variants, BRCA2 c.316+5GRC (IVS3+5GRC) and c.7805']

In [7]:
def top_papers_and_snippets(mentions, pyhgvs):
    """
    Build the export records for one variant.

    Returns a list with one dict per paper from top_papers(), in the same
    order, holding the paper's pmid (str), its points (int) and the snippets
    from top_snippets() under the key "mentions".
    """
    records = []
    for paper_id, score in top_papers(mentions, pyhgvs):
        records.append({
            "pmid": str(paper_id),
            "points": int(score),
            "mentions": top_snippets(mentions, pyhgvs, paper_id),
        })
    return records

# for pyhgvs in ["chr13:g.32363367:C>G", "chr17:g.43124027:ACT>A", "chr13:g.32340526:AT>A"]:
#     print(pyhgvs)
#     pprint.pprint(top_papers_and_snippets(mentions, pyhgvs)[:3])

## Export

In [8]:
variants = {}

# Split the table by variant once instead of re-scanning every row of
# `mentions` for each variant. Each slice holds exactly the rows for its
# variant, so the helpers return the same result as on the full table.
# sort=False keeps variants in order of first appearance.
by_variant = mentions.groupby("pyhgvs_Genomic_Coordinate_38", sort=False)

remaining = by_variant.ngroups
print(datetime.datetime.now())
for pyhgvs, variant_mentions in by_variant:
    # "\r" rewrites the same output line so the counter updates in place.
    sys.stdout.write("Remaining: {}\r".format(remaining))
    sys.stdout.flush()
    variants[pyhgvs] = top_papers_and_snippets(variant_mentions, pyhgvs)
    remaining -= 1

2019-03-19 23:06:08.507579
Remaining: 1000

In [9]:
# Read the date stamp; the context manager closes the file handle promptly.
with open("pubs-date.txt") as date_file:
    pubs_date = date_file.read().strip()

# Keep only articles that have at least one mention, keyed by pmid.
papers = articles[articles.pmid.isin(mentions.pmid)] \
    .set_index("pmid", drop=False) \
    .to_dict(orient="index")
# Missing fields come out of pandas as NaN, which json would write as the bare
# token NaN. That is not valid JSON, so export them as null instead.
papers = {
    pmid: {field: (None if pd.isna(value) else value) for field, value in record.items()}
    for pmid, record in papers.items()
}

lit = {
    "date": pubs_date,
    "papers": papers,
    "variants": variants
}

# Serialize before opening the output so a serialization error cannot leave a
# truncated literature.json behind. allow_nan=False enforces strict JSON.
serialized = json.dumps(lit, sort_keys=True, allow_nan=False)
with open("literature.json", "w") as output:
    output.write(serialized)

print("Exported {} variants in {} papers".format(
    len(lit["variants"]), len(lit["papers"])))

# Read the file back to confirm it parses and to report what was written.
with open("literature.json") as f:
    lit = json.load(f)
print("{} Papers and {} Variants exported".format(len(lit["papers"]), len(lit["variants"])))

Exported 1966 variants in 150 papers
150 Papers and 1966 Variants exported


In [10]:
# Inspect the exported entry for one variant, as read back from literature.json.
pprint.pprint(lit["variants"]["chr13:g.32363367:C>G"])

[{'mentions': [', in the BRCA2 gene, besides the exon 7 mutations described '
               'above, only one mutation in exon 3, c.231T>G (p.Thr77Thr),16 '
               'and two mutations in exon 18,<<< c.8165C>G>>> (p.Thr2722Arg)28 '
               'and c.7992T>A (p.Ile2664Ile),16 have been reported to induce '
               'exon skipping by altering splicing regulatory elements.',
               'BRCA2<<< T2722R>>> is a deleterious allele that causes exon '
               'skipping.',
               'CA2 gene, besides the exon 7 mutations described above, only '
               'one mutation in exon 3, c.231T>G (p.Thr77Thr),16 and two '
               'mutations in exon 18, c.8165C>G <<<(p.Thr2722Arg>>>)28 and '
               'c.7992T>A (p.Ile2664Ile),16 have been reported to induce exon '
               'skipping by altering splicing regulatory elements.'],
  'pmid': '22962691',
  'points': 40},
 {'mentions': ['BRCA2<<< T2722R>>> is a deleterious allele that causes exon '
     