# BRCA Exchange Literature Search Export

Ingest artifacts from the pipeline, wrangle, and generate literature.json

In [23]:
import os
import sys
import datetime
import json
import sqlite3
import pprint
import pandas as pd

os.chdir(os.path.expanduser("/home/jovyan/data/pubmunch/crawl/"))

## Ingest
Ingest artifacts from various pieces of the pipeline indexed by normalized genomic hgvs and integer pmid 

In [2]:
connection = sqlite3.connect("file:text/articles.db?mode=ro", uri=True)
articles = pd.read_sql_query("SELECT * FROM articles", connection)
articles.pmid = articles.pmid.astype(str)
print("{} articles loaded from the articles sqlite database".format(articles.shape[0]))
articles.head()

14044 articles loaded from the articles sqlite database


Unnamed: 0,articleId,externalId,source,publisher,origFile,journal,printIssn,eIssn,journalUniqueId,year,...,issue,page,pmid,pmcId,doi,fulltextUrl,time,offset,size,chunkId
0,5011585672,PMID11585672,,download,,Trends in genetics : TIG,0168-9525,0168-9525,8507085,2001,...,10,S18,11585672,,10.1016/s0168-9525(01)02451-9,https://linkinghub.elsevier.com/retrieve/pii/S...,2018-11-14T16:29:09+0000,78,5258,0_00000
1,5019688261,PMID19688261,,download,,Breast cancer research and treatment,0167-6806,1573-7217,8111104,2010,...,3,575,19688261,,10.1007/s10549-009-0501-3,,2018-11-14T16:29:16+0000,5048,100825,0_00000
2,5012228710,PMID12228710,,download,,"Science (New York, N.Y.)",0036-8075,1095-9203,404511,2002,...,5588,1837,12228710,,10.1126/science.297.5588.1837,,2018-11-14T16:29:40+0000,1436851129,148950,0_00000
3,5029369605,PMID29369605,,download,,Genetika,0016-6758,0016-6758,47354,2016,...,10,1215,29369605,,10.1134/s102279541609012x,https://link.springer.com/article/10.1134%2FS1...,2018-11-14T16:30:43+0000,4458079378,22264,0_00000
4,5022084640,PMID22084640,,download,,Therapeutic advances in medical oncology,1758-8340,1758-8359,101510808,2011,...,6,257,22084640,3210467.0,10.1177/1758834011417039,,2018-11-14T16:30:58+0000,6079002966,93229,0_00000


In [3]:
mentions = pd.read_csv("mentions-matched.tsv", sep="\t", encoding="utf-8")
mentions.pmid = mentions.pmid.astype(str)
print("Total matched mentions: {}".format(mentions.shape[0]))
mentions = mentions.drop_duplicates(["pyhgvs_Genomic_Coordinate_38", "pmid", "snippets"])
print("After dropping duplicates: {}".format(mentions.shape[0]))
mentions.head()

Total matched mentions: 174932
After dropping duplicates: 82548


Unnamed: 0,pyhgvs_Genomic_Coordinate_38,pmid,snippets,points
0,chr17:g.43124027:ACT>A,10030809,Novel inherited mutations and variable express...,3
1,chr17:g.43124027:ACT>A,10030809,Novel inherited mutations and variable express...,3
2,chr17:g.43124027:ACT>A,10036974,umber of large series of young Ashkenzai Jewis...,3
3,chr17:g.43057059:T>TG,10036974,0n99 0n99 1n01 11n80 15n86 18n87 0n01 0n92 ...,3
4,chr17:g.43057062:T>TG,10036974,0n99 0n99 1n01 11n80 15n86 18n87 0n01 0n92 ...,3


In [4]:
def top_papers(mentions, pyhgvs):
    """ Return (pmid, points) tuple sorted by points against pyhgvs """
    top = mentions[mentions.pyhgvs_Genomic_Coordinate_38 == pyhgvs] \
        .groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"]) \
        .agg({"points": sum}) \
        .sort_values("points", ascending=False)
    return top.reset_index()[["pmid", "points"]].values

top_papers(mentions, "chr13:g.32363367:C>G")

array([['22962691', 40],
       ['21990134', 40],
       ['24323938', 40],
       ['25146914', 40],
       ['20215541', 30],
       ['18424508', 30],
       ['30039884', 20],
       ['21990165', 20],
       ['23586058', 20],
       ['20690207', 20],
       ['20522429', 20],
       ['20507642', 20],
       ['19471317', 20],
       ['18607349', 20],
       ['18451181', 20],
       ['18273839', 20],
       ['26913838', 20],
       ['27060066', 20],
       ['28324225', 20],
       ['15026808', 20],
       ['29884841', 20],
       ['29988080', 20],
       ['23108138', 20],
       ['28339459', 15],
       ['25003164', 10],
       ['24817641', 10],
       ['24122022', 10],
       ['10464631', 10],
       ['22753008', 10],
       ['21735045', 10],
       ['21638052', 10],
       ['12145750', 10],
       ['21309043', 10],
       ['17924331', 10],
       ['17899372', 10],
       ['16792514', 10],
       ['15744044', 10],
       ['12845657', 10],
       ['21344236', 10],
       ['19332451', 2],
 

In [5]:
def top_snippets(mentions, pyhgvs, pmid):
    """ Return list of top snippets for this variant and paper by points """
    paper = mentions[(mentions.pyhgvs_Genomic_Coordinate_38 == pyhgvs) & (mentions.pmid == pmid)]
    return [snippet 
     for snippets in paper.sort_values("points", ascending=False).snippets.values 
     for snippet in snippets.split("|")][:3]

top_snippets(mentions, "chr13:g.32363367:C>G", '18424508')

['BRCA2<<< T2722R>>> is a deleterious allele that causes exon skipping.',
 'Of these, the exonic variant BRCA2 c.8162T→C in exon 18 affects a position only three nucleotides upstream of the mutation BRCA2 c.8165C→G (predicting<<< p.T2722R>>>), which is known to cause exon skipping by disrupting several ESE sites.20  Two variants, BRCA2 c.316+5G→C (IVS3+5G→C) and c.7805G→C, at the last base of exon 16, induced strong effects on splicing (figs 2A,B and 2C,D, respectively).',
 'Of these, the exonic variant BRCA2 c.8162TRC in exon 18 affects a position only three nucleotides upstream of the mutation BRCA2<<< c.8165CRG>>> (predicting p.T2722R), which is known to cause exon skipping by disrupting several ESE sites.20 Two variants, BRCA2 c.316+5GRC (IVS3+5GRC) and c.7805']

In [6]:
def top_papers_and_snippets(mentions, pyhgvs):
    return [{"pmid": str(p[0]), "points": int(p[1]), "snippets": top_snippets(mentions, pyhgvs, p[0])}
            for p in top_papers(mentions, pyhgvs)]

for pyhgvs in ["chr13:g.32363367:C>G", "chr17:g.43124027:ACT>A", "chr13:g.32340526:AT>A"]:
    print(pyhgvs)
    pprint.pprint(top_papers_and_snippets(mentions, pyhgvs)[:3])

chr13:g.32363367:C>G
[{'pmid': '22962691',
  'points': 40,
  'snippets': [', in the BRCA2 gene, besides the exon 7 mutations described '
               'above, only one mutation in exon 3, c.231T>G (p.Thr77Thr),16 '
               'and two mutations in exon 18,<<< c.8165C>G>>> (p.Thr2722Arg)28 '
               'and c.7992T>A (p.Ile2664Ile),16 have been reported to induce '
               'exon skipping by altering splicing regulatory elements.',
               'BRCA2<<< T2722R>>> is a deleterious allele that causes exon '
               'skipping.',
               'CA2 gene, besides the exon 7 mutations described above, only '
               'one mutation in exon 3, c.231T>G (p.Thr77Thr),16 and two '
               'mutations in exon 18, c.8165C>G <<<(p.Thr2722Arg>>>)28 and '
               'c.7992T>A (p.Ile2664Ile),16 have been reported to induce exon '
               'skipping by altering splicing regulatory elements.']},
 {'pmid': '21990134',
  'points': 40,
  'snippets': ['0.81  0.

## Export

In [22]:
variants = {}

remaining = mentions.pyhgvs_Genomic_Coordinate_38.unique().shape[0]
print(datetime.datetime.now())
for pyhgvs in mentions.pyhgvs_Genomic_Coordinate_38.unique():
    sys.stdout.write("Remaining: {}\r".format(remaining))
    sys.stdout.flush()
    variants[pyhgvs] = top_papers_and_snippets(mentions, pyhgvs)
    remaining -= 1

2019-02-27 22:58:23.671037
Remaining: 1000

In [27]:
lit = {
    "date": open("pubs-date.txt").read().strip(),
    "papers": articles[articles.pmid.isin(mentions.pmid)].set_index("pmid", drop=False).to_dict(orient="index"),
    "variants": variants
}

with open("literature.json", "w") as output:
    output.write(json.dumps(lit, sort_keys=True))
    
print("Exported {} variants in {} papers".format(
    len(lit["variants"].keys()), len(lit["papers"].keys())))

with open("literature.json") as f:
    lit = json.loads(f.read())
print("{} Papers and {} Variants exported".format(len(lit["papers"]), len(lit["variants"])))

Exported 9783 variants in 3615 papers
3615 Papers and 9783 Variants exported


In [28]:
pprint.pprint(lit["variants"]["chr13:g.32363367:C>G"])

[{'pmid': '22962691',
  'points': 40,
  'snippets': [', in the BRCA2 gene, besides the exon 7 mutations described '
               'above, only one mutation in exon 3, c.231T>G (p.Thr77Thr),16 '
               'and two mutations in exon 18,<<< c.8165C>G>>> (p.Thr2722Arg)28 '
               'and c.7992T>A (p.Ile2664Ile),16 have been reported to induce '
               'exon skipping by altering splicing regulatory elements.',
               'BRCA2<<< T2722R>>> is a deleterious allele that causes exon '
               'skipping.',
               'CA2 gene, besides the exon 7 mutations described above, only '
               'one mutation in exon 3, c.231T>G (p.Thr77Thr),16 and two '
               'mutations in exon 18, c.8165C>G <<<(p.Thr2722Arg>>>)28 and '
               'c.7992T>A (p.Ile2664Ile),16 have been reported to induce exon '
               'skipping by altering splicing regulatory elements.']},
 {'pmid': '21990134',
  'points': 40,
  'snippets': ['0.81  0.95  4   18  A2717S  p