# BRCA Exchange Literature Search Export

Ingest artifacts from the pipeline, wrangle, and generate literature.json

In [1]:
import os
import json
import sqlite3
import pandas as pd

# Work from the crawl output directory so that every relative path used
# below (database, TSV, date stamp, exported JSON) resolves against it.
CRAWL_DIR = "/home/jovyan/data/pubmunch/crawl/"
os.chdir(CRAWL_DIR)

## Ingest
Ingest artifacts from various pieces of the pipeline, indexed by normalized genomic HGVS and PMID (PMIDs are handled as strings throughout).

In [2]:
# Load the article metadata table from the crawler's sqlite database.
# The URI form with mode=ro opens the file read-only.
connection = sqlite3.connect("file:text/articles.db?mode=ro", uri=True)
try:
    articles = pd.read_sql_query("SELECT * FROM articles", connection)
finally:
    # Release the database handle even if the query fails; sqlite3
    # connections are not closed automatically when they go out of use.
    connection.close()

# Store pmid as text so it compares equal to the string pmids in the mentions table.
articles["pmid"] = articles["pmid"].astype(str)
print("{} articles loaded from the articles sqlite database".format(articles.shape[0]))
articles.head()

159 articles loaded from the articles sqlite database


Unnamed: 0,articleId,externalId,source,publisher,origFile,journal,printIssn,eIssn,journalUniqueId,year,...,issue,page,pmid,pmcId,doi,fulltextUrl,time,offset,size,chunkId
0,5019423647,PMID19423647,,download,,Annals of oncology : official journal of the E...,0923-7534,1569-8041,9007735,2009,...,6,1143,19423647,,10.1093/annonc/mdp241,,2019-02-21T00:07:31+0000,78,21596,0_00000
1,5009126734,PMID9126734,,download,,Nature,0028-0836,0028-0836,0410462,1997,...,6627,772,9126734,,10.1038/386772a0,,2019-02-21T00:07:58+0000,22305,19931,0_00000
2,5022889855,PMID22889855,,download,,Journal of medical genetics,0022-2593,1468-6244,2985087R,2012,...,8,525,22889855,3810416.0,10.1136/jmedgenet-2012-101037,,2019-02-21T00:08:35+0000,42718,118856,0_00000
3,5012473589,PMID12473589,,download,,Clinical cancer research : an official journal...,1078-0432,1078-0432,9502500,2002,...,12,3776,12473589,,,,2019-02-21T00:08:56+0000,2919658796,72920,0_00000
4,5018285836,PMID18285836,,download,,European journal of human genetics : EJHG,1018-4813,1018-4813,9302235,2008,...,7,820,18285836,3905962.0,10.1038/ejhg.2008.13,,2019-02-21T00:09:18+0000,4427189208,113381,0_00000


In [3]:
# Matched variant mentions, one row per (variant, paper, snippets) match.
# Every column is read as text, so pmid and score are strings from here on.
mentions = pd.read_csv(
    "mentions-matched.tsv",
    sep="\t",
    encoding="utf-8",
    dtype=str,
)
print("Total matched mentions: {}".format(len(mentions)))
mentions.head()

Total matched mentions: 16640


Unnamed: 0,pyhgvs_Genomic_Coordinate_38,pmid,snippets,score
0,chr17:g.43092632:T>A,10373534,"1051-1863, using a Clontech transformer site-d...",3
1,chr17:g.43092632:T>A,10373534,"1051-1863, using a Clontech transformer site-...",3
2,chr17:g.43076488:C>A,10373534,A mutation in close proximity has been compile...,2
3,chr17:g.43076488:C>A,10373534,A mutation in close proximity has been compile...,2
4,chr17:g.43063903:G>T,10426999,"n of the BRCT domain, as indicated. Two of the...",2


In [4]:
# Spot check: list any matched mentions for one specific paper.
mentions.loc[mentions["pmid"] == "20020529"]

Unnamed: 0,pyhgvs_Genomic_Coordinate_38,pmid,snippets,score


In [5]:
# Rows that agree on variant, paper and snippet text are exact repeats; keep the first of each.
dedup_keys = ["pyhgvs_Genomic_Coordinate_38", "pmid", "snippets"]

print("Initial # mentions", len(mentions))
pruned_mentions = mentions.drop_duplicates(subset=dedup_keys)
print("After dropping duplicates of pyhgvs_Genomic_Coordinate_38+pmid+snippets: {}".format(
    len(pruned_mentions)))

Initial # mentions 16640
After dropping duplicates of pyhgvs_Genomic_Coordinate_38+pmid+snippets: 12455


In [6]:
# A snippets cell may hold several phrases joined by "|". Give every phrase
# its own row, keeping at most the first 3 per mention because some cells
# contain far more (as many as 168).
has_multiple = pruned_mentions.snippets.str.contains("|", regex=False)
print("Unpacking {} of {} snippets with multiple phrases seaparated by '|'".format(
    len(pruned_mentions[has_multiple]), len(pruned_mentions)))

# Approach taken from:
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows/21032532

# Fresh 0..n-1 row labels so each mention row has a unique id to join back on
# (the previous labels are kept in an "index" column).
df = pruned_mentions.reset_index()


def _first_phrases(row):
    """Return the first (up to) 3 '|'-separated phrases of a row's snippets as a Series."""
    return pd.Series(row.snippets.split("|")[:3])


# One column per phrase, then stack into a single long Series; stack() drops
# the empty slots of rows that had fewer than 3 phrases.
stacked = df.apply(_first_phrases, axis=1).stack()

# Drop the phrase-position level so the labels match df's row ids again
stacked.index = stacked.index.droplevel(-1)
snippets = stacked.rename("snippets")

# Swap the packed "snippets" column for the one-phrase-per-row version
df = df.drop("snippets", axis=1)
joined = df.join(snippets)

# NOTE(review): de-duplication here is per variant + snippet text and does not
# include pmid, so identical text from two papers keeps only the first row.
exploded = joined.drop_duplicates(["pyhgvs_Genomic_Coordinate_38", "snippets"])
exploded = exploded.set_index("pyhgvs_Genomic_Coordinate_38", drop=True)
print("{} individual snippets after expanding and de-duplicating snippets".format(len(exploded)))

exploded.head()

Unpacking 5817 of 12455 snippets with multiple phrases seaparated by '|'
21328 individual snippets after expanding and de-duplicating snippets


Unnamed: 0_level_0,index,pmid,score,snippets
pyhgvs_Genomic_Coordinate_38,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr17:g.43092632:T>A,0,10373534,3,"1051-1863, using a Clontech transformer site-d..."
chr17:g.43092632:T>A,0,10373534,3,nd S1497T); mutants T967S and T967D were first...
chr17:g.43092632:T>A,0,10373534,3,"Mutagenic primers for S1497A, S1497T,<<< T967S..."
chr17:g.43092632:T>A,1,10373534,3,"1051-1863, using a Clontech transformer site-..."
chr17:g.43092632:T>A,1,10373534,3,and S1497T); mutants T967S and T967D were firs...


## Stats

In [7]:
# Snippet-row counts at three granularities; each is computed once and then
# queried for both the label of the largest group and its size.
rows_per_variant = exploded.groupby("pyhgvs_Genomic_Coordinate_38").size()
rows_per_variant_paper = exploded.groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"]).size()
rows_per_paper = exploded.groupby("pmid").size()

print("Max mentions for a single variant: {} {}".format(
    rows_per_variant.idxmax(), rows_per_variant.max()))

print("Max mentions for a single paper from one variant: {} {}".format(
    rows_per_variant_paper.idxmax(), rows_per_variant_paper.max()))

# NOTE(review): this is the number of snippet rows per pmid, not the number of
# distinct variants, so the label may overstate it - confirm the intended metric.
print("Paper referenced from the most variants: {} {}".format(
    rows_per_paper.idxmax(), rows_per_paper.max()))

Max mentions for a single variant: chr17:g.43063903:G>T 125
Max mentions for a single paper from one variant: ('chr13:g.32394673:T>C', '22144684') 13
Paper referenced from the most variants: 16267036 6368


In [8]:
# Distribution of snippet rows per variant.
variant_col = "pyhgvs_Genomic_Coordinate_38"
rows_by_variant = exploded.reset_index().groupby([variant_col])[variant_col].count()

print("Variant mention stats:")
print(rows_by_variant.describe())

Variant mention stats:
count    5174.000000
mean        4.122149
std         5.799585
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max       125.000000
Name: pyhgvs_Genomic_Coordinate_38, dtype: float64


In [9]:
# Distribution of snippet rows per paper.
rows_by_paper = exploded.reset_index().groupby(["pmid"])["pmid"].count()

print("Paper mentions stats:")
print(rows_by_paper.describe())

Paper mentions stats:
count     149.000000
mean      143.140940
std       630.832976
min         1.000000
25%         7.000000
50%        20.000000
75%        61.000000
max      6368.000000
Name: pmid, dtype: float64


## Export

In [10]:
# Collapse to one entry per (variant, pmid) pair whose value is the list of
# all snippets found for that variant in that paper.
snippets_by_variant_paper = exploded.groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"])["snippets"]
combined = snippets_by_variant_paper.apply(pd.Series.tolist)
print("Combined {} separate snippets down to {} after grouping by pmid".format(
    len(exploded), len(combined)))

Combined 21328 separate snippets down to 8802 after grouping by pmid


In [11]:
# NOTE: everything in this cell is commented-out scratch code; nothing here executes.
#
# Snippet 1: the "variants" dict comprehension (same text as used in the export cell).
#     "variants": {
#         k: {kk: vv[0] for kk, vv in v.unstack().transpose().iterrows()}
#         for k, v in combined.groupby("pyhgvs_Genomic_Coordinate_38")},

# Snippet 2: debugging loop that would print each (variant, pmid) group
# whose score values include "1".
# import itertools
# for k, v in exploded.groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"])[["pmid", "snippets", "score"]]:
#     if "1" in v.score.values:
#         print(k)
#         print(v.score.values)


In [12]:
# Assemble and write literature.json with three top-level keys:
#   "date"     - contents of pubs-date.txt, stripped of surrounding whitespace
#   "papers"   - article metadata keyed by pmid, for papers with at least one mention
#   "variants" - {variant: {pmid: [snippet, ...]}}

# Read the date stamp through a context manager so the file handle is closed.
with open("pubs-date.txt") as date_file:
    pubs_date = date_file.read().strip()


def _nan_to_none(value):
    """Return None for a float NaN and every other value unchanged.

    json would otherwise write NaN as the bare token ``NaN``, which is not
    valid JSON and is rejected by strict parsers; None is written as ``null``.
    """
    # NaN is the only float that is not equal to itself.
    return None if isinstance(value, float) and value != value else value


mentioned_articles = articles[articles.pmid.isin(pruned_mentions.pmid)]
paper_records = mentioned_articles.set_index("pmid", drop=False).to_dict(orient="index")
papers = {
    pmid: {field: _nan_to_none(value) for field, value in record.items()}
    for pmid, record in paper_records.items()
}

# combined is indexed by (variant, pmid) and holds the snippet list for each
# pair, so one pass over it builds the nested mapping directly.
variants = {}
for (variant, pmid), paper_snippets in combined.items():
    variants.setdefault(variant, {})[pmid] = paper_snippets

lit = {
    "date": pubs_date,
    "papers": papers,
    "variants": variants,
}

with open("literature.json", "w") as output:
    json.dump(lit, output, sort_keys=True)

print("Exported {} variants in {} papers".format(
    len(lit["variants"]), len(lit["papers"])))

Exported 5174 variants in 149 papers


In [13]:
# Sanity check: read the exported file back and report how many entries it holds.
with open("literature.json") as exported:
    lit = json.load(exported)

paper_count = len(lit["papers"])
variant_count = len(lit["variants"])
print("{} Papers and {} Variants exported".format(paper_count, variant_count))

149 Papers and 5174 Variants exported
