# BRCA Exchange Literature Search Stats & Export

Ingest artifacts from correlate.py, generate literature.json for import into BRCA Exchange as well as various statistics.

Notes:
* pmid is coerced into a string throughout

In [2]:
import os
import json
import sqlite3
import pandas as pd

os.chdir(os.path.expanduser("/notebooks/data/pubmunch/crawl-14-11-2018/"))

## Download Stats

In [2]:
with open("download/pmids.txt") as f:
    pmids = f.read().splitlines()
print("Attempted to download {} PMIDs with BRCA in the title or abstract".format(len(pmids)))

Attempted to download 16980 PMIDs with BRCA in the title or abstract


In [3]:
download = pd.read_table("download/docStatus.tab", header=None, index_col=0, dtype=str,
                         names=["pmid", "status", "msg", "crawler", "journal", "year", "numFiles", "detail"])
print("Failed to download {}({}%) of {} papers.".format(
    download[download.status != "OK"].shape[0], 
    round(100 * download[download.status != "OK"].shape[0]/len(pmids)),
      download[download.status == "OK"].shape[0]
     ))
print("Reasons:")
print(download.status.value_counts())

Failed to download 2753(16%) of 14088 papers.
Reasons:
OK                     14088
invalidPdf               978
noCrawlerSuccess         817
httpError                302
noLicense                301
HighwirePdfNotValid      254
invalidHostname           55
pageErrorMessage          28
no_meta                    9
noOutlinkOrDoi             6
HtmlParseError             1
BeautifulSoupError         1
tooManySupplFiles          1
Name: status, dtype: int64


## Ingest
Ingest artifacts from various pieces of the pipeline indexed by normalized genomic hgvs and integer pmid 

In [4]:
connection = sqlite3.connect("file:text/articles.db?mode=ro", uri=True)
articles = pd.read_sql_query("SELECT * FROM articles", connection)
articles.pmid = articles.pmid.astype(str)
print("{} articles loaded from the articles sqlite database".format(articles.shape[0]))
articles.head()

14044 articles loaded from the articles sqlite database


Unnamed: 0,articleId,externalId,source,publisher,origFile,journal,printIssn,eIssn,journalUniqueId,year,...,issue,page,pmid,pmcId,doi,fulltextUrl,time,offset,size,chunkId
0,5011585672,PMID11585672,,download,,Trends in genetics : TIG,0168-9525,0168-9525,8507085,2001,...,10,S18,11585672,,10.1016/s0168-9525(01)02451-9,https://linkinghub.elsevier.com/retrieve/pii/S...,2018-11-14T16:29:09+0000,78,5258,0_00000
1,5019688261,PMID19688261,,download,,Breast cancer research and treatment,0167-6806,1573-7217,8111104,2010,...,3,575,19688261,,10.1007/s10549-009-0501-3,,2018-11-14T16:29:16+0000,5048,100825,0_00000
2,5012228710,PMID12228710,,download,,"Science (New York, N.Y.)",0036-8075,1095-9203,404511,2002,...,5588,1837,12228710,,10.1126/science.297.5588.1837,,2018-11-14T16:29:40+0000,1436851129,148950,0_00000
3,5029369605,PMID29369605,,download,,Genetika,0016-6758,0016-6758,47354,2016,...,10,1215,29369605,,10.1134/s102279541609012x,https://link.springer.com/article/10.1134%2FS1...,2018-11-14T16:30:43+0000,4458079378,22264,0_00000
4,5022084640,PMID22084640,,download,,Therapeutic advances in medical oncology,1758-8340,1758-8359,101510808,2011,...,6,257,22084640,3210467.0,10.1177/1758834011417039,,2018-11-14T16:30:58+0000,6079002966,93229,0_00000


In [5]:
variants = pd.read_table("variants-normalized.tsv", index_col="norm_g_hgvs")
print("{} variants in the BRCA Exchange database".format(variants.shape[0]))
variants.head()

20934 variants in the BRCA Exchange database


Unnamed: 0_level_0,Chr,Pos,Ref,Alt,pyhgvs_Genomic_Coordinate_38,pyhgvs_cDNA
norm_g_hgvs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NC_000013.11:g.32314943A>G,13,32314943,A,G,chr13:g.32314943:A>G,NM_000059.3:c.-764A>G
NC_000013.11:g.32315226G>A,13,32315226,G,A,chr13:g.32315226:G>A,NM_000059.3:c.-481G>A
NC_000013.11:g.32315490C>T,13,32315490,C,T,chr13:g.32315490:C>T,NM_000059.3:c.-217C>T
NC_000013.11:g.32315519C>T,13,32315519,C,T,chr13:g.32315519:C>T,NM_000059.3:c.-188C>T
NC_000013.11:g.32315532C>T,13,32315532,C,T,chr13:g.32315532:C>T,NM_000059.3:c.-175C>T


In [6]:
mentions = pd.read_table("mentions-normalized.tsv", index_col="norm_g_hgvs")
mentions.pmid = mentions.pmid.astype(str)
print("Total normalized mentions: {}".format(mentions.shape[0]))
mentions.head()

Total normalized mentions: 442198


Unnamed: 0_level_0,norm_c_hgvs,pmid,snippet
norm_g_hgvs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NC_000007.14:g.96709760A>C,NM_006304.1:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000004.12:g.56655997A>C,NM_001145459.1:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000004.12:g.56655997A>C,NM_139212.3:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000004.12:g.56655997A>C,NM_139211.4:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000007.14:g.96709760A>C,NM_006304.1:c.4T>G,12228710,"ee-stranded ␤ sheet with OB2␤1 and OB2␤2, as w..."


## Wrangle

In [7]:
mentions = mentions.reset_index().drop_duplicates(["norm_g_hgvs", "pmid", "snippet"]).set_index("norm_g_hgvs")
print("After dropping duplicates of hgvs+pmid+snippet: {}".format(mentions.shape[0]))
print("Total unique genomic hgvs variants: {}".format(mentions.index.unique().shape[0]))
mentions.head()

After dropping duplicates of hgvs+pmid+snippet: 296420
Total unique genomic hgvs variants: 188514


Unnamed: 0_level_0,norm_c_hgvs,pmid,snippet
norm_g_hgvs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NC_000007.14:g.96709760A>C,NM_006304.1:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000004.12:g.56655997A>C,NM_001145459.1:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000007.14:g.96709760A>C,NM_006304.1:c.4T>G,12228710,"ee-stranded ␤ sheet with OB2␤1 and OB2␤2, as w..."
NC_000004.12:g.56655997A>C,NM_001145459.1:c.4T>G,12228710,"ee-stranded ␤ sheet with OB2␤1 and OB2␤2, as w..."
NC_000013.11:g.32326569del,NM_000059.3:c.587del,29369605,Gene Gene function in the cell and involveme...


In [8]:
# Prune mentions down to only those with a variant in BRCA Excchange
variant_mentions = pd.merge(variants, mentions, left_index=True, right_index=True)
print("{} mentions of variants in BRCA Exchange".format(variant_mentions.shape[0]))
variant_mentions.head()

34737 mentions of variants in BRCA Exchange


Unnamed: 0_level_0,Chr,Pos,Ref,Alt,pyhgvs_Genomic_Coordinate_38,pyhgvs_cDNA,norm_c_hgvs,pmid,snippet
norm_g_hgvs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"tions: in EPCAM, c.412C>T p.(Arg138*), and coe..."
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"EPCAM, c.412C>T p.(Arg138*), and coexisting mu..."
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,22006311,nd in-frame deletions were classified as delet...
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,22006311,and in-frame deletions were classiﬁed as dele...
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,24302565,and clinical data. Two cancer syndrome gene v...


In [9]:
# Some of the snippets are multiple mentions separated by | so unpack these,
# but limit to 3 as some have as many as 168!
print("Unpacking {} of {} snippets with multiple phrases seaparated by '|'".format(
    variant_mentions[variant_mentions.snippet.str.contains("\|")].shape[0], variant_mentions.shape[0]))

# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows/21032532

# Resest index so each row id is unique vs. norm_g_hgvs
df = variant_mentions.reset_index()
# df = variant_mentions[variant_mentions.snippet.str.contains("\|")].iloc[0:100].reset_index()

# Generate a new dataframe splitting each snippet segment into its own row
# Limit to max of 3 as some of them have > 100
snippets = df.apply(lambda x: pd.Series(x.snippet.split("|")[:3]), axis=1).stack()

# To line up with the original index
snippets.index = snippets.index.droplevel(-1)

# Join back to the original dataframe replaceing the old "snippet" columnd
snippets.name = "snippet"
del df["snippet"]
exploded = df.join(snippets).drop_duplicates(["pyhgvs_Genomic_Coordinate_38", "snippet"]).set_index("pyhgvs_Genomic_Coordinate_38")
print("{} individual snippets after expanding and de-duplicating snippets".format(exploded.shape[0]))

exploded.head()

Unpacking 11804 of 34737 snippets with multiple phrases seaparated by '|'
48941 individual snippets after expanding and de-duplicating snippets


Unnamed: 0_level_0,norm_g_hgvs,Chr,Pos,Ref,Alt,pyhgvs_cDNA,norm_c_hgvs,pmid,snippet
pyhgvs_Genomic_Coordinate_38,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"tions: in EPCAM, c.412C>T p.(Arg138*), and coe..."
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"EPCAM, c.412C>T p.(Arg138*), and coexisting mu..."
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"in EPCAM, c.412C>T p.(Arg138*), and coexisting..."
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,22006311,nd in-frame deletions were classified as delet...
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,22006311,"BRCA1 p.M1I,<<< p.M1V>>>, and p.C61G and TP53 ..."


## Export

In [10]:
# Variants by pyhgvs_Genomic_Coordinate_38 by pmid with all snippets in a list
combined = exploded.groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"])["snippet"].apply(lambda s: s.tolist())
print("Combined {} separate snippets down to {} after grouping by pmid".format(exploded.shape[0], combined.shape[0]))

Combined 48941 separate snippets down to 18025 after grouping by pmid


In [11]:
articles.head()

Unnamed: 0,articleId,externalId,source,publisher,origFile,journal,printIssn,eIssn,journalUniqueId,year,...,issue,page,pmid,pmcId,doi,fulltextUrl,time,offset,size,chunkId
0,5011585672,PMID11585672,,download,,Trends in genetics : TIG,0168-9525,0168-9525,8507085,2001,...,10,S18,11585672,,10.1016/s0168-9525(01)02451-9,https://linkinghub.elsevier.com/retrieve/pii/S...,2018-11-14T16:29:09+0000,78,5258,0_00000
1,5019688261,PMID19688261,,download,,Breast cancer research and treatment,0167-6806,1573-7217,8111104,2010,...,3,575,19688261,,10.1007/s10549-009-0501-3,,2018-11-14T16:29:16+0000,5048,100825,0_00000
2,5012228710,PMID12228710,,download,,"Science (New York, N.Y.)",0036-8075,1095-9203,404511,2002,...,5588,1837,12228710,,10.1126/science.297.5588.1837,,2018-11-14T16:29:40+0000,1436851129,148950,0_00000
3,5029369605,PMID29369605,,download,,Genetika,0016-6758,0016-6758,47354,2016,...,10,1215,29369605,,10.1134/s102279541609012x,https://link.springer.com/article/10.1134%2FS1...,2018-11-14T16:30:43+0000,4458079378,22264,0_00000
4,5022084640,PMID22084640,,download,,Therapeutic advances in medical oncology,1758-8340,1758-8359,101510808,2011,...,6,257,22084640,3210467.0,10.1177/1758834011417039,,2018-11-14T16:30:58+0000,6079002966,93229,0_00000


In [30]:
literature = {
    "papers": articles[articles.pmid.isin(variant_mentions.pmid)].set_index("pmid", drop=False).to_dict(orient="index"),
    "variants": {
        k: {kk: vv[0] for kk, vv in v.unstack().transpose().iterrows()}
        for k, v in combined.groupby("pyhgvs_Genomic_Coordinate_38")},
}

# with open("literature.json", "w") as output:
#     output.write(json.dumps(literature, sort_keys=True))
    
# print("Exported {} variants in {} papers".format(
#     len(literature["variants"].keys()), len(literature["papers"].keys())))

# Stats

In [13]:
print("Max mentions for a single variant: {} {}".format(
    exploded.groupby(["pyhgvs_Genomic_Coordinate_38"]).size().idxmax(),
    exploded.groupby(["pyhgvs_Genomic_Coordinate_38"]).size().max()))

print("Max mentions for a single paper from one variant: {} {}".format(
    exploded.groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"]).size().idxmax(),
    exploded.groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"]).size().max()))

print("Paper referenced from the most variants: {} {}".format(
    exploded.groupby(["pmid"]).size().idxmax(),
    exploded.groupby(["pmid"]).size().max()))

Max mentions for a single variant: chr17:g.43106487:A>C 1411
Max mentions for a single paper from one variant: ('chr13:g.32319082:G>A', '27490902') 19
Paper referenced from the most variants: 21990134 1633


In [14]:
print("Variant mention stats:")
print(exploded.reset_index().groupby(["pyhgvs_Genomic_Coordinate_38"])
      ["pyhgvs_Genomic_Coordinate_38"].count().describe())

Variant mention stats:
count    3754.000000
mean       13.037027
std        39.876334
min         1.000000
25%         2.000000
50%         4.000000
75%        11.000000
max      1411.000000
Name: pyhgvs_Genomic_Coordinate_38, dtype: float64


In [15]:
print("Paper mentions stats:")
print(exploded.reset_index().groupby(["pmid"])
      ["pmid"].count().describe())

Paper mentions stats:
count    2145.000000
mean       22.816317
std        58.991191
min         1.000000
25%         3.000000
50%         6.000000
75%        19.000000
max      1633.000000
Name: pmid, dtype: float64


## Failed To Correlate
Papers where pubMunch found mutations but that failed to correlate after normalization

In [16]:
mutations = pd.read_table("mutations.tsv", header=0,
                          usecols=["docId", "hgvsCoding", "mutSnippets"],
                          dtype={"docId": "str", "hgvsCoding": "str", "mutSnippets": "str"},
                         ).dropna(subset=["hgvsCoding"])
mutations = mutations.rename({"docId": "pmid"}, axis="columns")
mutations.head()

Unnamed: 0,pmid,hgvsCoding,mutSnippets
5,12228710,NM_006304.1:c.4T>G,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
6,12228710,NM_001145459.1:c.4T>G|NM_139212.3:c.4T>G|NM_13...,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
11,12228710,NM_006304.1:c.4T>G,"ee-stranded ␤ sheet with OB2␤1 and OB2␤2, as w..."
12,12228710,NM_001145459.1:c.4T>G|NM_139212.3:c.4T>G|NM_13...,"ee-stranded ␤ sheet with OB2␤1 and OB2␤2, as w..."
21,29369605,NM_000059.3:c.587G>None,Gene Gene function in the cell and involveme...


In [31]:
len(set(mutations.pmid.values))

len(set(articles.index.values))

14044

In [18]:
sorted(set(articles.index.values) - set(mutations.pmid.values))[::1000]

[0,
 1000,
 2000,
 3000,
 4000,
 5000,
 6000,
 7000,
 8000,
 9000,
 10000,
 11000,
 12000,
 13000,
 14000]

In [19]:
papers_with_no_variants = sorted(set(pmids) - set(exploded.pmid.values), reverse=True)
papers_with_no_variants[0:10]

['9988281',
 '9988226',
 '9973246',
 '9950543',
 '9950212',
 '9930365',
 '9928546',
 '9928543',
 '9927063',
 '9927062']

## Export Stats
Stats generated directly from the exported literature.json

In [3]:
with open(os.path.expanduser("/notebooks/data/pubmunch/crawl-14-11-2018/literature-05-12-2018-v1.json")) as f:
    lit = json.load(f)

In [4]:
print("{} Papers, {} Variants".format(len(lit["papers"]), len(lit["variants"])))

2227 Papers, 3754 Variants


In [15]:
list(lit["variants"].keys())[0]

'chr17:g.43092596:G>A'