# Export
Explorations of various export formats, using the intermediate files from correlate.py.

In [1]:
import os
import json
import sqlite3
import pandas as pd

# Work from the crawl directory so every relative path below resolves there.
crawl_dir = os.path.expanduser("~/data/pubmunch/crawl/")
os.chdir(crawl_dir)

In [569]:
# Open the article database read-only (URI mode) and load the whole table.
connection = sqlite3.connect("file:text/articles.db?mode=ro", uri=True)
articles = pd.read_sql_query("SELECT * FROM articles", connection)

print("{} articles downloaded with BRCA in the title or abstract".format(len(articles)))

# Keep PMIDs as strings; the mentions table gets the same conversion, so the
# two can be compared directly later on.
articles["pmid"] = articles["pmid"].astype(str)
articles.head()

14044 articles downloaded with BRCA in the title or abstract


Unnamed: 0,articleId,externalId,source,publisher,origFile,journal,printIssn,eIssn,journalUniqueId,year,...,issue,page,pmid,pmcId,doi,fulltextUrl,time,offset,size,chunkId
0,5011585672,PMID11585672,,download,,Trends in genetics : TIG,0168-9525,0168-9525,8507085,2001,...,10,S18,11585672,,10.1016/s0168-9525(01)02451-9,https://linkinghub.elsevier.com/retrieve/pii/S...,2018-11-14T16:29:09+0000,78,5258,0_00000
1,5019688261,PMID19688261,,download,,Breast cancer research and treatment,0167-6806,1573-7217,8111104,2010,...,3,575,19688261,,10.1007/s10549-009-0501-3,,2018-11-14T16:29:16+0000,5048,100825,0_00000
2,5012228710,PMID12228710,,download,,"Science (New York, N.Y.)",0036-8075,1095-9203,404511,2002,...,5588,1837,12228710,,10.1126/science.297.5588.1837,,2018-11-14T16:29:40+0000,1436851129,148950,0_00000
3,5029369605,PMID29369605,,download,,Genetika,0016-6758,0016-6758,47354,2016,...,10,1215,29369605,,10.1134/s102279541609012x,https://link.springer.com/article/10.1134%2FS1...,2018-11-14T16:30:43+0000,4458079378,22264,0_00000
4,5022084640,PMID22084640,,download,,Therapeutic advances in medical oncology,1758-8340,1758-8359,101510808,2011,...,6,257,22084640,3210467.0,10.1177/1758834011417039,,2018-11-14T16:30:58+0000,6079002966,93229,0_00000


In [570]:
# Tab-separated variant table, indexed by normalized genomic HGVS.
variants = pd.read_csv(
    "variants-normalized.tsv",
    sep="\t",
    index_col="norm_g_hgvs",
)
print("{} variants in BRCA Exchange database".format(len(variants)))
variants.head()

20934 variants in BRCA Exchange database


Unnamed: 0_level_0,Chr,Pos,Ref,Alt,pyhgvs_Genomic_Coordinate_38,pyhgvs_cDNA
norm_g_hgvs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NC_000013.11:g.32314943A>G,13,32314943,A,G,chr13:g.32314943:A>G,NM_000059.3:c.-764A>G
NC_000013.11:g.32315226G>A,13,32315226,G,A,chr13:g.32315226:G>A,NM_000059.3:c.-481G>A
NC_000013.11:g.32315490C>T,13,32315490,C,T,chr13:g.32315490:C>T,NM_000059.3:c.-217C>T
NC_000013.11:g.32315519C>T,13,32315519,C,T,chr13:g.32315519:C>T,NM_000059.3:c.-188C>T
NC_000013.11:g.32315532C>T,13,32315532,C,T,chr13:g.32315532:C>T,NM_000059.3:c.-175C>T


In [571]:
# Tab-separated mention table, indexed by normalized genomic HGVS so it lines
# up with the variants index.
# Alternative kept for reference (de-duplicates on pmid + snippet at load time):
# mentions = pd.read_table("mentions-normalized.tsv", index_col="norm_g_hgvs").drop_duplicates(["pmid", "snippet"])
mentions = pd.read_csv(
    "mentions-normalized.tsv",
    sep="\t",
    index_col="norm_g_hgvs",
)

# PMIDs are handled as strings throughout the notebook.
mentions["pmid"] = mentions["pmid"].astype(str)
print("{} total mentions found".format(len(mentions)))
mentions.head()

442198 total mentions found


Unnamed: 0_level_0,norm_c_hgvs,pmid,snippet
norm_g_hgvs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NC_000007.14:g.96709760A>C,NM_006304.1:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000004.12:g.56655997A>C,NM_001145459.1:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000004.12:g.56655997A>C,NM_139212.3:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000004.12:g.56655997A>C,NM_139211.4:c.4T>G,12228710,"ee-stranded β sheet with OB2β1 and OB2β2, as w..."
NC_000007.14:g.96709760A>C,NM_006304.1:c.4T>G,12228710,"ee-stranded ␤ sheet with OB2␤1 and OB2␤2, as w..."


In [572]:
# Keep only the mentions whose normalized genomic HGVS matches a variant in
# BRCA Exchange: an inner join of the two tables on their shared index.
variant_mentions = variants.merge(
    mentions,
    how="inner",
    left_index=True,
    right_index=True,
)
print("{} mentions of variants in BRCA Exchange".format(len(variant_mentions)))
variant_mentions.head()

55657 mentions of variants in BRCA Exchange


Unnamed: 0_level_0,Chr,Pos,Ref,Alt,pyhgvs_Genomic_Coordinate_38,pyhgvs_cDNA,norm_c_hgvs,pmid,snippet
norm_g_hgvs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"tions: in EPCAM, c.412C>T p.(Arg138*), and coe..."
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"EPCAM, c.412C>T p.(Arg138*), and coexisting mu..."
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"tions: in EPCAM, c.412C>T p.(Arg138*), and coe..."
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"EPCAM, c.412C>T p.(Arg138*), and coexisting mu..."
NC_000013.11:g.32316461A>G,13,32316461,A,G,chr13:g.32316461:A>G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,22006311,nd in-frame deletions were classified as delet...


In [573]:
# Some snippets hold several mention phrases joined by "|". Count how many rows
# will need unpacking; the split itself happens in the following step, which
# keeps only the first 3 phrases per snippet because some carry as many as 168.
#
# "|" is matched as a literal character (regex=False). The previous pattern
# "\|" relied on an invalid string escape, which Python warns about.
has_multiple_phrases = variant_mentions.snippet.str.contains("|", regex=False)
print("Unpacking {} of {} snippets with multiple phrases seaparated by '|'".format(
    variant_mentions[has_multiple_phrases].shape[0], variant_mentions.shape[0]))

Unpacking 18909 of 55657 snippets with multiple phrases seaparated by '|'


In [574]:
# Background on splitting a text column into multiple rows:
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows/21032532

# Keep at most this many "|"-separated phrases per snippet; some have > 100.
MAX_PHRASES_PER_SNIPPET = 3

# Reset the index so each row id is unique (norm_g_hgvs repeats across rows).
df = variant_mentions.reset_index()

# One row per phrase: split on the literal "|", truncate each list, then
# explode. The exploded Series repeats the originating row id in its index,
# so it lines up with `df` without any further index manipulation.
snippets = (
    df["snippet"]
    .str.split("|")
    .str[:MAX_PHRASES_PER_SNIPPET]
    .explode()
    .rename("snippet")
)

# Swap the packed "snippet" column for the per-phrase one, drop repeated
# (variant, phrase) pairs, and key the result by genomic coordinate.
exploded = (
    df.drop(columns="snippet")
    .join(snippets)
    .drop_duplicates(["pyhgvs_Genomic_Coordinate_38", "snippet"])
    .set_index("pyhgvs_Genomic_Coordinate_38")
)
print("{} after expanding and de-duplicating snippets".format(exploded.shape[0]))

48941 after expanding and de-duplicating snippets


In [575]:
# Preview the first rows of the per-phrase table.
exploded.head(n=5)

Unnamed: 0_level_0,norm_g_hgvs,Chr,Pos,Ref,Alt,pyhgvs_cDNA,norm_c_hgvs,pmid,snippet
pyhgvs_Genomic_Coordinate_38,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"tions: in EPCAM, c.412C>T p.(Arg138*), and coe..."
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"EPCAM, c.412C>T p.(Arg138*), and coexisting mu..."
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,30103829,"in EPCAM, c.412C>T p.(Arg138*), and coexisting..."
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,22006311,nd in-frame deletions were classified as delet...
chr13:g.32316461:A>G,NC_000013.11:g.32316461A>G,13,32316461,A,G,NM_000059.3:c.1A>G,NM_000059.3:c.1A>G,22006311,"BRCA1 p.M1I,<<< p.M1V>>>, and p.C61G and TP53 ..."


In [292]:
# # Variants by pyhgvs_Genomic_Coordinate_38 as array of mentions
# variant_mentions_dict = {
#     k: v.to_dict(orient="records")
#     for k, v in variant_mentions.reset_index(level=0)
#                 .set_index("pyhgvs_Genomic_Coordinate_38")
#                 .groupby(["pyhgvs_Genomic_Coordinate_38"])}

#### Variants by pyhgvs_Genomic_Coordinate_38 by pmid with all snippets in a list

In [576]:
# For every (variant, paper) pair, gather that pair's snippets into one list.
# The result is a Series with a (pyhgvs_Genomic_Coordinate_38, pmid) MultiIndex.
pair_groups = exploded.groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"])
combined = pair_groups["snippet"].apply(lambda phrases: phrases.tolist())

In [577]:
# Spot-check one variant: its snippet lists, indexed by pmid.
combined.loc["chr13:g.32316470:G>T"]

pmid
17453335    [Two novel BRCA2 germline mutations <<<(G4X>>>...
24010542    [_5797delTA  11B  c.4409_4410delTA c.4415_4418...
24312913    [G, 461delTC [41]  4817A>G, 8477delAGA [41]  —...
27225819    [  95.96%   3.239   447.681   98.46%   BRCA2+ ...
28751759    [65Leu) VUS, which was also present in this sa...
Name: snippet, dtype: object

In [581]:
# Papers that mention at least one BRCA Exchange variant, keyed by PMID.
# Missing values (for example an absent pmcId) are converted to None so they
# serialize as JSON null; left as float NaN they would be written as bare
# NaN tokens, which strict JSON parsers reject.
mentioned_articles = articles[articles.pmid.isin(variant_mentions.pmid)]
papers_by_pmid = (
    mentioned_articles.astype(object)
    .where(mentioned_articles.notna(), None)
    .set_index("pmid", drop=False)
    .to_dict(orient="index")
)

# Nest the (variant, pmid) -> [snippets] Series into
# {variant: {pmid: [snippets]}}. `combined` is already ordered by its group
# keys, so insertion order matches that ordering.
snippets_by_variant = {}
for (variant, pmid), phrases in combined.items():
    snippets_by_variant.setdefault(variant, {})[pmid] = phrases

literature = {
    "papers": papers_by_pmid,
    "variants": snippets_by_variant,
}
with open("literature-05-12-2018-v1.json", "w") as output:
    json.dump(literature, output)

# Spot-check one variant in the exported structure.
literature["variants"]["chr13:g.32316470:G>T"]

{'17453335': ['Two novel BRCA2 germline mutations <<<(G4X>>> and 3783del10) are reported here for the first time.',
  'Each of the five BRCA2 mutations is observed only once; three are frameshift (3783del10, 4637delTA, 5950delCT), one is nonsense <<<(G4X>>>), and one is a single base substitution that results in aberrant splicing (7235G\xa0>\xa0A, skipping of exons 12–13) [15].',
  'h.gov/ Intramural_research/Lab_transfer/Bic/Member/ BRCA1_mutation_database.html). The two novel BRCA2 mutations described here for the first time are<<< G4X>>> and 3783del10, located in exons 2 and 11, respectively.',
  'Each of the five BRCA2 mutations is observed only once; three are frameshift (3783del10, 4637delTA, 5950delCT), one is nonsense <<<(G4X>>>), and one is a single base substitution that results in aberrant splicing (7235G > A, skipping of exons 12–13) [15].',
  'The two novel BRCA2 mutations described here for the first time are<<< G4X>>> and 3783del10, located in exons 2 and 11, respectivel