# BRCA Literature Crawler Stats

In [11]:
import os
import json
import sqlite3
import pandas as pd

# Work from the crawl directory so the relative paths used in later cells
# (download/..., literature-*.json) resolve against it.
crawl_dir = os.path.expanduser("~/data/pubmunch/crawl-14-11-2018/")
os.chdir(crawl_dir)

## Download

In [2]:
# Read the PMID list: one list entry per line of the file. Entries keep their
# trailing newline, exactly as the file stores them.
with open("download/pmids.txt") as pmid_file:
    pmids = list(pmid_file)
pmid_count = len(pmids)
print("Started with {} PMIDs with BRCA in the title or abstract".format(pmid_count))

Started with 16980 PMIDs with BRCA in the title or abstract


In [4]:
# docStatus.tab is read without a header row, so the column names are supplied
# here; the first column (pmid) becomes the index.
status_columns = ["pmid", "status", "msg", "crawler", "journal", "year", "numFiles", "detail"]
download = pd.read_table(
    "download/docStatus.tab",
    header=None,
    names=status_columns,
    index_col=0,
)
download.head()

Unnamed: 0_level_0,status,msg,crawler,journal,year,numFiles,detail
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11585672,OK,,elsevier-api,Trends in genetics : TIG,2001.0,2,
19688261,OK,,springer,Breast cancer research and treatment,2010.0,3,
12228710,OK,main.html contains html tag,highwire,"Science (New York, N.Y.)",2002.0,4,
29369605,OK,,springer,Genetika,2016.0,3,
22084640,OK,,pmc,Therapeutic advances in medical oncology,2011.0,3,


In [6]:
# Tally the rows per download status; value_counts lists the most frequent first.
status_counts = download["status"].value_counts()
print("Download status:")
status_counts

Download status:


OK                     14088
invalidPdf               978
noCrawlerSuccess         817
httpError                302
noLicense                301
HighwirePdfNotValid      254
invalidHostname           55
pageErrorMessage          28
no_meta                    9
noOutlinkOrDoi             6
HtmlParseError             1
BeautifulSoupError         1
tooManySupplFiles          1
Name: status, dtype: int64

In [8]:
# Count rows for every (status, crawler) pair.
# NOTE(review): the saved output lists only status "OK" -- presumably the
# crawler column is empty for failed downloads and groupby leaves out rows
# with a missing key by default; confirm before reading this as "no failures".
print("Publisher stats:")
by_status_and_crawler = download.groupby(["status", "crawler"])
by_status_and_crawler.size()

Publisher stats:


status  crawler     
OK      elsevier        2174
        elsevier-api     885
        empty              7
        generic         2885
        highwire        1941
        lww              208
        nejm              66
        npg             1309
        pmc             2437
        silverchair        1
        springer        1673
        tandf            502
dtype: int64

In [12]:
# Load the literature JSON (read below through its "papers" and "variants" keys).
# The context manager closes the file as soon as it has been parsed, instead of
# relying on garbage collection to release the handle.
with open("literature-05-12-2018-v1.json") as literature_file:
    lit = json.load(literature_file)

In [14]:
# Report how many entries sit under the "papers" and "variants" keys.
paper_count = len(lit["papers"])
variant_count = len(lit["variants"])
print("{} papers and {} variants".format(paper_count, variant_count))

2227 papers and 3754 variants


Unnamed: 0,15918047,16518693,16267036,23704879,14614327,14647443,25415331,15026808,29176636,12491487,...,12792649,17471025,10389980,23091540,16455195,22438049,23787919,9816013,15077185,12438698
chr13:g.32316461:A>G,,,,,,,,,,,...,,,,,,,,,,
chr13:g.32316462:T>A,,,[ IVS19ins1 (P) IVS1del12 (P) IVS2 + 1G>A (U) ...,,,,,,,,...,,,,,,,,,,
chr13:g.32316462:T>C,,,,,,,,,,,...,,,[enocarcinomas. Two mutations were detected in...,,,,,,,
chr13:g.32316462:T>G,,,[S12 + 9C>T (U) IVS12-11C>T (U) IVS12-19C>G (U...,,,,,,,,...,,,,,,,,,,
chr13:g.32316463:G>A,,,[ (U) L246Vc (U) L512F (U) L574L (P) L596V (U)...,,,,,,,,...,,,,,,,,,,


In [18]:
# Rank the top-level keys of lit["variants"] by how many non-null entries each has.
# With orient="index" every top-level key becomes a row, so count(axis=1) gives
# the per-key total directly; there is no need to transpose the whole frame
# just to count along the other axis.
# NOTE(review): assumes the top-level keys are variants and the nested keys are
# PMIDs, so each count is "papers mentioning this variant" -- confirm.
print("Top 10 variants with the most PubMed hits:")
variant_hits = pd.DataFrame.from_dict(lit["variants"], orient="index").count(axis=1)
variant_hits.sort_values(ascending=False).head(10)

Top 10 variants with the most mentions:


chr17:g.43106487:A>C    410
chr13:g.32398489:A>T    150
chr17:g.43092919:G>A    148
chr17:g.43071077:T>C    144
chr17:g.43063903:G>T    141
chr17:g.43051071:A>C    140
chr17:g.43092418:T>C    133
chr13:g.32332592:A>C    131
chr17:g.43091983:T>C    126
chr17:g.43094464:T>C    124
dtype: int64

# LOVD Truth Set

In [45]:
# dtype=str reads every column as text, so PMIDs are not converted to numbers.
lovd_path = os.path.expanduser("~/pubMunch-BRCA/tests/lovd-normalized.tsv")
lovd = pd.read_table(lovd_path, dtype=str)

In [53]:
# Compare the distinct PMIDs in the LOVD table with the papers in the
# literature JSON; `common` holds the PMIDs present in both.
lovd_pmids = set(lovd["pmid"].values)
print("Papers in LOVD truth set:", len(lovd_pmids))
common = lovd_pmids & set(lit["papers"])
print("Papers in both LOVD truth set and literature.json:", len(common))

Papers in LOVD truth set: 175
Papers in both LOVD truth set and literature.json: 119


In [54]:
# NOTE(review): this cell originally held only the unfinished statement
# `with open("lovd")` (no `as` target, colon or body), which is a SyntaxError
# and stops Restart & Run All at this point.
# The saved output under this cell is a set of PMID strings; displaying the
# LOVD/literature overlap is presumably what was intended -- TODO confirm
# against the original notebook (it may have been the full LOVD PMID set).
common

{'10373534',
 '10426999',
 '10551859',
 '10638982',
 '10699917',
 '10811118',
 '10951344',
 '10969800',
 '11137998',
 '11157798',
 '11185744',
 '11239456',
 '11301010',
 '11320250',
 '11389159',
 '11504767',
 '11526114',
 '11573086',
 '11916966',
 '12034536',
 '12142080',
 '12145750',
 '12215251',
 '12354784',
 '12400015',
 '12427738',
 '12442171',
 '12457999',
 '12474142',
 '12496476',
 '12527904',
 '12531920',
 '12624152',
 '12759930',
 '12915465',
 '14513821',
 '14534301',
 '14576432',
 '14647443',
 '14729053',
 '14976165',
 '15001988',
 '15125843',
 '15133502',
 '15133503',
 '15172985',
 '15235020',
 '15290653',
 '15317758',
 '15345110',
 '15350310',
 '15353005',
 '15609993',
 '15689452',
 '15695382',
 '15726418',
 '15743496',
 '15876480',
 '15923272',
 '16014699',
 '16101277',
 '16168123',
 '16267036',
 '16280041',
 '16403807',
 '16489001',
 '16518693',
 '16760288',
 '16792514',
 '17005433',
 '17063491',
 '17308087',
 '17341484',
 '17493881',
 '17549625',
 '17899372',
 '17902052',