In [1]:
import json
import requests
import pandas as pd
import numpy as np
import networkx as nx
import nxpd
import matplotlib.pyplot as plt
import matplotlib
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import dataframe
import graphframes


from src.data_reader import DataReader, RetractionFinder
from src.to_gexf import to_gexf
from src.get_redacted import get_paper, load_redacted, get_doi, gen_retracted


# Forget the "already written" marker, if present, so that re-running this cell
# makes the guarded `reader.write` call in the write cell run again.
globals().pop('written', None)

In [None]:
# Reader bound to the local corpus shard 00; later cells call its
# write / read_edges / read_nodes methods.
reader = DataReader('data/s2-corpus-00')

In [None]:
%pdb
reader.write(100000, dynamic=True) if not 'written' in globals() else None
written = True

In [None]:
# `spark` is not created anywhere else in this notebook. getOrCreate() returns
# the already-active session when the kernel provides one and builds a session
# otherwise, so the cell no longer depends on hidden kernel state.
spark = pyspark.sql.SparkSession.builder.getOrCreate()


def _string_schema(*column_names):
    """Return a StructType whose fields are all string columns with the given names."""
    return StructType(
        [StructField(name=column, dataType=StringType()) for column in column_names]
    )


# Edge table: src, dst and year as strings, plus a generated `id` column
# (monotonically increasing, not necessarily consecutive).
edges = spark.createDataFrame(
    reader.read_edges(),
    schema=_string_schema('src', 'dst', 'year'),
).withColumn('id', F.monotonically_increasing_id())

# Vertex table: id, year and authors, all stored as strings.
nodes = spark.createDataFrame(
    reader.read_nodes(),
    schema=_string_schema('id', 'year', 'authors'),
)


In [None]:
# Combine the `nodes` and `edges` DataFrames into a GraphFrame and export it
# to data/graph.gexf via the project's to_gexf helper.
g = graphframes.GraphFrame(e=edges, v=nodes)
to_gexf(g, 'data/graph.gexf', dynamic=True)

In [None]:
# Run PageRank on the graph for a fixed 5 iterations.
# NOTE(review): in GraphFrames `resetProbability` is the chance of jumping to a
# random vertex (commonly 0.15). 0.9 looks like a damping factor passed by
# mistake — confirm the intended value.
page_rank = g.pageRank(maxIter=5, resetProbability=0.9)

In [None]:
# Preview the edge table of the PageRank result.
# NOTE(review): GraphFrames presumably stores the per-node score on
# `page_rank.vertices` (column `pagerank`) and only a `weight` on edges —
# confirm that edges is the table you meant to inspect.
page_rank.edges.show()

In [None]:
# Project to the one column needed *before* collecting, so only `year` is
# shipped to the driver instead of the whole edge table. The result is the
# same pandas Series named `year`.
years = g.edges.select('year').toPandas()['year']

In [None]:
# Pick the endpoint columns by name (not by position) and collect only those
# two columns to the driver.
e = g.edges.select('src', 'dst').toPandas()
e.columns = ['source', 'target']
# Build the directed graph explicitly from the edge list. Handing the frame to
# nx.DiGraph(...) makes networkx guess the layout: a frame with as many rows as
# columns is read as an adjacency matrix, and the edge-list fallback asks for
# edge attributes that a two-column frame does not have.
G = nx.from_pandas_edgelist(
    e, source='source', target='target', create_using=nx.DiGraph()
)

In [None]:
# Render the networkx graph `G` with nxpd.
nxpd.draw(G)

In [None]:
# Summarise the edge `year` column.
# NOTE(review): `year` is declared StringType in the edge schema, so this is
# presumably an object-dtype Series and describe() would report
# count/unique/top/freq rather than numeric statistics — verify.
years.describe()


In [None]:
# `lines` is a boolean flag in pandas.read_json (one JSON document per line);
# pass True instead of relying on -1 being truthy.
retracted_raw = pd.read_json(path_or_buf='data/retracted', lines=True)
# Keep the rows whose `error` field is null, then drop that column. This is
# done as one non-mutating expression so the cell is idempotent and does not
# modify a filtered slice in place.
df = retracted_raw[retracted_raw['error'].isnull()].drop(columns='error')

In [None]:
# Display the `references` column of the filtered frame.
df.references

In [None]:
# Look up `inCitations` for the row labelled 3692. The row filter applied to
# `df` did not reset the index, so this is a label lookup, not a position.
df.inCitations[3692]

In [None]:
# Print every pmid in the local shard that contains the substring 'v4'.
with open('data/s2-corpus-00', 'r') as f:
    for line in f:
        # Decode each record once (it used to be parsed a second time on every
        # match) and treat a missing or null pmid as "no match" instead of
        # aborting the scan.
        pmid = json.loads(line).get('pmid') or ''
        if 'v4' in pmid:
            print(pmid)

In [None]:
# Print the title of every item yielded for the local shard.
# NOTE(review): `find_retracted` is not among the names imported at the top of
# the notebook (get_paper, load_redacted, get_doi, gen_retracted), and `pmids`
# is only assigned in a later cell. On a fresh kernel this cell presumably
# raises NameError — confirm the intended helper and move it below the cell
# that builds `pmids`.
%pdb
for itm in find_retracted('data/s2-corpus-00', pmid=pmids):
    print(itm['title'])

In [3]:
# Load the retraction table and derive the two lookup sets from it:
#   pmids - every `Db` value as a string with a 'v1' suffix appended
#   dois  - whatever get_doi extracts from each `Description` value
pmr = load_redacted()
pmids = set(pmr['Db'].map(lambda db: f'{db!s}v1'))
dois = set(pmr['Description'].map(get_doi))

In [4]:
# Rebuild the lookup sets, this time with the bare `Db` values as strings
# (no suffix).
pmids = set(pmr['Db'].map(str))
dois = set(pmr['Description'].map(get_doi))

# Scan the local shard using only the pmid set, then show the match count.
finder = RetractionFinder(pmids=pmids)
finder.find('data/s2-corpus-00')
len(finder.found)

Avg Spd: 57218 lines/sec
Avg Spd: 58679 lines/sec
Avg Spd: 58869 lines/sec
Avg Spd: 58071 lines/sec
Avg Spd: 58020 lines/sec
Avg Spd: 58573 lines/sec
Avg Spd: 58670 lines/sec
Avg Spd: 58710 lines/sec
Avg Spd: 58869 lines/sec
Avg Spd: 58988 lines/sec
Avg Spd: 59033 lines/sec
Avg Spd: 59405 lines/sec
Avg Spd: 59449 lines/sec
Avg Spd: 59352 lines/sec
Avg Spd: 59341 lines/sec
Avg Spd: 59368 lines/sec
Avg Spd: 59385 lines/sec
Avg Spd: 59386 lines/sec
Avg Spd: 59352 lines/sec
Avg Spd: 59297 lines/sec
Avg Spd: 59300 lines/sec
Avg Spd: 59406 lines/sec
Avg Spd: 59397 lines/sec
Avg Spd: 59528 lines/sec
Avg Spd: 59499 lines/sec
Avg Spd: 59559 lines/sec
Avg Spd: 59602 lines/sec
Avg Spd: 59713 lines/sec
Avg Spd: 59668 lines/sec
Avg Spd: 59727 lines/sec
Avg Spd: 59697 lines/sec
Avg Spd: 59699 lines/sec
Avg Spd: 59675 lines/sec
Avg Spd: 59711 lines/sec
Avg Spd: 59641 lines/sec
Avg Spd: 59652 lines/sec
Avg Spd: 59664 lines/sec
Avg Spd: 59703 lines/sec
Avg Spd: 59717 lines/sec
Avg Spd: 59774 lines/sec


188

In [6]:
%pdb
finder2 = RetractionFinder(pmids=pmids, dois=dois)
finder2.find('s3://alexklein/capstone/data/s2-corpus-00')
len(finder.found)

Automatic pdb calling has been turned OFF


FileNotFoundError: [Errno 2] No such file or directory: 's3://alexklein/capstone/data/s2-corpus-00'

In [None]:
# Show the rows of the retraction table whose `Db` value equals 19818140.
pmr[pmr['Db'] == 19818140]

In [9]:
# Scratch value with no effect on the analysis — presumably an id noted for a
# manual lookup (TODO confirm, or delete the cell).
24636669

24636669

In [None]:
# Scratch value with no effect on the analysis — presumably an id noted for a
# manual lookup (TODO confirm, or delete the cell).
27613500

In [None]:
# Scratch string literals with no effect on the analysis; only the last one is
# echoed as the cell output. They look like DOIs noted for a manual lookup —
# TODO confirm, or delete the cell.
'10.1186/1746-1596-9-59'
'10.1186/s13000-016-0534-y'

In [None]:
import json
import time

import boto3

from src.data_reader import DataReader, RetractionFinder
from src.get_redacted import get_paper, load_redacted, get_doi, gen_retracted

# Lookup sets derived from the retraction table: `Db` values as strings, and
# whatever get_doi extracts from each `Description`.
pmr = load_redacted()
pmids = set(pmr['Db'].apply(lambda x: str(x)))
dois = set(pmr['Description'].apply(get_doi))

s3 = boto3.client('s3')
s3_finder = RetractionFinder(pmids=pmids, dois=dois)
for i in range(40):
    # Shard names are zero-padded to two digits: s2-corpus-00 ... s2-corpus-39.
    shard = f's2-corpus-{i:02d}'
    # Wall-clock timer: process_time() counts CPU time only and therefore
    # ignores the time spent waiting on the S3 download.
    started = time.perf_counter()
    corpus = s3.get_object(Bucket='alexklein', Key=f'capstone/data/{shard}')
    try:
        s3_finder.search_stream(corpus['Body'])
    finally:
        # Release the HTTP connection held by the streaming body.
        corpus['Body'].close()
    # Checkpoint after every shard: rewrite the file with everything found so
    # far, one JSON document per line.
    with open('data/retracted_articles', 'w') as f:
        for article in s3_finder.found:
            f.write(json.dumps(article))
            f.write('\n')
    print(f'{shard} completed in {(time.perf_counter() - started):0.0f} s.')


Avg Spd: 8485 lines/sec
Avg Spd: 8391 lines/sec
Avg Spd: 8410 lines/sec
Avg Spd: 8460 lines/sec
Avg Spd: 8584 lines/sec
Avg Spd: 8700 lines/sec
Avg Spd: 8715 lines/sec
Avg Spd: 8733 lines/sec
Avg Spd: 8764 lines/sec
Avg Spd: 8825 lines/sec
Avg Spd: 8782 lines/sec
Avg Spd: 8801 lines/sec
Avg Spd: 8813 lines/sec
Avg Spd: 8858 lines/sec
Avg Spd: 8842 lines/sec
Avg Spd: 8817 lines/sec
Avg Spd: 8776 lines/sec
Avg Spd: 8762 lines/sec
Avg Spd: 8727 lines/sec
Avg Spd: 8685 lines/sec
Avg Spd: 8680 lines/sec
Avg Spd: 8693 lines/sec
Avg Spd: 8676 lines/sec
Avg Spd: 8658 lines/sec
Avg Spd: 8646 lines/sec
Avg Spd: 8638 lines/sec
Avg Spd: 8616 lines/sec
Avg Spd: 8601 lines/sec
Avg Spd: 8621 lines/sec
Avg Spd: 8630 lines/sec
Avg Spd: 8644 lines/sec
Avg Spd: 8638 lines/sec
Avg Spd: 8631 lines/sec
Avg Spd: 8631 lines/sec
Avg Spd: 8623 lines/sec
Avg Spd: 8606 lines/sec
Avg Spd: 8597 lines/sec
Avg Spd: 8598 lines/sec
Avg Spd: 8602 lines/sec
Avg Spd: 8613 lines/sec
Avg Spd: 8632 lines/sec
Avg Spd: 8637 li

In [37]:
# Check that every line of the output file decodes as JSON; json.loads raises
# on the first malformed line. Afterwards `j` holds the last decoded record.
with open('data/retracted_articles', 'r') as f:
    for line in f:
        j = json.loads(line)

In [41]:
# Load the line-delimited JSON written by the S3 scan. `lines` is a boolean
# flag, so pass True instead of relying on -1 being truthy.
pd.read_json('data/retracted_articles', lines=True)

Unnamed: 0,authors,doi,doiUrl,entities,id,inCitations,journalName,journalPages,journalVolume,outCitations,paperAbstract,pdfUrls,pmid,s2PdfUrl,s2Url,sources,title,venue,year
0,"[{'name': 'Melissa Hudson', 'ids': ['4649121']}]",10.1002/pbc.20952,https://doi.org/10.1002/pbc.20952,[Adriamycin-Bleomycin-Vincristine-Etoposide Re...,30639ffe9b5fd097026ff6c0629d722fa5d01e6c,[],Pediatric blood & cancer,182,62 1,"[40fa54ca52d769bf674b71cd28286dd05d54143c, fce...",Response: Dr. Zubizarreta’s and Dr. Raslawski’...,[],25564684v1,,https://semanticscholar.org/paper/30639ffe9b5f...,[Medline],Retracted: 'Response: Early-stage Hodgkin lymp...,Pediatric blood & cancer,2015
1,"[{'name': 'E T Poehlman', 'ids': ['6495731']},...",,,"[Cholesterol, Exercise, Fibrinogen, Growth Fac...",984655aa0ea9f056dfc1da189e6b5499c11651d1,"[2ea13e7642fce23c76de4c86e9fab6e6ee1dad9b, 6e0...",European journal of clinical investigation,322-6,27 4,[],We examined the effects of the menopause trans...,[],9134381v1,,https://semanticscholar.org/paper/984655aa0ea9...,[Medline],"Menopause-associated changes in plasma lipids,...",European journal of clinical investigation,1997
2,"[{'name': 'Akiko Uehara', 'ids': ['39588834']}...",10.4049/jimmunol.1090011,https://doi.org/10.4049/jimmunol.1090011,"[Agonist, Chemotactic Factors, Cleaved cell, C...",1c41749926d0a3ef9d71bc9012c2d4bc4bcbc9a3,"[9e1f840785f13d61983b4d5fcf4c4111f85b6652, 148...",Journal of immunology,4594-603,169 8,"[ebb6b54944c07d263d223ff398af116095e0457e, 9ee...","Proteinase 3 (PR3), a 29-kDa serine proteinase...",[http://www.jimmunol.org/content/jimmunol/169/...,12370398v1,http://pdfs.semanticscholar.org/6179/34443ba74...,https://semanticscholar.org/paper/1c41749926d0...,[Medline],Activation of human oral epithelial cells by n...,Journal of immunology,2002
3,"[{'name': 'A Murat Kaynar', 'ids': ['2399016']...",10.1038/nm0910-967,https://doi.org/10.1038/nm0910-967,"[Adipose tissue, Brown Fat, Cystic Fibrosis, F...",9b8936c491a60eb4493acaea4f98dbb2f647ce54,"[2cf85a52c6926f8166d1d7a66e070ca0b7ae9f53, 9db...",Nature Medicine,967-969,16,[],nature medicine volume 16 | number 9 | septemb...,[],20823877v1,,https://semanticscholar.org/paper/9b8936c491a6...,[Medline],NET loss of air in cystic fibrosis,Nature Medicine,2010
4,"[{'name': 'Peter F Thall', 'ids': ['3005867']}...",,,"[Acute leukemia, Leukemia, Myelocytic, Acute, ...",0dc058df96635cc998f86b9afce6a751ee08fd0d,[],Clinical advances in hematology & oncology : H&O,943-8,3 12,[],This paper addresses several scientific and et...,[],16555436v1,,https://semanticscholar.org/paper/0dc058df9663...,[Medline],Some ethical issues in phase II trials in acut...,Clinical advances in hematology & oncology : H&O,2005
5,"[{'name': 'Hang Lin', 'ids': ['2128151']}, {'n...",10.1155/2014/206062,https://doi.org/10.1155/2014/206062,"[Compressive Strength, Coulomb, Equilibrium, F...",45b100c50a1b55fc91c6b3e1690bcb4976322bcf,[d2283ce3b7c0d7f52e28f1a2a2240b7dc5c1ecef],,21 - 33,2014,[],"In slope stability analysis, the limit equilib...",[http://ftp.ncbi.nlm.nih.gov/pub/pmc/a1/45/TSW...,25147838,http://pdfs.semanticscholar.org/45b1/00c50a1b5...,https://semanticscholar.org/paper/45b100c50a1b...,[Medline],Slope Stability Analysis Using Limit Equilibri...,TheScientificWorldJournal,2014
6,"[{'name': 'Hyo-Jeong Lee', 'ids': ['1956269']}...",10.1186/1756-9966-28-102,https://doi.org/10.1186/1756-9966-28-102,"[Acupuncture procedure, Addictive Behavior, Ad...",22bd27658a227ecbb1a0f216f004fed80a885e0f,"[08dda7b0cbfe0a5bed01d1e440252b65aa8f282f, 3f6...",Journal of Experimental & Clinical Cancer Rese...,102 - 102,28,"[276d8bb2a995bd4b0a9b1f50c25008813a3d97b4, a6d...",BACKGROUND\nOpioid analgesics are generally us...,[http://download-redirector.springer.com/redir...,19818140,http://pdfs.semanticscholar.org/22bd/27658a227...,https://semanticscholar.org/paper/22bd27658a22...,[Medline],Substance P and beta-endorphin mediate electro...,Journal of experimental & clinical cancer rese...,2009
7,"[{'name': 'YONG HE', 'ids': ['39624717']}, {'n...",,,"[Hemorrhage, Patients, Postoperative Complicat...",a051ee078322c5e2cd93fa4e0a9e9a243fbc8b96,"[a68c4e2477259332fbb0d6c14ca89f221dbb6fd2, 27a...",,873 - 876,7,"[506d2bbef3b6be25eda8b8139dbd205e6629bffd, 0b1...","At present, there is increasing interest in su...",[https://pdfs.semanticscholar.org/a051/ee07832...,24669243,http://pdfs.semanticscholar.org/e537/acecf348c...,https://semanticscholar.org/paper/a051ee078322...,[Medline],Evaluation of a robot-assisted video-assisted ...,Experimental and therapeutic medicine,2014
8,"[{'name': 'Chang Xu', 'ids': ['1712753']}, {'n...",10.2147/OTT.S118346,https://doi.org/10.2147/OTT.S118346,"[Biological Markers, Biomarkers, Tumor, Circul...",fc7d157966a3ba658c15bcb308de12207eaaf1a4,[],,1933 - 1939,10,"[db5aa062f80bde4eadd299953b79330eceb9dd63, 22f...",OBJECTIVES\nPrevious studies used enumerated c...,[],5384730,http://pdfs.semanticscholar.org/fc7d/157966a3b...,https://semanticscholar.org/paper/fc7d157966a3...,[Medline],FAM172A expression in circulating tumor cells ...,OncoTargets and therapy,2017
9,"[{'name': 'Jinming Huang', 'ids': ['8192535']}...",10.1007/s13277-014-2748-5,https://doi.org/10.1007/s13277-014-2748-5,"[Antineoplastic Agents, Apoptosis, Castration,...",09fc6682c6ff66b3c11aa0fd275c337c04727b4b,"[dabc3c8815de76bfa2799a9667259aaa98d5c498, 893...",Tumor Biology,1589-1594,36,"[7b3ce329eafe9a7336f9372bbfc1ac02cb8dc8d4, 383...",Cancerous inhibitor of protein phosphatase 2A ...,[],25377160v1,,https://semanticscholar.org/paper/09fc6682c6ff...,[Medline],RETRACTED ARTICLE: Knockdown of cancerous inhi...,Tumor Biology,2014
