In [1]:
import json
import requests
import pandas as pd
import numpy as np
import networkx as nx
import nxpd
import matplotlib.pyplot as plt
import matplotlib
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import dataframe
import graphframes


from src.data_reader import DataReader, RetractionFinder
from src.to_gexf import to_gexf
from src.get_redacted import get_paper, load_redacted, get_doi, gen_retracted


if 'written' in globals(): del written

In [None]:
reader = DataReader('data/s2-corpus-00')

In [None]:
%pdb
reader.write(100000, dynamic=True) if not 'written' in globals() else None
written = True

In [None]:
edges = spark.createDataFrame(reader.read_edges(), schema=StructType(
[
    StructField(name='src', dataType=StringType()),
    StructField(name='dst', dataType=StringType()),
    StructField(name='year', dataType=StringType())
]
)).withColumn('id', F.monotonically_increasing_id())


nodes = spark.createDataFrame(reader.read_nodes(), schema=StructType(
[
    StructField(name='id', dataType=StringType()),
    StructField(name='year', dataType=StringType()),
    StructField(name='authors', dataType=StringType())
]
))


In [None]:
g = graphframes.GraphFrame(e=edges, v=nodes)
to_gexf(g, 'data/graph.gexf', dynamic=True)

In [None]:
page_rank = g.pageRank(maxIter=5, resetProbability=0.9)

In [None]:
page_rank.edges.show()

In [None]:
years = g.edges.toPandas()['year']

In [None]:
e = g.edges.toPandas().iloc[:, [0,1]]
e.columns = ['source', 'target']
G = nx.DiGraph(e)

In [None]:
nxpd.draw(G)

In [None]:
years.describe()


In [None]:
df = pd.read_json(path_or_buf='data/retracted', lines=-1)
df = df[df.error.isnull()]
df.drop('error', axis=1, inplace=True)

In [None]:
df.references

In [None]:
df.inCitations[3692]

In [None]:
with open('data/s2-corpus-00', 'r') as f:
    for i,line in enumerate(f):
        if 'v4' in json.loads(line)['pmid']:
            print(json.loads(line)['pmid'])

In [None]:
%pdb
for itm in find_retracted('data/s2-corpus-00', pmid=pmids):
    print(itm['title'])

In [3]:
pmr = load_redacted()
pmids=set(pmr['Db'].apply(lambda x: str(x) + 'v1'))
dois = set(pmr['Description'].apply(get_doi))

In [4]:
pmids=set(pmr['Db'].apply(lambda x: str(x)))
dois = set(pmr['Description'].apply(get_doi))

finder = RetractionFinder(pmids=pmids)
finder.find('data/s2-corpus-00')
len(finder.found)

Avg Spd: 57218 lines/sec
Avg Spd: 58679 lines/sec
Avg Spd: 58869 lines/sec
Avg Spd: 58071 lines/sec
Avg Spd: 58020 lines/sec
Avg Spd: 58573 lines/sec
Avg Spd: 58670 lines/sec
Avg Spd: 58710 lines/sec
Avg Spd: 58869 lines/sec
Avg Spd: 58988 lines/sec
Avg Spd: 59033 lines/sec
Avg Spd: 59405 lines/sec
Avg Spd: 59449 lines/sec
Avg Spd: 59352 lines/sec
Avg Spd: 59341 lines/sec
Avg Spd: 59368 lines/sec
Avg Spd: 59385 lines/sec
Avg Spd: 59386 lines/sec
Avg Spd: 59352 lines/sec
Avg Spd: 59297 lines/sec
Avg Spd: 59300 lines/sec
Avg Spd: 59406 lines/sec
Avg Spd: 59397 lines/sec
Avg Spd: 59528 lines/sec
Avg Spd: 59499 lines/sec
Avg Spd: 59559 lines/sec
Avg Spd: 59602 lines/sec
Avg Spd: 59713 lines/sec
Avg Spd: 59668 lines/sec
Avg Spd: 59727 lines/sec
Avg Spd: 59697 lines/sec
Avg Spd: 59699 lines/sec
Avg Spd: 59675 lines/sec
Avg Spd: 59711 lines/sec
Avg Spd: 59641 lines/sec
Avg Spd: 59652 lines/sec
Avg Spd: 59664 lines/sec
Avg Spd: 59703 lines/sec
Avg Spd: 59717 lines/sec
Avg Spd: 59774 lines/sec


188

In [6]:
%pdb
finder2 = RetractionFinder(pmids=pmids, dois=dois)
finder2.find('s3://alexklein/capstone/data/s2-corpus-00')
len(finder.found)

Automatic pdb calling has been turned OFF


FileNotFoundError: [Errno 2] No such file or directory: 's3://alexklein/capstone/data/s2-corpus-00'

In [None]:
pmr[pmr['Db'] == 19818140]

In [9]:
24636669

24636669

In [None]:
27613500

In [None]:
'10.1186/1746-1596-9-59'
'10.1186/s13000-016-0534-y'

In [79]:
import boto3
import time
from src.data_reader import DataReader, RetractionFinder
from src.get_redacted import get_paper, load_redacted, get_doi, gen_retracted

pmr = load_redacted()
pmids=set(pmr['Db'].apply(lambda x: str(x)))
dois = set(pmr['Description'].apply(get_doi))

s3 = boto3.client('s3')
s3_finder = RetractionFinder(pmids=pmids, dois=dois)
tot_lines = 0
for i in range(40):
    t = time.process_time()
    corpus = s3.get_object(
        Bucket='alexklein', 
        Key=f'capstone/data/s2-corpus-{i if i >= 10 else "0" + str(i)}')
    s3_finder.search_stream(corpus['Body'])
    with open('data/retracted_articles', 'w') as f:
        for article in s3_finder.found:
            f.writelines(json.dumps(article))
            f.writelines('\n')
    
    print(f's2-corpus-{i if i >= 10 else "0" + str(i)} completed in {(time.process_time()-t):0.0f} s.')
    print(f'{len(s3_finder.found) - tot_lines} new lines found. {len(s3_finder.found)} lines total.\n')
    tot_lines = len(s3_finder.found)


s2-corpus-00 completed in 103 s.
4883 new lines found. 4883 lines total.

s2-corpus-01 completed in 97 s.
4676 new lines found. 9559 lines total.

s2-corpus-02 completed in 114 s.
4707 new lines found. 14266 lines total.

s2-corpus-03 completed in 105 s.
4601 new lines found. 18867 lines total.

s2-corpus-04 completed in 111 s.
4594 new lines found. 23461 lines total.

s2-corpus-05 completed in 110 s.
4668 new lines found. 28129 lines total.

s2-corpus-06 completed in 109 s.
4811 new lines found. 32940 lines total.

s2-corpus-07 completed in 121 s.
4670 new lines found. 37610 lines total.

s2-corpus-08 completed in 110 s.
4631 new lines found. 42241 lines total.

s2-corpus-09 completed in 115 s.
4648 new lines found. 46889 lines total.

s2-corpus-10 completed in 113 s.
4746 new lines found. 51635 lines total.

s2-corpus-11 completed in 126 s.
4680 new lines found. 56315 lines total.

s2-corpus-12 completed in 117 s.
4726 new lines found. 61041 lines total.

s2-corpus-13 completed in 12

In [49]:
with open('data/retracted_articles', 'r') as f:
    for line in f:
        j = json.loads(line)

In [80]:
with open('data/retracted_articles') as f:
    print(sum(1 for _ in f))

184716


Unnamed: 0,authors,doi,doiUrl,entities,id,inCitations,journalName,journalPages,journalVolume,outCitations,paperAbstract,pdfUrls,pmid,s2PdfUrl,s2Url,sources,title,venue,year
16,"[{'name': 'J Bakhach', 'ids': ['3991884']}, {'...",,,"[Anatomic structures, Body cavities, Bone Tiss...",eb5749074bb4a9979d9051ed93b345bb491d126c,"[55b4ccec284f7bf848907338cbee13368ecb78fb, a94...",Annales de chirurgie plastique et esthetique,705-14,50 6,[],The authors report a new technique of pulley p...,[],16165265v1,,https://semanticscholar.org/paper/eb5749074bb4...,[Medline],"[The Omega ""Omega"" pulley plasty. A new techni...",Annales de chirurgie plastique et esthetique,2005.0
19,"[{'name': 'Marco Carotenuto', 'ids': ['4056797...",10.1016/j.mehy.2013.04.023,https://doi.org/10.1016/j.mehy.2013.04.023,"[Airway Obstruction, Alveolar, Apnea, Blood Co...",224495f9340c055c5ae84f6f360ef26260478158,"[91259349282db63a0e0b354d4d1bb54065f78bfc, a11...",Medical hypotheses,306-8,81 2,"[f76807edcef589bf02a691554386dfc0afa3dc38, d9e...",Sleep-related breathing disorders (SRBD) are d...,[],23660129v1,,https://semanticscholar.org/paper/224495f9340c...,[Medline],Positional abnormalities during sleep in child...,Medical hypotheses,2013.0
25,"[{'name': 'Debjyoti Karmakar', 'ids': ['254409...",10.4103/0976-7800.227258,https://doi.org/10.4103/0976-7800.227258,"[Pharmacology, Physiological Sexual Disorders,...",d34ed84f217753f153acf3deb57685814711c6a7,"[c0bbe06c9bb67e0ea738f6e30f6318a4869243e2, 586...",,104 - 110,5,[e5873578ae3dcf305513e4b84ce22a0ca7d13d59],[This retracts the article on p. 104 in vol. 5...,[],5879851,http://pdfs.semanticscholar.org/d34e/d84f21775...,https://semanticscholar.org/paper/d34ed84f2177...,[Medline],Retraction: Current Concepts in Voiding Dysfun...,Journal of mid-life health,2014.0
40,"[{'name': 'Mark Gahegan', 'ids': ['1801583']},...",10.1007/978-3-319-11593-1_10,https://doi.org/10.1007/978-3-319-11593-1_10,"[Data model, Delta-sigma modulation, Framing (...",1e5369a42c676d778ed40540ef6bdad15eb98a48,"[3933b503688c8a93a36b80afd11793ba56a81aa8, 8cb...",,142-158,,"[36f00cb1e5347f783450f2339858872a95a1c867, 502...",Given the growth in geographical data producti...,[https://wiki.auckland.ac.nz/download/attachme...,,http://pdfs.semanticscholar.org/1e53/69a42c676...,https://semanticscholar.org/paper/1e5369a42c67...,[DBLP],Re-Envisioning Data Description Using Peirce's...,GIScience,2014.0
41,"[{'name': 'E A Ameh', 'ids': ['3655305']}, {'n...",,,"[Atrophic, Cryptorchidism, Hematoma, Inferior ...",53abea457425db3cfb16f137b5522a6cd65352e3,"[bbc8a4b0ab4c7206c12aad4869f1b7e40f760f1b, 410...",East African medical journal,485-7,77 9,"[3d4db49dd6e18c8611a1b9b844fc9ec14865806d, 695...","OBJECTIVE\nTo study the presentation, manageme...",[https://www.ajol.info/index.php/eamj/article/...,12862139v1,http://pdfs.semanticscholar.org/53ab/ea457425d...,https://semanticscholar.org/paper/53abea457425...,[Medline],Management of undescended testes in children i...,East African medical journal,2000.0
44,"[{'name': 'A Fjose', 'ids': ['6667135']}, {'na...",,,"[Embryo, Embryonic Development, Gastrula, Gene...",70ed891ef8f6965c005e23e290e3ef1285f61c69,"[0a75b4d97ca78217581f97d616d690861e46478a, 181...",Development,71-81,120 1,[],The zebrafish hlx-1 gene belongs to the H2.0 s...,[],7907015v1,,https://semanticscholar.org/paper/70ed891ef8f6...,[Medline],Expression of the zebrafish gene hlx-1 in the ...,Development,1994.0
46,"[{'name': 'Isabel Cantallops', 'ids': ['487433...",10.1038/79823,https://doi.org/10.1038/79823,"[AMPA Receptors, Axon, Eye, Glycosylphosphatid...",7ffb0b7ee2a852abf9697c22922863d58222fe84,"[4fe54b9108a01edd3209477673715d4f0dc9ad59, d28...",Nature Neuroscience,1004-1011,3,"[d6143d36f306ddb916a42319f866a501a1c5503a, c80...",The formation of CNS circuits is characterized...,[http://www.haaslab.com/_pdf/Haas_NatNeuro_00....,11017173v1,,https://semanticscholar.org/paper/7ffb0b7ee2a8...,[Medline],Postsynaptic CPG15 promotes synaptic maturatio...,Nature Neuroscience,2000.0
47,"[{'name': 'Changguan Fan', 'ids': ['2561760']}...",10.1002/spe.4380220706,https://doi.org/10.1002/spe.4380220706,"[Algorithm, Benchmark (computing), Data struct...",e402fbad8900c0ee0b1cb5dc52fa4634d2c01789,"[ba877c268abcf1f18fbc32008cf639127e5ec25a, 69a...","Softw., Pract. Exper.",573-597,22,"[12e5ef2048dcd2be3be53ad460a7ad1ea687c24f, a31...",The ET* algorithm is a complete evaluation str...,"[https://doi.org/10.1002/spe.4380220706, http:...",,http://pdfs.semanticscholar.org/e402/fbad8900c...,https://semanticscholar.org/paper/e402fbad8900...,[DBLP],Extension Table Built-ins for Prolog,"Softw., Pract. Exper.",1992.0
56,"[{'name': 'S L Erlandsen', 'ids': ['3755092']}...",,,"[Cytoskeletal Filaments, Diameter (qualifier v...",016b9f1f9ae2f8d8b97d43947c7a180bdda3d749,[],The Journal of eukaryotic microbiology,416-29,43 5,[],"Encystment of the intestinal protozoan, Giardi...",[],8822813v1,,https://semanticscholar.org/paper/016b9f1f9ae2...,[Medline],Formation of the Giardia cyst wall: studies on...,The Journal of eukaryotic microbiology,1996.0
61,"[{'name': 'M. J. Pébusque', 'ids': ['4609450']...",10.1007/BF01964015,https://doi.org/10.1007/BF01964015,"[Bile Salts, Cellular Structures, Cervical gan...",1101de5f3698e662b3ad93f7b8df29c70f990712,[],Experientia,1370-1372,35,[],The superior cervical ganglia of the rat have ...,[],499428v1,,https://semanticscholar.org/paper/1101de5f3698...,[Medline],Effects of various media on tissular and cellu...,Experientia,1979.0


In [55]:
len(pmr)

6347

In [147]:
df_retracted = pd.read_json('data/retracted_articles', lines=-1)

filter_terms = ('withdraw', 'retract')
filter_terms_2 = ('paper', 'study', 'article', 'publication')

df_retracted = df_retracted[(df_retracted['paperAbstract'].apply(lambda x: any(t1 in x.lower()[:40] and t2 in x.lower()[:40] 
                                                  for t1 in filter_terms
                                                  for t2 in filter_terms_2)) | 
df_retracted['title'].apply(lambda x: any(t1 in x.lower()[:10] for t1 in filter_terms)) |
df_retracted['doi'].apply(lambda x: x in dois) |
df_retracted['pmid'].apply(lambda x: x.split('v')[0] in pmids))]


['id',
 'authors',
 'entities',
 'venue',
 'sources',
 'year',
 'page_length',
 'retracted']

In [152]:
df_retracted['entities'].iloc[0]

['Adriamycin-Bleomycin-Vincristine-Etoposide Regimen (ABVE Regimen)',
 'Adverse reaction to drug',
 'Alkylating Agents',
 'Anthracyclines',
 'Biological Factors',
 'Bleomycin',
 'Cardiopulmonary',
 'Cyclophosphamide',
 'Disease-Free Survival',
 'Gonadal Disorders',
 'Gonadal structure',
 'Hodgkin Disease',
 'Leukemia, Myelocytic, Acute',
 'Leukemogenesis',
 'Low-Dose Treatment',
 'Lymphoma',
 'Lymphoma, Non-Hodgkin',
 'MOPP protocol',
 'Mechlorethamine',
 'Morbidity - disease rate',
 'Myeloid Leukemia',
 'Oncovin-Etoposide-Prednisone-Adriamycin Regimen (OEPA Regimen)',
 'Oncovin-Prednisone-Procarbazine-Adriamycin Regimen (OPPA Regimen)',
 'Parkinson Disease',
 'Patients',
 'Scientific Publication',
 'VAMP regimen (vincristine, doxorubicin, methotrexate and prednisone)',
 'benefit',
 'bleomycin/dacarbazine/doxorubicin/vinblastine protocol',
 'bleomycin/doxorubicin/vinblastine protocol',
 'cyclophosphamide/prednisone/procarbazine/vincristine protocol',
 'leukemia',
 'secondary acute myel