In [5]:
import xml
import requests
import pandas as pd
import numpy as np
import networkx as nx
import nxpd
import matplotlib.pyplot as plt
import matplotlib
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import dataframe
import graphframes

from src.data_reader import DataReader
# from src.to_gexf import to_gexf
from src.get_redacted import get_paper, load_redacted, get_doi


# Clear the `written` flag, if one exists, whenever this cell is (re-)run, so the
# guarded `reader.write(...)` cell below will perform its write again.
try:
    del written
except NameError:
    pass

In [2]:
# Create the DataReader for the 'data/s2-corpus-00' input; the following cells
# call its write(), read_edges() and read_nodes() methods.
reader = DataReader('data/s2-corpus-00')

In [3]:
# Run the write step only while no `written` flag is present in the notebook
# namespace, then set the flag so re-executing this cell does not write again.
# NOTE(review): the meaning of the argument 50 is defined by DataReader.write — confirm.
if 'written' not in globals():
    reader.write(50)
written = True

In [4]:
# Build the edge and vertex DataFrames from the reader's output. Every column is
# declared as a (nullable) string.
# NOTE(review): `spark` is not created in any cell shown here — presumably the
# kernel injects the session; confirm before relying on Restart & Run All.
edges = (
    spark.createDataFrame(
        reader.read_edges(),
        schema=StructType().add('src', StringType()).add('dst', StringType()),
    )
    # Append a generated `id` column to every edge row.
    .withColumn('id', F.monotonically_increasing_id())
)

nodes = spark.createDataFrame(
    reader.read_nodes(),
    schema=(
        StructType()
        .add('id', StringType())
        .add('year', StringType())
        .add('authors', StringType())
        .add('entities', StringType())
    ),
)


In [5]:
# Assemble the GraphFrame from the vertex and edge DataFrames and hand it to
# `to_gexf` together with the output path 'data/graph2.gexf'.
#
# The notebook-level `from src.to_gexf import to_gexf` is commented out, so on a
# fresh kernel `to_gexf` would be undefined and this cell would raise NameError.
# Import it at the point of use instead.
# NOTE(review): assumes the `src.to_gexf` module still exports `to_gexf` — confirm.
from src.to_gexf import to_gexf

g = graphframes.GraphFrame(e=edges, v=nodes)
to_gexf(g, 'data/graph2.gexf')

In [None]:
# Run PageRank on the graph with a fixed budget of 5 iterations.
# NOTE(review): resetProbability=0.9 is far from the commonly used 0.15; confirm
# it is intentional and not a mix-up with a 0.85-style damping factor.
page_rank = g.pageRank(maxIter=5, resetProbability=0.9)

In [None]:
# Print the edges DataFrame of the PageRank result.
page_rank.edges.show()

In [None]:
# Collect only the two endpoint columns of the edge list into pandas, addressed
# by name (`src`, `dst`) rather than by position: the generated edge `id` column
# is never transferred, and a column reorder cannot silently change the graph.
e = g.edges.select('src', 'dst').toPandas()
e.columns = ['source', 'target']

# Build the directed graph through the explicit edge-list constructor.
# Passing a DataFrame straight to nx.DiGraph() makes NetworkX guess between
# "adjacency matrix" and "edge list" from the frame's shape (a 2x2 frame would be
# read as a matrix), so spell out the intent instead.
G = nx.from_pandas_edgelist(
    e,
    source='source',
    target='target',
    create_using=nx.DiGraph(),
)

In [None]:
# Render the NetworkX digraph G with nxpd.
nxpd.draw(G)

In [39]:
# Load the line-delimited JSON records from 'data/retracted', keep only the rows
# whose `error` field is null, and then drop that column.
# `lines` is a boolean flag, so pass True rather than relying on -1 being truthy.
df = pd.read_json(path_or_buf='data/retracted', lines=True)

# Filter and drop in one chained expression: calling drop(..., inplace=True) on
# the filtered slice is the pattern that triggers SettingWithCopyWarning.
df = df[df['error'].isnull()].drop('error', axis=1)

In [50]:
# Display the `references` column of the filtered frame.
df.references

0                                                      []
1                                                      []
2       [{'arxivId': None, 'authors': [{'authorId': '5...
3                                                      []
4       [{'arxivId': None, 'authors': [{'authorId': No...
5       [{'arxivId': None, 'authors': [{'authorId': '5...
6                                                      []
7                                                      []
8                                                      []
9                                                      []
10                                                     []
11                                                     []
12      [{'arxivId': None, 'authors': [{'authorId': '4...
13      [{'arxivId': None, 'authors': [{'authorId': No...
14                                                     []
15                                                     []
16                                                     []
17      [{'arx

In [55]:
# Display the `inCitations` value stored under index 3692.
df.inCitations[3692]

{'1258': [],
 '42534': ['dabc3c8815de76bfa2799a9667259aaa98d5c498',
  '8936b6830ecd034671eef7773c20b6728b5b7201',
  '003374ef65a9f2f981f9f9027bb82819ce7beb3b',
  'c9599575f0d79bab1730cb854f5a42fb5db4ba7e',
  '80cfc7d32b8103f70795ce8b8c19af33cbe4d9eb'],
 '119252': [],
 '146135': [],
 '147922': [],
 '164138': ['a7d718a8c2595fcc6779d6f1c20dbc023159b919',
  '1ec15f64a6929389145cbb73beb856bb3775e44d',
  'a4d3e156705759fb63de20d2ed1308c01b846e16',
  '5255d419f8440bd5ba2eed4cf4d46e661d948681',
  '6b9734de2202552f906b6c3532811f618967dd8d'],
 '174028': [],
 '286242': [],
 '314925': ['c7d3d39f0f6ca018aa74f876386bb375dc1fd179',
  'b87c31978ba16856b7aa3cca79a0923489d4a155'],
 '339873': [],
 '377921': ['f5ae20c502baf961815a5178b4a272d1ef12702b',
  'c298739be9b82524591d7aa1f070ab4db1ecaced'],
 '420925': [],
 '448459': [],
 '468617': [],
 '556418': ['bda27f309679d01f30b949923893e90336ad139f',
  '0ff9e94eac5c7febda748d17576bdffdfc947fb6',
  '8b43c16c34d607a65b4fee4cf2fad60bf4dc3805'],
 '595906': ['076