In [1]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName('json-rdd').setMaster('local[*]')
sc = SparkContext(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/19 15:39:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
sc.setLogLevel('ERROR')

In [3]:
sc

In [4]:
sc.defaultParallelism

64

In [5]:
import json
rdd_json = sc.textFile('file:/config/workspace/JsonAnalysis/kaggle/arxiv/dataset.json/arxiv/arxiv-metadata-oai-snapshot.json', 100)
rdd = rdd_json.map(lambda x: json.loads(x))

In [7]:
rdd.take(1)

                                                                                

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [6]:
rdd.getNumPartitions()

113

In [7]:
# get count
rdd.count()

                                                                                

2276611

In [8]:
# get all attributes

rdd.flatMap(lambda x:x.keys()).distinct().collect()

                                                                                

['authors_parsed',
 'license',
 'authors',
 'versions',
 'id',
 'doi',
 'abstract',
 'journal-ref',
 'update_date',
 'title',
 'submitter',
 'categories',
 'report-no',
 'comments']

In [9]:
# get distinct licenses
rdd.map(lambda x: x['license']).distinct().collect()

                                                                                

[None,
 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/',
 'http://creativecommons.org/licenses/by-nc-nd/4.0/',
 'http://creativecommons.org/licenses/by-nc-sa/4.0/',
 'http://creativecommons.org/publicdomain/zero/1.0/',
 'http://creativecommons.org/licenses/by-sa/4.0/',
 'http://creativecommons.org/licenses/publicdomain/',
 'http://creativecommons.org/licenses/by/3.0/',
 'http://creativecommons.org/licenses/by/4.0/',
 'http://creativecommons.org/licenses/by-nc-sa/3.0/']

In [10]:
# get shortest and longest titles
titles_rdd = rdd.map(lambda x: x['title'])

shortest_title = titles_rdd.reduce(lambda x,y: x if len(x) < len(y) else y )
largest_title = titles_rdd.reduce(lambda x,y: x if len(x) > len(y) else y )
print('Shortest Title:', shortest_title)
print('Largest Title:', largest_title)



Shortest Title: 0
Largest Title: Investigation of the 2-body system with a rotating central body (e. g.
  earth-moon system) within the Projective Unified Field theory: the transfer
  of rotational angular momentum and energy from the central body to the
  orbital 2-body system, the tidal and the non-tidal influences (mechanical,
  general-relativistic Lense-Thirring effect and cosmological
  PUFT-contributions)


                                                                                

In [11]:
# find abbreviations with 5 or more letters in the abstract
import re

def get_abbreviations(line):
    result = re.search(r"\(([A-Za-z][^_/\\<>]{5,})\)", line)
    if result:
        return result.group(1)


In [12]:
rdd.filter(lambda x: get_abbreviations(x['abstract'])).count()

                                                                                

816661

In [13]:
rdd.take(1)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [17]:
# Get the number of archive records per month (update_date attribute)
from datetime import datetime

def extract_month(date_string):
    date = datetime.strptime(date_string, '%Y-%m-%d')
    return date.month

rdd_month = rdd.map(lambda x: (extract_month(x['update_date']), 1))

In [18]:
rdd_month.take(10)

[(11, 1),
 (12, 1),
 (1, 1),
 (5, 1),
 (10, 1),
 (5, 1),
 (11, 1),
 (2, 1),
 (3, 1),
 (5, 1)]

In [21]:
rdd_month.reduceByKey(lambda x,y: x+y).sortBy(lambda x: x[1]).collect()

                                                                                

[(7, 134445),
 (12, 139647),
 (1, 142071),
 (9, 150865),
 (8, 151084),
 (2, 151162),
 (4, 153724),
 (3, 170341),
 (10, 209438),
 (6, 222547),
 (11, 309797),
 (5, 341490)]

In [23]:
# Get the average number of pages (take from comments)
import re
def get_page(comment):
    if not comment:
        return 0
    search = re.findall('\d+ pages', comment)
    if search:
        return int(search[0].split()[0])
    else:
        return 0

rdd_average = rdd.map(lambda x: get_page(x['comments'])).filter(lambda x: x!= 0)

average_count = rdd_average.count()
average_summation = rdd_average.reduce(lambda x,y: x+y)

print('Average number of pages:', average_summation / average_count)



Average number of pages: 18.06259141637006


                                                                                

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 35114)
Traceback (most recent call last):
  File "/usr/local/conda/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/conda/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/conda/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/local/conda/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/conda/lib/python3.9/site-packages/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/usr/local/conda/lib/python3.9/site-packages/pyspark/accumulators.py", line 235, in poll
    if func():
  File "/usr/local/conda/lib/python3.9/site-packages/pyspark/accumulators.py", line 239, in ac