# **Arxiv metadata Analytics with PySpark RDD: JSON case study**


In [1]:
!pip3 install -q pyspark
!pip3 install -q findspark

In [2]:
import findspark
findspark.init()

In [3]:
# Initializing Spark

from pyspark import SparkConf, SparkContext
config = SparkConf().setAppName("Arxiv metadata Analytics with RDD").setMaster("local[*]")
sc = SparkContext(conf=config)


In [4]:
sc

In [5]:
# Read and Load Data to Spark
# Data source: https://www.kaggle.com/Cornell-University/arxiv/version/62
rdd_json = sc.textFile("./archive/arxiv-metadata-oai-snapshot.json",100)

import json

rdd = rdd_json.map(lambda x: json.loads(x))

from pyspark import StorageLevel
rdd.persist(StorageLevel.MEMORY_AND_DISK)
rdd.take(2)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [6]:
# Check the number of parallelism and partitions:
print(sc.defaultParallelism)
print(rdd.getNumPartitions())

2
100


## Question 1: Count elements

In [7]:
rdd.count()

2011231

## Question 2: Get the first two records


In [8]:
rdd.take(2)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

## Question 3: Get all attributes


In [9]:
rdd.flatMap(lambda x: x).distinct().collect()

['authors',
 'comments',
 'title',
 'id',
 'journal-ref',
 'versions',
 'submitter',
 'categories',
 'update_date',
 'authors_parsed',
 'report-no',
 'license',
 'abstract',
 'doi']

In [10]:
rdd.flatMap(lambda x: x.keys()).distinct().collect()

['authors',
 'comments',
 'title',
 'id',
 'journal-ref',
 'versions',
 'submitter',
 'categories',
 'update_date',
 'authors_parsed',
 'report-no',
 'license',
 'abstract',
 'doi']

## Question 4: Get the name of the licenses

In [11]:
rdd.map(lambda x: x['license']).distinct().collect()

[None,
 'http://creativecommons.org/licenses/publicdomain/',
 'http://creativecommons.org/licenses/by-nc-nd/4.0/',
 'http://creativecommons.org/licenses/by-nc-sa/4.0/',
 'http://creativecommons.org/licenses/by-nc-sa/3.0/',
 'http://creativecommons.org/licenses/by/3.0/',
 'http://creativecommons.org/licenses/by/4.0/',
 'http://creativecommons.org/publicdomain/zero/1.0/',
 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/',
 'http://creativecommons.org/licenses/by-sa/4.0/']

## Question 5: Get the shortest and the longest titles

In [12]:
longest = rdd.map(lambda x: x['title']).reduce(lambda x,y: x if x > y else y)
longest

"\x7fWeyl \x7fformula for the negative dissipative \x7feigenvalues of Maxwell's\n  equations"

In [13]:
shortest = rdd.map(lambda x: x['title']).reduce(lambda x,y: x if x < y else y)
shortest

'!-Graphs with Trivial Overlap are Context-Free'

### Question 6: Find abbreviations with 5 or more letters in the abstract

In [14]:
import re
def findOutPattern(string):
    result = re.search(r"\(([A-Za-z][^_ /\\<>]{5,})\)",string)
    if result:
        return result.group(1)



In [15]:
rdd.filter(lambda x: findOutPattern(x['abstract'])).take(10)

[{'id': '0704.0055',
  'submitter': 'Per Hyldgaard',
  'authors': 'Eleni Ziambaras, Jesper Kleis, Elsebeth Schroder, and Per Hyldgaard',
  'title': 'Potassium intercalation in graphite: A van der Waals density-functional\n  study',
  'comments': '10 pages, 5 figures',
  'journal-ref': None,
  'doi': '10.1103/PhysRevB.76.155425',
  'report-no': None,
  'categories': 'cond-mat.soft cond-mat.mtrl-sci',
  'license': None,
  'abstract': '  Potassium intercalation in graphite is investigated by first-principles\ntheory. The bonding in the potassium-graphite compound is reasonably well\naccounted for by traditional semilocal density functional theory (DFT)\ncalculations. However, to investigate the intercalate formation energy from\npure potassium atoms and graphite requires use of a description of the graphite\ninterlayer binding and thus a consistent account of the nonlocal dispersive\ninteractions. This is included seamlessly with ordinary DFT by a van der Waals\ndensity functional (vdW-DF

## Question 7: Get the number of archive records per month ('update_date' attribute)

In [16]:
def extract_months(date_str):
    match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_str)
    print(match)
    if match:
        month = int(match.group(2))  # Convert the matched month to an integer
        return month
    

In [17]:
rdd.map(lambda x : (extract_months(x['update_date']),1)).reduceByKey(lambda x,y: x+y).collect()

[(1, 134247),
 (2, 116948),
 (3, 126458),
 (4, 117126),
 (5, 296587),
 (6, 191746),
 (7, 122649),
 (8, 138469),
 (9, 138978),
 (10, 197755),
 (11, 297963),
 (12, 132305)]

In [18]:
# This approach is faster, since datetime is a built-in Python method optimized for parsing date strings
import datetime

def extract_date(line):
    d = datetime.datetime.strptime(line, "%Y-%m-%d")
    return d.month

extract_date('2000-01-01')

1

In [19]:
rdd.map(lambda x: (extract_date(x['update_date']),1)).reduceByKey(lambda x,y: x+y).sortBy(lambda x: x[1]).collect()

[(2, 116948),
 (4, 117126),
 (7, 122649),
 (3, 126458),
 (12, 132305),
 (1, 134247),
 (8, 138469),
 (9, 138978),
 (6, 191746),
 (10, 197755),
 (5, 296587),
 (11, 297963)]

## Question 8: Get the average number of pages

In [20]:
rdd.take(2)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [21]:
# import re
line = "37 pages"

def extract_Page(line):
    
    search = re.findall("\d+ pages", line)
    if search:
        return int(search[0].split(" ")[0])
    else:
        return 0
extract_Page(line)

37

In [22]:
rdd_average = rdd.map(lambda x: extract_Page(x['comments'] if x['comments'] != None else "None"))

In [23]:
rdd_average = rdd_average.filter(lambda x: x != 0)

In [24]:
average_sum = rdd_average.reduce(lambda x,y: x+y)
average_count = rdd_average.count()

In [25]:
round(average_sum/average_count)

18

In [26]:
# Get The "Average number of pages per day for version 1"

In [27]:
rdd.take(2)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [28]:
rdd.map(lambda x: x['versions'][0]['version']).take(5)

['v1', 'v1', 'v1', 'v1', 'v1']

In [29]:
import re
line = '37 pages'
def extract_pages_ch(line):
    res = re.findall(r'(\d+) pages',line)
    if res:
        return int(res[0])
    else:
        return 0
extract_pages_ch(line)

37

In [41]:
ch_rdd = rdd.filter(lambda x: x['versions'][0]['version'] == 'v1').map(lambda x: (" ".join(x['versions'][0]['created'].split(" ")[1:4]),extract_pages_ch(x['comments']) if x['comments'] != None else 0))

In [52]:
rdd.take(1)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [68]:
ch_rdd = rdd.filter(lambda x: x['versions'][0]['version'] == 'v1').map(lambda x: ((x['versions'][0]['created'].split(",")[0]),extract_pages_ch(x['comments']) if x['comments'] != None else 0))

In [69]:
ch_rdd = ch_rdd.filter(lambda x: x[1] != 0)

In [70]:
rdd.map(lambda x: x['versions'][0]['created'].split(",")[0]).take(6)

['Mon', 'Sat', 'Sun', 'Sat', 'Mon', 'Sat']

In [71]:
ch_rdd_avg = ch_rdd.reduceByKey(lambda x,y : x+y)

In [72]:
ch_rdd_avg


PythonRDD[129] at RDD at PythonRDD.scala:53

In [73]:
ch_rdd_count = ch_rdd.map(lambda x: (x[0],1)).reduceByKey(lambda x,y : x+y)

In [74]:
ch_rdd_count

PythonRDD[134] at RDD at PythonRDD.scala:53

In [75]:
ch_joined = ch_rdd_count.join(ch_rdd_avg)

In [76]:
ch_joined.persist()

PythonRDD[138] at RDD at PythonRDD.scala:53

In [77]:
ch_joined.persist()
ch_joined.take(10)

[('Fri', (186742, 3295737)),
 ('Sun', (68141, 1256685)),
 ('Sat', (65750, 1175554)),
 ('Thu', (210916, 3751476)),
 ('Tue', (219982, 3916858)),
 ('Wed', (214576, 3823418)),
 ('Mon', (217968, 3919788))]

In [80]:
ch_average_res = ch_joined.map(lambda x: (x[1][1]/x[1][0],x[0])).sortByKey(False)

In [81]:
ch_average_res.collect()

[(18.44242086262309, 'Sun'),
 (17.983318652279234, 'Mon'),
 (17.879148288973383, 'Sat'),
 (17.818479233465066, 'Wed'),
 (17.80535680192016, 'Tue'),
 (17.78658802556468, 'Thu'),
 (17.648611453234945, 'Fri')]