In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').appName('json-df').getOrCreate()

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/19 16:16:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/19 16:16:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
spark.sparkContext.setLogLevel('ERROR')

In [6]:
df = spark.read.json('file:/config/workspace/JsonAnalysis/kaggle/arxiv/dataset.json/arxiv/arxiv-metadata-oai-snapshot.json')
df.printSchema()



root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)



                                                                                

In [7]:
df.rdd.getNumPartitions()

64

In [20]:
from pyspark.sql.types import *

Schema = StructType([
    StructField('abstract', StringType(), True),
    StructField('authors', StringType(), True),
    StructField('categories', StringType(), True),
    StructField('license', StringType(), True),
    StructField('comments', StringType(), True),
    StructField('versions', ArrayType(StringType()), True)
])

In [21]:
df = spark.read.json('file:/config/workspace/JsonAnalysis/kaggle/arxiv/dataset.json/arxiv/arxiv-metadata-oai-snapshot.json', schema=Schema)

In [22]:
df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
df.count()

                                                                                

2276611

In [24]:
df.filter('comments is null').count()

                                                                                

536123

In [None]:
# drop the records where comments is null and replace null license with 'Unknown'

In [25]:
df = df.dropna(subset=['comments'])
df.count()

                                                                                

1740488

In [26]:
2276611 - 536123

1740488

In [28]:
df.filter('license is null').count()

                                                                                

422038

In [29]:
df.filter('license = "Unknown"').count()

                                                                                

0

In [30]:
df = df.fillna(value='Unknown',subset=['license'])

In [31]:
df.filter('license = "Unknown"').count()

                                                                                

422038

In [35]:
# Get the author names who published a paper in math category

df.select('categories').show()

+--------------------+
|          categories|
+--------------------+
|              hep-ph|
|       math.CO cs.CG|
|      physics.gen-ph|
|             math.CO|
|   cond-mat.mes-hall|
|               gr-qc|
|   cond-mat.mtrl-sci|
|             math.CO|
|     math.NT math.AG|
|     math.CA math.AT|
|              hep-th|
|              hep-ph|
|            astro-ph|
|              hep-th|
|     math.PR math.AG|
|              hep-ex|
|nlin.PS physics.c...|
|             math.NA|
|            astro-ph|
|             nlin.PS|
+--------------------+
only showing top 20 rows



In [43]:
df.filter('categories like "math%"').select('authors').show()

+--------------------+
|             authors|
+--------------------+
|Ileana Streinu an...|
|        David Callan|
|  Sergei Ovchinnikov|
|Clifton Cunningha...|
|        Koichi Fujii|
|         Norio Konno|
|Simon J.A. Malham...|
|Robert P. C. de M...|
|  P\'eter E. Frenkel|
|          Mihai Popa|
|   Debashish Goswami|
|      Mikkel {\O}bro|
|Nabil L. Youssef,...|
|         Boris Rubin|
|         A. I. Molev|
| Branko J. Malesevic|
|   John W. Robertson|
|     Yu.N. Kosovtsov|
|        Osamu Fujino|
|Stephen C. Power ...|
+--------------------+
only showing top 20 rows



In [42]:
df.createOrReplaceTempView('Archive')

spark.sql("""
        select authors from Archive where categories like 'math%'
""").count()

                                                                                

336050

In [48]:
# Get licenses with 5 or more letters in the abstract

spark.sql("""
    select distinct license from Archive where abstract regexp '%\(([A-Za-z][^_/\\<>]{5,})\)%'
""").show()



+--------------------+
|             license|
+--------------------+
|             Unknown|
|http://arxiv.org/...|
|http://creativeco...|
|http://creativeco...|
|http://creativeco...|
|http://creativeco...|
|http://creativeco...|
|http://creativeco...|
+--------------------+



                                                                                

In [49]:
# Extract the statistics of th number of pages for Unknown datasets

import re
def get_page(comment):
    if not comment:
        return 0
    search = re.findall('\d+ pages', comment)
    if search:
        return int(search[0].split()[0])
    else:
        return 0

spark.udf.register('PageNumbers', get_page)

spark.sql("""
        select avg(PageNumbers(comments)) as _avg, sum(PageNumbers(comments)) as _sum,
        std(PageNumbers(comments)) as _std from Archive where license = 'Unknown'

""").show()

                                                                                

+------------------+---------+-----------------+
|              _avg|     _sum|             _std|
+------------------+---------+-----------------+
|13.366812941014791|5641303.0|16.77684387442691|
+------------------+---------+-----------------+

