<a href="https://colab.research.google.com/github/FibGro/Pyspark/blob/main/JSON_using_DF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=6b2abf93a2c340093761ca1bd4acd348119ac1bb1e16287a50305781ef9e989b
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [8]:
df = spark.read.json('arxiv-metadata-oai-snapshot.json')
df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)



In [10]:
df.rdd.getNumPartitions()

2

In [38]:
# Create a new Schema

from pyspark.sql.types import *


Schema = StructType([
    StructField('authors', StringType(), True),
    StructField('categories', StringType(), True),
    StructField('license', StringType(), True),
    StructField('abstract', StringType(), True),
    StructField('comments', StringType(), True),
    StructField('versions', ArrayType(StringType()), True)


])

print(Schema)

StructType([StructField('authors', StringType(), True), StructField('categories', StringType(), True), StructField('license', StringType(), True), StructField('abstract', StringType(), True), StructField('comments', StringType(), True), StructField('versions', ArrayType(StringType(), True), True)])


In [42]:
# Binding data into a Schema

df = spark.read.json('arxiv-metadata-oai-snapshot.json', schema = Schema)
df.show()

+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|             authors|       categories|             license|            abstract|            comments|            versions|
+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|C. Bal\'azs, E. L...|           hep-ph|                NULL|  A fully differe...|37 pages, 15 figu...|[{"version":"v1",...|
|Ileana Streinu an...|    math.CO cs.CG|http://arxiv.org/...|  We describe a n...|To appear in Grap...|[{"version":"v1",...|
|         Hongjun Pan|   physics.gen-ph|                NULL|  The evolution o...| 23 pages, 3 figures|[{"version":"v1",...|
|        David Callan|          math.CO|                NULL|  We show that a ...|            11 pages|[{"version":"v1",...|
|Wael Abu-Shammala...|  math.CA math.FA|                NULL|  In this paper w...|                NULL|[{"version":"v1",...|


In [45]:
# Gt author names who published a [a[er in a 'math' category

# register DF to be used in SparkSQL

df.createOrReplaceTempView('Archive')

sql_query = """ SELECT authors
                FROM Archive
                WHERE categories LIKE 'math%' """

sql_query_df = spark.sql(sql_query)
sql_query_df.show()

+--------------------+
|             authors|
+--------------------+
|Ileana Streinu an...|
|        David Callan|
|Wael Abu-Shammala...|
|  Sergei Ovchinnikov|
|Clifton Cunningha...|
|         Dohoon Choi|
|Dohoon Choi and Y...|
|        Koichi Fujii|
|         Norio Konno|
|Simon J.A. Malham...|
|Robert P. C. de M...|
|  P\'eter E. Frenkel|
|          Mihai Popa|
|   Debashish Goswami|
|      Mikkel {\O}bro|
|Nabil L. Youssef,...|
|Wael Abu-Shammala...|
|         Boris Rubin|
|         A. I. Molev|
| Branko J. Malesevic|
+--------------------+
only showing top 20 rows



In [46]:
# # Get license with 5 or more letters in the abstract

sql_query = """ SELECT distinct(license)
                FROM Archive
                WHERE abstract REGEXP '%\(([A-Za-z][^_?\\<>]{5,})\)%' """

sql_query_df = spark.sql(sql_query)
sql_query_df.show()

+--------------------+
|             license|
+--------------------+
|http://arxiv.org/...|
|http://creativeco...|
|http://creativeco...|
|                NULL|
+--------------------+



In [47]:
# Extract the statitics of the number of pages for unknown licenses

# Get the average of pages

import re

def get_page(line):
  search = re.findall('\d+ pages', line)
  if search:
    return int(search[0].split(' ')[0])
  else:
    return 0


In [51]:
# Extract the statitics of the number of pages for unknown licenses

# Get the average of pages

import re

def get_page(line):
  if line is not None: # Check if line is not null
    search = re.findall('\d+ pages', line)
    if search:
      return int(search[0].split(' ')[0])
  return 0 # Return 0 for null or no match cases

# Register the udf

spark.udf.register('PageNumbers', get_page)

# Corrected SQL query with single quotes
sql_query = """ SELECT AVG(PageNumbers(comments)) AS avg_pages,
                  SUM(PageNumbers(comments)) AS sum_pages
                  FROM Archive
                  WHERE license IS NULL """

sql_query_df = spark.sql(sql_query)
sql_query_df.show()

+-----------------+---------+
|        avg_pages|sum_pages|
+-----------------+---------+
|10.99002891844997| 475044.0|
+-----------------+---------+

