In [83]:
# Locate the local Spark installation so that pyspark can be imported.
# NOTE(review): per the findspark docs, init() is meant to run before any
# pyspark import — keep this ahead of the pyspark imports below.
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import format_number, mean, min, max, corr
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format)
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.functions import explode
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, element_at, size, split
from pyspark.sql.functions import udf 
from pyspark.sql.functions import count
from pyspark.sql.functions import length

In [2]:
# Configure Spark before the session is built: request 8 GB per executor.
from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.set('spark.executor.memory', '8g')
# Use getOrCreate() rather than the bare constructor: constructing a second
# SparkContext while one is alive raises
# "ValueError: Cannot run multiple SparkContexts at once", which made this
# cell fail whenever it was re-run without restarting the kernel.
# Caveat: if a context already exists it is returned as-is and `conf` is NOT
# applied to it — restart the kernel to change these settings.
SparkContext.getOrCreate(conf=conf)

In [3]:
# Obtain the SparkSession for this notebook under the application name "fp",
# then display its runtime configuration object as the cell output.
session_builder = SparkSession.builder.appName("fp")
spark = session_builder.getOrCreate()
spark.conf

<pyspark.sql.conf.RuntimeConfig at 0x7f65ea3b2190>

In [4]:
# Read the raw JSON dump (multi-line document, permissive parsing, ASCII
# encoding), then turn every element of the nested `hits.hits` array into its
# own row under the column name "paper".
json_reader = (
    spark.read
    .option("multiLine", True)
    .option("mode", "PERMISSIVE")
    .option("encoding", "ascii")
)
papers_ = json_reader.json("finalProjData.json")
hit_records = col("hits.hits")
papers = papers_.select(explode(hit_records).alias("paper"))

In [5]:
 def ascii_ignore(x):
     """Return `x` with every non-ASCII character removed.

     Args:
         x: The string to clean, or None.

     Returns:
         The input with all characters that cannot be encoded as ASCII
         dropped, or None when `x` is None.
     """
     # Null column values reach a Python UDF as None; pass them through
     # instead of failing with AttributeError on None.encode.
     if x is None:
         return None
     return x.encode('ascii', 'ignore').decode('ascii')
# Wrap ascii_ignore as a Spark UDF so it can be applied to DataFrame columns.
# No return type is passed to udf() here, so the library default applies
# (string, per the PySpark docs — confirm for the installed version).
ascii_udf = udf(ascii_ignore)

In [17]:
# Flatten each nested paper record into the handful of fields used below:
# first title, first abstract, creation timestamp, page count, keywords,
# number of references and the authors' full names. The title is then
# passed through ascii_udf, replacing the "title" column in place.
first_title = element_at(col("paper.metadata.titles.title"), 1).alias("title")
first_abstract = element_at(col("paper.metadata.abstracts.value"), 1).alias("abstract")
reference_count = size(col("paper.metadata.references")).alias("num_refs")
author_names = col("paper.metadata.authors.full_name").alias("authors")

selected_fields = papers.select(
    first_title,
    first_abstract,
    col("paper.created"),
    col("paper.metadata.number_of_pages"),
    col("paper.metadata.keywords"),
    reference_count,
    author_names,
)
short_papers = selected_fields.withColumn("title", ascii_udf("title"))
short_papers.printSchema()

root
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- created: string (nullable = true)
 |-- number_of_pages: long (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- schema: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- num_refs: integer (nullable = false)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [22]:
# Peek at the nested keyword structs of the first 15 papers.
keywords_only = short_papers.select("keywords")
keywords_only.show(15)

+--------------------+
|            keywords|
+--------------------+
|[{INSPIRE, classi...|
|[{INSPIRE, null, ...|
|[{INSPIRE, classi...|
|[{INSPIRE, classi...|
|[{INSPIRE, null, ...|
|[{null, author, P...|
|[{INSPIRE, null, ...|
|[{INSPIRE, classi...|
|[{INSPIRE, null, ...|
|[{INSPIRE, null, ...|
|[{INSPIRE, null, ...|
|[{INSPIRE, classi...|
|[{INSPIRE, classi...|
|[{null, author, p...|
|[{INSPIRE, null, ...|
+--------------------+
only showing top 15 rows



In [23]:
# Preview the flattened paper table (show() with its default arguments).
short_papers.show()

+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+
|               title|            abstract|             created|number_of_pages|            keywords|num_refs|             authors|
+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+
|Improving heavy D...|In this work, by ...|2022-01-03T02:41:...|             26|[{INSPIRE, classi...|     113|[Feng, Jie, Li, M...|
|Analytic solution...|We have derived t...|2022-01-03T05:20:...|             28|[{INSPIRE, null, ...|     143|[Wang, Dong-Lin, ...|
|Meson structure o...|This is the third...|2022-01-03T02:41:...|             36|[{INSPIRE, classi...|      39|[Shuryak, Edward,...|
|Coupled-channel a...|We study the proc...|2022-01-03T02:44:...|             10|[{INSPIRE, classi...|      30|[Surovtsev, Yury ...|
|Spinodal Gravitat...|We uncover a new ...|2022-01-03T02:48:...|            

In [25]:
# Tag every paper with an "id" column so that results computed from its
# abstract can be attributed back to the paper.
# NOTE(review): per the Spark docs these ids are unique and increasing but
# not guaranteed to be consecutive across partitions.
from pyspark.sql.functions import monotonically_increasing_id
paper_id = monotonically_increasing_id()
papersWIDs = short_papers.withColumn("id", paper_id)

In [26]:
# Preview the paper table now that the "id" column has been added.
papersWIDs.show()

+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+---+
|               title|            abstract|             created|number_of_pages|            keywords|num_refs|             authors| id|
+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+---+
|Improving heavy D...|In this work, by ...|2022-01-03T02:41:...|             26|[{INSPIRE, classi...|     113|[Feng, Jie, Li, M...|  0|
|Analytic solution...|We have derived t...|2022-01-03T05:20:...|             28|[{INSPIRE, null, ...|     143|[Wang, Dong-Lin, ...|  1|
|Meson structure o...|This is the third...|2022-01-03T02:41:...|             36|[{INSPIRE, classi...|      39|[Shuryak, Edward,...|  2|
|Coupled-channel a...|We study the proc...|2022-01-03T02:44:...|             10|[{INSPIRE, classi...|      30|[Surovtsev, Yury ...|  3|
|Spinodal Gravitat...|We uncover a new ...|2022-

In [43]:
# Drop papers with no abstract.
# dropna() is restricted to the "abstract" column: with no arguments it
# removes every row that has a null in ANY column, which also discarded
# papers that merely lacked e.g. a page count or keywords even though their
# abstract was present.
papersWIDs = papersWIDs.dropna(subset=["abstract"])

In [61]:
# Build one row per word occurrence: the paper id, the full array of
# space-separated tokens from the abstract, and each token together with
# its index within that array.
abstract_tokens = F.split("abstract", " ")
words = papersWIDs.select(
    "id",
    abstract_tokens.alias("abstractWords"),
    F.posexplode(abstract_tokens).alias("position", "word"),
)
words.show(50)

+---+--------------------+--------+-------------+
| id|       abstractWords|position|         word|
+---+--------------------+--------+-------------+
|  0|[In, this, work,,...|       0|           In|
|  0|[In, this, work,,...|       1|         this|
|  0|[In, this, work,,...|       2|        work,|
|  0|[In, this, work,,...|       3|           by|
|  0|[In, this, work,,...|       4|        using|
|  0|[In, this, work,,...|       5|          the|
|  0|[In, this, work,,...|       6|      machine|
|  0|[In, this, work,,...|       7|     learning|
|  0|[In, this, work,,...|       8|     methods,|
|  0|[In, this, work,,...|       9|           we|
|  0|[In, this, work,,...|      10|        study|
|  0|[In, this, work,,...|      11|          the|
|  0|[In, this, work,,...|      12|sensitivities|
|  0|[In, this, work,,...|      13|           of|
|  0|[In, this, work,,...|      14|        heavy|
|  0|[In, this, work,,...|      15| pseudo-Dirac|
|  0|[In, this, work,,...|      16|     neutrino|


In [75]:
# Count how many times each distinct word occurs in each paper's abstract,
# then display the counts ordered by paper id and ascending count.
word_occurrences = words.select("id", "word")
abs_word_count = word_occurrences.groupBy("id", "word").agg(F.count("word"))
abs_word_count.orderBy("id", "count(word)").show()

+---+-------------+-----------+
| id|         word|count(word)|
+---+-------------+-----------+
|  0|   Perceptron|          1|
|  0|         this|          1|
|  0|       signal|          1|
|  0|      events.|          1|
|  0|  observables|          1|
|  0|     separate|          1|
|  0|           27|          1|
|  0|        found|          1|
|  0|           1$|          1|
|  0|      missing|          1|
|  0|   transverse|          1|
|  0|           In|          1|
|  0|         from|          1|
|  0|        work,|          1|
|  0|        (with|          1|
|  0|        boson|          1|
|  0|      charged|          1|
|  0|sensitivities|          1|
|  0|           or|          1|
|  0|          100|          1|
+---+-------------+-----------+
only showing top 20 rows



In [91]:
# Keep only words that are longer than 4 characters and contain no dollar
# sign (the '$' tokens are presumably inline LaTeX fragments — confirm).
word_col = col("word")
is_long_enough = length(word_col) > 4
has_no_dollar = ~word_col.contains('$')
filtered = abs_word_count.filter(is_long_enough & has_no_dollar)
filtered.orderBy("id").show()

+---+---------------+-----------+
| id|           word|count(word)|
+---+---------------+-----------+
|  0|        Boosted|          1|
|  0|     background|          1|
|  0|        events.|          1|
|  0|       neutrino|          4|
|  0|     Perceptron|          1|
|  0|          found|          1|
|  0|         signal|          1|
|  0|     production|          1|
|  0|    observables|          1|
|  0|         mixing|          1|
|  0|     transverse|          1|
|  0|          boson|          1|
|  0|          (with|          1|
|  0|  sensitivities|          1|
|  0|        charged|          1|
|  0|       Gradient|          1|
|  0|discrimination.|          1|
|  0|    high-energy|          1|
|  0|       separate|          1|
|  0|      kinematic|          1|
+---+---------------+-----------+
only showing top 20 rows



In [49]:
# NOTE(review): `new` is not defined in any cell visible above; this cell
# appears to depend on kernel state left by a removed or out-of-order cell
# and would raise NameError on a fresh "Restart & Run All" unless it is
# defined elsewhere. Define it explicitly or delete this cell.
new.show()

+--------------------+--------------------+
|                Hits|                 col|
+--------------------+--------------------+
|[{[{2022-01-03T02...|[{2022-01-03T02:4...|
+--------------------+--------------------+



In [50]:
# NOTE(review): `new` is not defined in any cell visible above — same
# hidden-state concern as the preceding cell; confirm where it comes from.
new.printSchema()

root
 |-- Hits: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- hits: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- created: string (nullable = true)
 |    |    |    |    |-- id: string (nullable = true)
 |    |    |    |    |-- links: struct (nullable = true)
 |    |    |    |    |    |-- bibtex: string (nullable = true)
 |    |    |    |    |    |-- citations: string (nullable = true)
 |    |    |    |    |    |-- json: string (nullable = true)
 |    |    |    |    |    |-- latex-eu: string (nullable = true)
 |    |    |    |    |    |-- latex-us: string (nullable = true)
 |    |    |    |    |-- metadata: struct (nullable = true)
 |    |    |    |    |    |-- $schema: string (nullable = true)
 |    |    |    |    |    |-- _oai: struct (nullable = true)
 |    |    |    |    |    |    |-- id: string (nullable = true)
 |    |    |    |    |    |    |-- sets: array (nullable = true)
 |  