In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import format_number, mean, min, max, corr
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format)
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.functions import explode
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, element_at, size, split
from pyspark.sql.functions import udf 
from pyspark.sql.functions import count
from pyspark.sql.functions import length

In [2]:
from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.set('spark.executor.memory', '8g')
SparkContext(conf=conf)

In [3]:
spark = SparkSession.builder.appName("fp").getOrCreate()
spark.conf

<pyspark.sql.conf.RuntimeConfig at 0x7ff968dbc160>

In [4]:
papers_ = spark.read.option("multiLine", True).option("mode", "PERMISSIVE").option("encoding", "ascii").json("../data/AL_papers.json")
papers = papers_.select(explode(col("hits.hits")).alias("paper"))

In [5]:
 def ascii_ignore(x):
    return x.encode('ascii', 'ignore').decode('ascii')
ascii_udf = udf(ascii_ignore)

In [6]:
short_papers = papers.select(
    element_at(col("paper.metadata.titles.title"), 1).alias("title"),
    element_at(col("paper.metadata.abstracts.value"), 1).alias("abstract"),
    col("paper.created"), col("paper.metadata.number_of_pages"),
    col("paper.metadata.keywords"), size(
        col("paper.metadata.references")).alias("num_refs"),
    col("paper.metadata.authors.full_name").alias("authors")
).withColumn("title", ascii_udf("title"));
short_papers.printSchema()

root
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- created: string (nullable = true)
 |-- number_of_pages: long (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- schema: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- num_refs: integer (nullable = false)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [7]:
short_papers.select("keywords").show(15, truncate = False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|keywords                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [8]:
short_papers.show()

+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+
|               title|            abstract|             created|number_of_pages|            keywords|num_refs|             authors|
+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+
|Charmonia product...|In this paper, pr...|2021-03-26T00:00:...|             15|[{PACS, null, 14....|      35|[Luchinsky, A.V.,...|
|Exclusive decays ...|Exclusive decays ...|2020-07-09T00:00:...|             10|[{null, publisher...|      42|[Luchinsky, A.V.,...|
|Doubly heavy bary...|The theoretical a...|2019-12-11T00:00:...|              8|[{INSPIRE, null, ...|      40|[Berezhnoy, A.V.,...|
|Weak decays of do...|We consider exclu...|2019-05-29T00:00:...|             10|[{null, publisher...|      21|[Gerasimov, A.S.,...|
|$B_c$ excitations...|Status of the Bc ...|2019-11-21T00:00:...|            

# Add an ID to Each Unique Paper in our dataset

In [11]:
#Adding an ID to each paper so the abstract data analysis can be attributed to a paper
from pyspark.sql.functions import monotonically_increasing_id
papersWIDs = short_papers.withColumn("id", monotonically_increasing_id())

In [12]:
papersWIDs.printSchema()

root
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- created: string (nullable = true)
 |-- number_of_pages: long (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- schema: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- num_refs: integer (nullable = false)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: long (nullable = false)



In [13]:
papersWIDs.show();

+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+---+
|               title|            abstract|             created|number_of_pages|            keywords|num_refs|             authors| id|
+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+---+
|Charmonia product...|In this paper, pr...|2021-03-26T00:00:...|             15|[{PACS, null, 14....|      35|[Luchinsky, A.V.,...|  0|
|Exclusive decays ...|Exclusive decays ...|2020-07-09T00:00:...|             10|[{null, publisher...|      42|[Luchinsky, A.V.,...|  1|
|Doubly heavy bary...|The theoretical a...|2019-12-11T00:00:...|              8|[{INSPIRE, null, ...|      40|[Berezhnoy, A.V.,...|  2|
|Weak decays of do...|We consider exclu...|2019-05-29T00:00:...|             10|[{null, publisher...|      21|[Gerasimov, A.S.,...|  3|
|$B_c$ excitations...|Status of the Bc ...|2019-

# Data Manipulation - Create a dataframe with abstract word counts

In [14]:
#drop papers with no abstracts
papersWIDs = papersWIDs.dropna()

In [15]:
#Creates a new DF with the papers ID, and each word in the abstract alongside its position in the abstract
words = papersWIDs.select(
        "id",
        F.split("abstract", " ").alias("abstractWords"),
        F.posexplode(F.split("abstract", " ")).alias("position", "word")
    )
#words.show(50)

In [16]:
words.printSchema()

root
 |-- id: long (nullable = false)
 |-- abstractWords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- position: integer (nullable = false)
 |-- word: string (nullable = true)



In [18]:
words.show(50)

+---+--------------------+--------+----------------+
| id|       abstractWords|position|            word|
+---+--------------------+--------+----------------+
|  0|[In, this, paper,...|       0|              In|
|  0|[In, this, paper,...|       1|            this|
|  0|[In, this, paper,...|       2|          paper,|
|  0|[In, this, paper,...|       3|      production|
|  0|[In, this, paper,...|       4|              of|
|  0|[In, this, paper,...|       5|      charmonium|
|  0|[In, this, paper,...|       6|           state|
|  0|[In, this, paper,...|       7|            ����|
|  0|[In, this, paper,...|       8|              in|
|  0|[In, this, paper,...|       9|       exclusive|
|  0|[In, this, paper,...|      10|               W|
|  0|[In, this, paper,...|      11|  �������Ds(���)|
|  0|[In, this, paper,...|      12|          decays|
|  0|[In, this, paper,...|      13|              is|
|  0|[In, this, paper,...|      14|        analyzed|
|  0|[In, this, paper,...|      15|           

In [19]:
#pair each paper with their word counts
abs_word_count = words.select('id', 'word').groupBy('id','word').agg({"word": "count"})
abs_word_count.sort(col("id"),col("count(word)")).show()

+---+----------+-----------+
| id|      word|count(word)|
+---+----------+-----------+
|  0|     turns|          1|
|  0| estimates|          1|
|  0|      into|          1|
|  0|     final|          1|
|  0| presented|          1|
|  0|charmonium|          1|
|  0|   numbers|          1|
|  0|  internal|          1|
|  0|      ones|          1|
|  0|production|          1|
|  0|      ����|          1|
|  0|        so|          1|
|  0|        In|          1|
|  0|      than|          1|
|  0|     could|          1|
|  0|        or|          1|
|  0|         W|          1|
|  0|  analyzed|          1|
|  0|    effect|          1|
|  0| framework|          1|
+---+----------+-----------+
only showing top 20 rows



In [20]:
#Drop rows that have words less than 4 characters long or that contain dollar signs
filtered = abs_word_count.filter(length(abs_word_count.word)>4)
filtered = filtered.filter(~filtered.word.contains('$'))
filtered = filtered.filter(~filtered.word.contains('^'))
filtered = filtered.filter(~filtered.word.contains('}'))
filtered.sort(col("id"),col("count(word)").desc()).show(100)

+---+----------------+-----------+
| id|            word|count(word)|
+---+----------------+-----------+
|  0|          decays|          2|
|  0|       branching|          2|
|  0|           these|          2|
|  0|   contributions|          2|
|  0|       fractions|          2|
|  0|           could|          1|
|  0|       estimates|          1|
|  0|   significantly|          1|
|  0|         numbers|          1|
|  0|           rough|          1|
|  0|        account.|          1|
|  0|           state|          1|
|  0|         quantum|          1|
|  0|         (NRQCD)|          1|
|  0|          paper,|          1|
|  0|       presented|          1|
|  0|        analyzed|          1|
|  0|           taken|          1|
|  0|     particles),|          1|
|  0|  (approximately|          1|
|  0|           order|          1|
|  0|  �������Ds(���)|          1|
|  0|           value|          1|
|  0|       expansion|          1|
|  0|         leading|          1|
|  0|         models

In [21]:
filtered = filtered.select("id", "count(word)", F.translate(F.col("word"), ".,", "").alias("replaced"))
filtered.sort(col("id"),col("count(word)").desc()).show(100)

+---+-----------+----------------+
| id|count(word)|        replaced|
+---+-----------+----------------+
|  0|          2|          decays|
|  0|          2|       branching|
|  0|          2|           these|
|  0|          2|   contributions|
|  0|          2|       fractions|
|  0|          1|           could|
|  0|          1|       estimates|
|  0|          1|   significantly|
|  0|          1|         numbers|
|  0|          1|           rough|
|  0|          1|         account|
|  0|          1|           state|
|  0|          1|         quantum|
|  0|          1|         (NRQCD)|
|  0|          1|           paper|
|  0|          1|       presented|
|  0|          1|        analyzed|
|  0|          1|           taken|
|  0|          1|      particles)|
|  0|          1|  (approximately|
|  0|          1|           order|
|  0|          1|  �������Ds(���)|
|  0|          1|           value|
|  0|          1|       expansion|
|  0|          1|         leading|
|  0|          1|   

# Data Manipulation - Find a classification of the paper 

Each paper has a classification in their keyword. In order to create a model that is predictive, we will need to train a model to classify. If my understanding is correct, the first keyword is probably the most important classifier.

In [22]:
#I don't know that I'm going to use these title word splits but just in case I'm going to leave this here.
keyWords = papersWIDs.select(
        "id", "number_of_pages", "authors", "keywords",
        F.split("title", " ").alias("titleWords"),
        F.posexplode(F.split("title", " ")).alias("position", "word")
    )
keyWords.show(50)

+---+---------------+--------------------+--------------------+--------------------+--------+------------+
| id|number_of_pages|             authors|            keywords|          titleWords|position|        word|
+---+---------------+--------------------+--------------------+--------------------+--------+------------+
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       0|   Charmonia|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       1|  production|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       2|          in|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       3|          $W|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       4|         \to|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       5|      (c\bar|
|  0|             15|[Luchinsky, A.V.

In [23]:
#creates a new column that
keyWords = keyWords.withColumn("new", keyWords["keywords"].getItem(0))
keyWords.show()

+---+---------------+--------------------+--------------------+--------------------+--------+----------+--------------------+
| id|number_of_pages|             authors|            keywords|          titleWords|position|      word|                 new|
+---+---------------+--------------------+--------------------+--------------------+--------+----------+--------------------+
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       0| Charmonia|{PACS, null, 14.4...|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       1|production|{PACS, null, 14.4...|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       2|        in|{PACS, null, 14.4...|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       3|        $W|{PACS, null, 14.4...|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       4|       \to|{PACS, null, 1

In [24]:
classification =  keyWords.withColumn("classification", keyWords["new"].getItem("value"))
classification = classification.drop(col("new"))
classification.show()

+---+---------------+--------------------+--------------------+--------------------+--------+----------+-------------------+
| id|number_of_pages|             authors|            keywords|          titleWords|position|      word|     classification|
+---+---------------+--------------------+--------------------+--------------------+--------+----------+-------------------+
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       0| Charmonia|           14.40.Pq|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       1|production|           14.40.Pq|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       2|        in|           14.40.Pq|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       3|        $W|           14.40.Pq|
|  0|             15|[Luchinsky, A.V.,...|[{PACS, null, 14....|[Charmonia, produ...|       4|       \to|           14.40.Pq|
