In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import format_number, mean, min, max, corr
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format)
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.functions import explode
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, element_at, size, split
from pyspark.sql.functions import udf 
from pyspark.sql.functions import count
from pyspark.sql.functions import length

In [2]:
# Configure Spark before any session is created: give each executor 8 GB of memory.
from pyspark import SparkConf, SparkContext
conf = SparkConf().set('spark.executor.memory', '8g')
# getOrCreate (instead of calling the SparkContext constructor directly) keeps this
# cell re-runnable: constructing a second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# NOTE: if a context already exists it is returned as-is and `conf` is not applied;
# restart the kernel to change executor memory.
SparkContext.getOrCreate(conf=conf)

In [3]:
# Get (or reuse) the SparkSession for this notebook, registered under the app name "fp".
spark = (
    SparkSession.builder
    .appName("fp")
    .getOrCreate()
)
# Last expression of the cell: shows the session's runtime configuration object.
spark.conf

<pyspark.sql.conf.RuntimeConfig at 0x7f2a252a2430>

In [5]:
# Read the raw JSON dump. multiLine lets a single JSON document span several lines;
# PERMISSIVE mode keeps malformed records instead of failing the read.
papers_ = (
    spark.read
    .option("multiLine", True)
    .option("mode", "PERMISSIVE")
    .option("encoding", "ascii")
    .json("../data/AL_papers.json")
)
# Flatten the nested hits.hits array: one output row per array element, aliased "paper".
papers = papers_.select(F.explode(F.col("hits.hits")).alias("paper"))

In [6]:
 def ascii_ignore(x):
    return x.encode('ascii', 'ignore').decode('ascii')
# Wrap ascii_ignore as a Spark UDF so it can be applied to DataFrame columns.
# No return type is passed, so the UDF uses Spark's default return type.
ascii_udf = udf(ascii_ignore)

In [7]:
# Project the nested "paper" struct down to the flat columns used in the rest of
# the notebook. element_at(..., 1) takes the first entry of an array (1-based index).
paper_fields = [
    F.element_at(F.col("paper.metadata.titles.title"), 1).alias("title"),
    F.element_at(F.col("paper.metadata.abstracts.value"), 1).alias("abstract"),
    F.col("paper.created"),
    F.col("paper.metadata.number_of_pages"),
    F.col("paper.metadata.keywords"),
    F.size(F.col("paper.metadata.references")).alias("num_refs"),
    F.col("paper.metadata.authors.full_name").alias("authors"),
]
# Replace the title with its ASCII-only version via the ascii_udf defined earlier.
short_papers = papers.select(*paper_fields).withColumn("title", ascii_udf("title"))
short_papers.printSchema()

root
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- created: string (nullable = true)
 |-- number_of_pages: long (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- schema: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- num_refs: integer (nullable = false)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [8]:
# Inspect the raw keyword structs for the first 15 papers without truncating cell text.
short_papers.select(F.col("keywords")).show(n=15, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|keywords                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [9]:
# Preview the flattened papers DataFrame.
short_papers.show()

+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+
|               title|            abstract|             created|number_of_pages|            keywords|num_refs|             authors|
+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+
|Charmonia product...|In this paper, pr...|2021-03-26T00:00:...|             15|[{PACS, null, 14....|      35|[Luchinsky, A.V.,...|
|Exclusive decays ...|Exclusive decays ...|2020-07-09T00:00:...|             10|[{null, publisher...|      42|[Luchinsky, A.V.,...|
|Doubly heavy bary...|The theoretical a...|2019-12-11T00:00:...|              8|[{INSPIRE, null, ...|      40|[Berezhnoy, A.V.,...|
|Weak decays of do...|We consider exclu...|2019-05-29T00:00:...|             10|[{null, publisher...|      21|[Gerasimov, A.S.,...|
|$B_c$ excitations...|Status of the Bc ...|2019-11-21T00:00:...|            

# Add an ID to Each Unique Paper in our dataset

In [10]:
# Tag every paper with an "id" column so later abstract-level analysis can be
# traced back to the paper it came from.
# Note: monotonically_increasing_id() yields unique, increasing ids, but they are
# not guaranteed to be consecutive.
from pyspark.sql.functions import monotonically_increasing_id
papersWIDs = short_papers.select("*", monotonically_increasing_id().alias("id"))

In [11]:
# Preview the papers with their newly attached "id" column.
papersWIDs.show()

+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+---+
|               title|            abstract|             created|number_of_pages|            keywords|num_refs|             authors| id|
+--------------------+--------------------+--------------------+---------------+--------------------+--------+--------------------+---+
|Charmonia product...|In this paper, pr...|2021-03-26T00:00:...|             15|[{PACS, null, 14....|      35|[Luchinsky, A.V.,...|  0|
|Exclusive decays ...|Exclusive decays ...|2020-07-09T00:00:...|             10|[{null, publisher...|      42|[Luchinsky, A.V.,...|  1|
|Doubly heavy bary...|The theoretical a...|2019-12-11T00:00:...|              8|[{INSPIRE, null, ...|      40|[Berezhnoy, A.V.,...|  2|
|Weak decays of do...|We consider exclu...|2019-05-29T00:00:...|             10|[{null, publisher...|      21|[Gerasimov, A.S.,...|  3|
|$B_c$ excitations...|Status of the Bc ...|2019-

# Data Manipulation - Create a dataframe with abstract word counts

In [11]:
# Drop incomplete papers. With no `subset`, this removes every row that has a null
# in ANY column (title, abstract, created, number_of_pages, keywords, authors, ...),
# not only papers that lack an abstract.
# NOTE(review): the name is rebound in place, so the unfiltered frame is no longer
# reachable after this cell runs.
papersWIDs = papersWIDs.na.drop(how="any")

In [28]:
# Build a long-format frame: one row per (paper id, abstract token).
# posexplode emits each token together with its 0-based position in the abstract;
# the full token array is kept alongside as "abstractWords".
abstract_tokens = F.split("abstract", " ")
words = papersWIDs.select(
    "id",
    abstract_tokens.alias("abstractWords"),
    F.posexplode(abstract_tokens).alias("position", "word"),
)
words.show(50)

+---+--------------------+--------+-------------+
| id|       abstractWords|position|         word|
+---+--------------------+--------+-------------+
|  0|[In, this, work,,...|       0|           In|
|  0|[In, this, work,,...|       1|         this|
|  0|[In, this, work,,...|       2|        work,|
|  0|[In, this, work,,...|       3|           by|
|  0|[In, this, work,,...|       4|        using|
|  0|[In, this, work,,...|       5|          the|
|  0|[In, this, work,,...|       6|      machine|
|  0|[In, this, work,,...|       7|     learning|
|  0|[In, this, work,,...|       8|     methods,|
|  0|[In, this, work,,...|       9|           we|
|  0|[In, this, work,,...|      10|        study|
|  0|[In, this, work,,...|      11|          the|
|  0|[In, this, work,,...|      12|sensitivities|
|  0|[In, this, work,,...|      13|           of|
|  0|[In, this, work,,...|      14|        heavy|
|  0|[In, this, work,,...|      15| pseudo-Dirac|
|  0|[In, this, work,,...|      16|     neutrino|


In [29]:
# Count how often each token occurs within each paper's abstract.
# The aggregate column is named "count(word)"; later cells refer to it by that name.
abs_word_count = (
    words
    .select("id", "word")
    .groupBy("id", "word")
    .agg(F.count("word"))
)
abs_word_count.orderBy(F.col("id"), F.col("count(word)")).show()

+---+-------------+-----------+
| id|         word|count(word)|
+---+-------------+-----------+
|  0|   Perceptron|          1|
|  0|         this|          1|
|  0|       signal|          1|
|  0|      events.|          1|
|  0|  observables|          1|
|  0|     separate|          1|
|  0|           27|          1|
|  0|        found|          1|
|  0|           1$|          1|
|  0|      missing|          1|
|  0|   transverse|          1|
|  0|           In|          1|
|  0|         from|          1|
|  0|        work,|          1|
|  0|        (with|          1|
|  0|        boson|          1|
|  0|      charged|          1|
|  0|sensitivities|          1|
|  0|           or|          1|
|  0|          100|          1|
+---+-------------+-----------+
only showing top 20 rows



In [30]:
# Keep only words that are longer than 4 characters and contain none of the
# characters '$', '^' or '}' (presumably to discard math/LaTeX fragments such as "1$").
word_col = F.col("word")
keep_word = (
    (F.length(word_col) > 4)
    & ~word_col.contains('$')
    & ~word_col.contains('^')
    & ~word_col.contains('}')
)
filtered = abs_word_count.filter(keep_word)
filtered.orderBy(F.col("id"), F.col("count(word)").desc()).show(100)

+---+-----------------+-----------+
| id|             word|count(word)|
+---+-----------------+-----------+
|  0|         neutrino|          4|
|  0|            heavy|          3|
|  0|           hadron|          2|
|  0|signal/background|          2|
|  0|          machine|          2|
|  0|            using|          2|
|  0|         learning|          2|
|  0|       production|          1|
|  0|         methods,|          1|
|  0|           mixing|          1|
|  0|            boson|          1|
|  0|          Boosted|          1|
|  0|         separate|          1|
|  0|          events.|          1|
|  0|           signal|          1|
|  0|          missing|          1|
|  0|            while|          1|
|  0|            study|          1|
|  0|          analyze|          1|
|  0|         Gradient|          1|
|  0|        prospects|          1|
|  0|      high-energy|          1|
|  0|        kinematic|          1|
|  0|    reconstructed|          1|
|  0|        colliders|     

In [31]:
# Strip '.' and ',' from each word (translate with an empty replacement deletes
# the listed characters) and expose the cleaned word as "replaced".
# The counts were computed BEFORE punctuation was removed, so variants such as
# "methods," and "methods" are still separate rows at this point. Re-aggregate on
# the cleaned word so each (id, replaced) pair appears once with its total count.
# Output columns and their order are unchanged: id, count(word), replaced.
# NOTE(review): `filtered` is rebound in place and the "word" column is gone
# afterwards, so this cell cannot be re-run without re-running the previous one.
filtered = (
    filtered
    .select("id", "count(word)", F.translate(F.col("word"), ".,", "").alias("replaced"))
    .groupBy("id", "replaced")
    .agg(F.sum("count(word)").alias("count(word)"))
    .select("id", "count(word)", "replaced")
)
filtered.sort(col("id"), col("count(word)").desc()).show(100)

+---+-----------+-----------------+
| id|count(word)|         replaced|
+---+-----------+-----------------+
|  0|          4|         neutrino|
|  0|          3|            heavy|
|  0|          2|           hadron|
|  0|          2|signal/background|
|  0|          2|          machine|
|  0|          2|            using|
|  0|          2|         learning|
|  0|          1|       production|
|  0|          1|          methods|
|  0|          1|           mixing|
|  0|          1|            boson|
|  0|          1|          Boosted|
|  0|          1|         separate|
|  0|          1|           events|
|  0|          1|           signal|
|  0|          1|          missing|
|  0|          1|            while|
|  0|          1|            study|
|  0|          1|          analyze|
|  0|          1|         Gradient|
|  0|          1|        prospects|
|  0|          1|      high-energy|
|  0|          1|        kinematic|
|  0|          1|    reconstructed|
|  0|          1|        col

# Data Manipulation - Find a classification of the paper 

Each paper has a classification in its keywords. To build a predictive model, we need a label for each paper that a classifier can be trained on. If my understanding is correct, the first keyword is probably the most important classification, so it is used as the paper's label below.

In [63]:
# One row per (paper id, title token), carrying the paper-level columns along.
# The title-word split may turn out to be unnecessary; it is kept in case it is useful later.
title_tokens = F.split("title", " ")
keyWords = papersWIDs.select(
    "id",
    "number_of_pages",
    "authors",
    "keywords",
    title_tokens.alias("titleWords"),
    F.posexplode(title_tokens).alias("position", "word"),
)
keyWords.show(50)

+---+---------------+--------------------+--------------------+--------------------+--------+---------------+
| id|number_of_pages|             authors|            keywords|          titleWords|position|           word|
+---+---------------+--------------------+--------------------+--------------------+--------+---------------+
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       0|      Improving|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       1|          heavy|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       2|          Dirac|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       3|       neutrino|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       4|      prospects|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       5|             at|
|  0|     

In [61]:
# Add a helper column "new" holding the first element (index 0) of each paper's
# keywords array.
first_keyword = F.col("keywords").getItem(0)
keyWords = keyWords.withColumn("new", first_keyword)
keyWords.show()

+---+---------------+--------------------+--------------------+--------------------+--------+-------------+--------------------+
| id|number_of_pages|             authors|            keywords|          titleWords|position|         word|                 new|
+---+---------------+--------------------+--------------------+--------------------+--------+-------------+--------------------+
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       0|    Improving|{INSPIRE, classif...|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       1|        heavy|{INSPIRE, classif...|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       2|        Dirac|{INSPIRE, classif...|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       3|     neutrino|{INSPIRE, classif...|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       4|    

In [62]:
# Pull the "value" field out of the "new" struct into a "classification" column,
# then drop the helper column "new".
classification = (
    keyWords
    .withColumn("classification", F.col("new").getItem("value"))
    .drop("new")
)
classification.show()

+---+---------------+--------------------+--------------------+--------------------+--------+-------------+---------------+
| id|number_of_pages|             authors|            keywords|          titleWords|position|         word| classification|
+---+---------------+--------------------+--------------------+--------------------+--------+-------------+---------------+
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       0|    Improving| neutrino, mass|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       1|        heavy| neutrino, mass|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       2|        Dirac| neutrino, mass|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       3|     neutrino| neutrino, mass|
|  0|             26|[Feng, Jie, Li, M...|[{INSPIRE, classi...|[Improving, heavy...|       4|    prospects| neutrino, mass|
|  0|   