In [1]:
import os
import sys
import re
import string
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import lower, col

# Point both PySpark interpreter settings at the Python executable running
# this kernel (sys.executable) — presumably so the Spark driver and workers
# use the same interpreter/environment as the notebook; confirm for your cluster setup.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
def init_spark():
    """Get (or create) the "BigData" Spark session.

    Returns:
        tuple: ``(spark, sc)`` where ``spark`` is the SparkSession obtained
        from the builder and ``sc`` is that session's ``sparkContext``.
    """
    session = SparkSession.builder.appName("BigData").getOrCreate()
    return session, session.sparkContext

## Preprocess dataset for training and testing.

In [None]:
def get_sample_from_file(file_name, output_file, sample_size=10_000, encoding="utf-8"):
    """Copy the first ``sample_size`` lines of ``file_name`` into ``output_file``.

    Args:
        file_name: Path of the text file to sample from.
        output_file: Path of the file to (over)write with the sample.
        sample_size: Maximum number of lines to copy. If the source has fewer
            lines, all of them are copied; zero or a negative value produces
            an empty output file.
        encoding: Text encoding used for both reading and writing. Defaults
            to UTF-8 instead of the platform default so the result does not
            depend on the machine's locale.

    Returns:
        None. The sample is written to ``output_file``.
    """
    from itertools import islice

    # range() silently treated a negative size as "no lines"; islice() would
    # raise instead, so clamp to keep that behaviour.
    line_limit = max(sample_size, 0)

    # Read first, then write: the source is closed before the output is opened,
    # so passing the same path for both arguments still works.
    with open(file_name, "r", encoding=encoding) as source:
        # islice stops at EOF, avoiding pointless readline() calls on short files.
        sampled_lines = list(islice(source, line_limit))

    with open(output_file, "w", encoding=encoding) as target:
        target.writelines(sampled_lines)

In [3]:
def load_data_from_json(spark, file_name):
    """Load a JSON file with Spark.

    Args:
        spark: Object exposing a ``read`` attribute with a ``json`` method
            (the notebook passes the SparkSession).
        file_name: Path of the JSON file to read.

    Returns:
        Whatever ``spark.read.json(file_name)`` returns.
    """
    reader = spark.read
    return reader.json(file_name)

### Visualize dataset

In [4]:
# NOTE(review): init_spark() returns (spark, sc); index 0 is the SparkSession,
# so despite its name `sc` holds the session here, not the SparkContext.
sc = init_spark()[0]
# Read the sampled arXiv metadata file into the DataFrame used by the cells below.
arxiv_dataset = load_data_from_json(sc, "arxiv-sample.json")
# view first 5 rows
arxiv_dataset.take(5)

[Row(abstract='  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are contrasted with those produced from QCD processes at the LHC, showing\nthat enhanced sensitivity to the signal can be obtained with judicious\nselection 

## Clean dataset

### Searching for missing values

In [136]:
# search for missing values: count the NULL entries of each column.
# Columns are looked up with bracket access because names such as "report-no"
# are not valid Python attribute names.
columns_to_check = [
    "abstract",
    "title",
    "categories",
    "id",
    "submitter",
    "authors",
    "report-no",
    "comments",
    "doi",
    "journal-ref",
    "versions",
]
for column_name in columns_to_check:
    missing_count = arxiv_dataset.filter(arxiv_dataset[column_name].isNull()).count()
    # Two print arguments (not one formatted string) keep the original
    # "...column:  <count>" spacing in the output.
    print(f"Number of missing values in {column_name} column: ", missing_count)


Number of missing values in abstract column:  0
Number of missing values in title column:  0
Number of missing values in categories column:  0
Number of missing values in id column:  0
Number of missing values in submitter column:  0
Number of missing values in authors column:  0
Number of missing values in report-no column:  9118
Number of missing values in comments column:  1138
Number of missing values in doi column:  3641
Number of missing values in journal-ref column:  4571
Number of missing values in versions column:  0


In [93]:
# show rows with empty fields
for column_name in ("abstract", "title", "authors", "categories"):
    # One table per column: the rows whose value equals the empty string.
    arxiv_dataset.filter(arxiv_dataset[column_name] == "").show()

+--------+-------+--------------+----------+--------+---+---+-----------+-------+---------+---------+-----+-----------+--------+
|abstract|authors|authors_parsed|categories|comments|doi| id|journal-ref|license|report-no|submitter|title|update_date|versions|
+--------+-------+--------------+----------+--------+---+---+-----------+-------+---------+---------+-----+-----------+--------+
+--------+-------+--------------+----------+--------+---+---+-----------+-------+---------+---------+-----+-----------+--------+

+--------+-------+--------------+----------+--------+---+---+-----------+-------+---------+---------+-----+-----------+--------+
|abstract|authors|authors_parsed|categories|comments|doi| id|journal-ref|license|report-no|submitter|title|update_date|versions|
+--------+-------+--------------+----------+--------+---+---+-----------+-------+---------+---------+-----+-----------+--------+
+--------+-------+--------------+----------+--------+---+---+-----------+-------+---------+-----

### Basic preprocessing functions

In [145]:
def fill_na_with_empty_string(df):
    """Replace NULL values in ``df`` with the empty string.

    Intended to be used after rows with empty fields have been removed.

    NOTE(review): PySpark documents that a string fill value is only applied
    to string columns, so "all NULL values" presumably means all NULLs in
    string-typed columns — confirm against the DataFrame schema.

    Args:
        df: DataFrame-like object exposing ``fillna``.

    Returns:
        The result of ``df.fillna("")``.
    """
    replacement = ""
    return df.fillna(replacement)

def remove_empty_fields(df, field_name):
    """Keep only the rows whose ``field_name`` value differs from the empty string.

    NOTE(review): under Spark SQL comparison semantics a NULL value presumably
    does not satisfy this predicate either, so NULL rows would be dropped too —
    confirm if that matters for the caller.

    Args:
        df: DataFrame to filter.
        field_name: Name of the column to test.

    Returns:
        The filtered DataFrame.
    """
    is_not_empty = df[field_name] != ""
    return df.filter(is_not_empty)

def remove_empty_newlines(df, field_name):
    """Replace every newline character in ``field_name`` with a single space.

    Args:
        df: DataFrame to transform.
        field_name: Name of the text column; it is overwritten in the result.

    Returns:
        A DataFrame with the updated column.
    """
    without_newlines = regexp_replace(col(field_name), "\n", " ")
    return df.withColumn(field_name, without_newlines)

def remove_math_formula(df, field_name):
    """Strip inline math/LaTeX spans delimited by dollar signs from ``field_name``.

    Each shortest run of text enclosed by a pair of dollar signs (delimiters
    included) is replaced with the empty string.

    NOTE(review): ``.`` does not match line breaks by default, so a formula
    spanning several lines is presumably left untouched unless newlines are
    removed first — confirm the intended call order.

    Args:
        df: DataFrame to transform.
        field_name: Name of the text column; it is overwritten in the result.

    Returns:
        A DataFrame with the updated column.
    """
    # Raw string: a backslash before "$" is not a valid Python escape sequence,
    # so the non-raw literal triggered an invalid-escape warning. The regex
    # passed to Spark is unchanged.
    inline_math_pattern = r"\$.*?\$"
    return df.withColumn(field_name, regexp_replace(col(field_name), inline_math_pattern, ""))

def convert_to_lowercase(df, field_name):
    """Lower-case the text stored in ``field_name``.

    Args:
        df: DataFrame to transform.
        field_name: Name of the text column; it is overwritten in the result.

    Returns:
        A DataFrame with the updated column.
    """
    lowered = lower(col(field_name))
    return df.withColumn(field_name, lowered)

def remove_extra_spaces(df, field_name):
    """Collapse runs of spaces in ``field_name`` and drop leading spaces.

    Two replacements are applied in order: every run of one or more spaces
    becomes a single space, then any spaces at the start of the value are
    removed. Trailing spaces are not stripped.

    Args:
        df: DataFrame to transform.
        field_name: Name of the text column; it is overwritten in the result.

    Returns:
        A DataFrame with the updated column.
    """
    space_rules = (
        (" +", " "),   # collapse repeated spaces
        ("^ +", ""),   # strip spaces at the start of the value
    )
    for pattern, replacement in space_rules:
        df = df.withColumn(field_name, regexp_replace(col(field_name), pattern, replacement))
    return df

def remove_punctuation(df, field_name):
    """Delete every character of ``field_name`` that is neither a word character nor whitespace.

    Word characters include the underscore, so underscores are kept.

    NOTE(review): Spark evaluates this pattern with Java regex rules, where the
    word-character class is presumably ASCII-only by default, so accented
    letters may be removed as well — confirm before using on non-English text.

    Args:
        df: DataFrame to transform.
        field_name: Name of the text column; it is overwritten in the result.

    Returns:
        A DataFrame with the updated column.
    """
    # Raw string: backslash-w and backslash-s are not valid Python escape
    # sequences, so the non-raw literal triggered invalid-escape warnings.
    # The regex passed to Spark is unchanged.
    non_word_pattern = r"[^\w\s]"
    return df.withColumn(field_name, regexp_replace(col(field_name), non_word_pattern, ""))


In [119]:
# Show the first 5 values of the abstract column without truncating the text.
arxiv_dataset.select("abstract").show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|abstract                 

### Clean 'abstract' column

In [96]:
# Cleaning steps for the abstract column, applied in the order listed.
# Open question: remove_math_formula strips math equations from the abstract,
# but maybe we don't need to do this, because the equations could be useful
# for our model?
abstract_cleaning_steps = (
    remove_empty_fields,
    remove_empty_newlines,
    remove_math_formula,
    convert_to_lowercase,
    remove_extra_spaces,
)
for clean_step in abstract_cleaning_steps:
    arxiv_dataset = clean_step(arxiv_dataset, "abstract")

In [97]:
# Show the first 5 values of the abstract column without truncating the text.
arxiv_dataset.select("abstract").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|abstract       

### Lemmatization (will follow)

## Convert Abstract to Vector Representation

In [23]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import Tokenizer

In [25]:
# create a Word2Vec model
# Column names: the raw text column that gets tokenized, and the Word2Vec output column.
input_col = "abstract"
output_col = "abstract_vector"

# Word2Vec reads the "words" column (the Tokenizer's output column below) and
# is configured with vectorSize=100 and minCount=5.
words2vec_model = Word2Vec(
    inputCol="words",
    outputCol=output_col,
    vectorSize=100,
    minCount=5
)
# NOTE(review): `tokenized` is the Tokenizer transformer itself, not a tokenized
# DataFrame; the tokenizer demo cell later rebinds the same name to a DataFrame.
tokenized = Tokenizer(inputCol=input_col, outputCol="words")
# Adds the "words" column to the dataset.
tokenized_dataset = tokenized.transform(arxiv_dataset)
# Fit Word2Vec on the tokenized dataset.
model = words2vec_model.fit(tokenized_dataset)
# TODO: show the vector of the first abstract (no code for this step in this cell yet)


In [27]:
# Display the result of model.getVectors() without truncating cell contents.
model.getVectors().show(truncate=False)

+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
# Tokenizer demo on a tiny hand-made DataFrame.
from pyspark.ml.feature import Tokenizer
# create a simple example dataframe with two columns: "id" and "text"
# NOTE(review): init_spark() returns (spark, sc); index 0 is the SparkSession,
# so `sc` is the session here, which is why createDataFrame is called on it.
sc = init_spark()[0]
data = sc.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat")
], ["id", "text"])


# use tokenizer to split the "text" column into a "words" column
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized = tokenizer.transform(data)
# view the transformed column; in the output below the comma-separated
# sentence stays a single token
tokenized.select("words").show(truncate=False)
# view data
# data.show(1)



+------------------------------------------+
|words                                     |
+------------------------------------------+
|[hi, i, heard, about, spark]              |
|[i, wish, java, could, use, case, classes]|
|[logistic,regression,models,are,neat]     |
+------------------------------------------+



!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
