In [1]:
# SparkContext represents the connection to a Spark cluster
from pyspark.context import SparkContext
# Configuration for a Spark application
from pyspark.conf import SparkConf
# The entry point to programming Spark with the Dataset and DataFrame API
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

conf = SparkConf().setAppName("Project_session_3_SparkML")
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled",True) # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 34122)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

In [2]:
!unzip -u dblp.v10.zip

Archive:  dblp.v10.zip


In [3]:
df = spark.read.json("./dblp-ref/*.json", multiLine=True)

In [4]:
df.printSchema()   # Examine the structure of the data
df.show(5)         # Display a few sample rows

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|Based on biologic...|[Guoping Pang, La...|4aa69add-3978-480...|         8|[04754a28-6bf4-4d...|Dynamic analysis ...|Mathematics and C...|2008|
|In this paper, a ...

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, LongType

# Correctly define the schema
schema = StructType([
    StructField("abstract", StringType(), True),
    StructField("authors", ArrayType(StringType()), True),
    StructField("id", StringType(), True),
    StructField("n_citation", LongType(), True),
    StructField("references", ArrayType(StringType()), True),
    StructField("title", StringType(), True),
    StructField("venue", StringType(), True),
    StructField("year", LongType(), True)
])

# Read the data with the schema
df = spark.read.json("./dblp-ref/*.json", multiLine=True, schema=schema)


In [6]:
df.printSchema()  # Display the schema to verify it's correct

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)



In [7]:
df.show(5)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|Based on biologic...|[Guoping Pang, La...|4aa69add-3978-480...|         8|[04754a28-6bf4-4d...|Dynamic analysis ...|Mathematics and C...|2008|
|In this paper, a ...|[S. Ben Jabra, Ez...|4ab3735c-80f1-472...|        50|[09cb2d7d-47d1-4a...|A new approach of...|international sym...|2008|
|The purpose of th...|[Makoto Satoh, Ry...|00127ee2-cb05-48c...|         0|[51c7e02e-f5ed-43...|Preliminary Desig...|international con...|2013|
|AdaBoost algorith...|[Zheng Xu, Runbin...|001eef4f-1d00-4ae...|         0|[0a11984c-ab6e-4b...|A Heterogeneous S...|high performance ..

In [8]:
from pyspark.sql.functions import col
# from langdetect import detect

# Filter for English documents
df_english = df.filter(col("title") == "en") 

# Basic data exploration
df_english.describe(["year", "n_citation"]).show()


+-------+----+----------+
|summary|year|n_citation|
+-------+----+----------+
|  count|   0|         0|
|   mean|NULL|      NULL|
| stddev|NULL|      NULL|
|    min|NULL|      NULL|
|    max|NULL|      NULL|
+-------+----+----------+



In [9]:
type(df)

pyspark.sql.dataframe.DataFrame

In [10]:
#Spark dataframes can be interoperable with pandas too
df.limit(10).toPandas()

Unnamed: 0,abstract,authors,id,n_citation,references,title,venue,year
0,Based on biological control strategy in pest m...,"[Guoping Pang, Lansun Chen]",4aa69add-3978-480b-a1c0-d99a83d7e324,8,"[04754a28-6bf4-4d5d-8e42-2677d8564cdc, 33a877a...",Dynamic analysis of a pest-epidemic model with...,Mathematics and Computers in Simulation,2008
1,"In this paper, a robust 3D triangular mesh wat...","[S. Ben Jabra, Ezzeddine Zagrouba]",4ab3735c-80f1-472d-b953-fa0557fed28b,50,"[09cb2d7d-47d1-4a85-bfe5-faa8221e644b, 10aa16d...",A new approach of 3D watermarking based on ima...,international symposium on computers and commu...,2008
2,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",00127ee2-cb05-48ce-bc49-9de556b93346,0,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013
3,AdaBoost algorithm based on Haar-like features...,"[Zheng Xu, Runbin Shi, Zhihao Sun, Yaqi Li, Yu...",001eef4f-1d00-4ae6-8b4f-7e66344bbc6e,0,"[0a11984c-ab6e-4b75-9291-e1b700c98d52, 1f4152a...",A Heterogeneous System for Real-Time Detection...,high performance computing and communications,2016


In [11]:
# # Define a function to detect language (you might want to optimize this for Spark)
# def detect_language(text):
#     try:
#         return detect(text)
#     except:
#         return "unknown"

# # Register the function as a Spark UDF
# from pyspark.sql.functions import udf
# detect_language_udf = udf(detect_language, StringType())

# # Filter for English documents based on either title or abstract
# df_english = df.filter(
#     (detect_language_udf(col("title")) == "en") | (detect_language_udf(col("abstract")) == "en")
# )

# # Optionally, drop the now-unnecessary "value" column
# #df_english = df_english.drop("value")

# # Display some sample English documents
# df_english.show(5, truncate=False)

+--------+-------+---+----------+----------+-----+-----+----+
|abstract|authors|id |n_citation|references|title|venue|year|
+--------+-------+---+----------+----------+-----+-----+----+
+--------+-------+---+----------+----------+-----+-----+----+



In [17]:
df.select("title").show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title                                                                                                                                                          |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Dynamic analysis of a pest-epidemic model with impulsive control                                                                                               |
|A new approach of 3D watermarking based on image segmentation                                                                                                  |
|Preliminary Design of a Network Protocol Learning Tool Based on the Comprehension of High School Students: Design by an Empirical Study Using a Simple Mind Map|
|A Heterogeneous System for 

In [21]:
#Preprocessing

In [23]:
from pyspark.ml.feature import StopWordsRemover, Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.sql.functions import concat_ws

# 1. Tokenization (Split into Words)
tokenizer = Tokenizer(inputCol="abstract", outputCol="words")
wordsData = tokenizer.transform(df_english)

# 2. Remove Stop Words
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredData = stopwordsRemover.transform(wordsData)

# 3. Remove Custom Stop Words
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 'CZI', 'www']
stopwordsRemover_custom = StopWordsRemover(inputCol="filtered", outputCol="filtered_custom", stopWords=custom_stop_words)
filteredData_custom = stopwordsRemover_custom.transform(filteredData)

# 4. Remove Punctuation (Using Regex)
removePunctuation = regexp_replace(concat_ws(" ", col("filtered_custom")), r'[!()-[]{};:\'"\\,<>./?@#$%^&*_~]', " ")
filteredData_custom = filteredData_custom.withColumn("filtered_no_punc", removePunctuation)

# 5. Lowercase Conversion
lowercase = lower(col("filtered_no_punc"))
filteredData_custom = filteredData_custom.withColumn("filtered_lower", lowercase)

# 6. Select relevant columns for further processing.
preprocessed_df = filteredData_custom.select("id", "title", "filtered_lower")

# Display the first 5 preprocessed abstracts
preprocessed_df.show(5, truncate=False)


+---+-----+--------------+
|id |title|filtered_lower|
+---+-----+--------------+
+---+-----+--------------+



In [24]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# 1. Tokenization (If not done in the preprocessing stage)
tokenizer = Tokenizer(inputCol="filtered_lower", outputCol="words")
wordsData = tokenizer.transform(preprocessed_df)

# 2. Calculate Term Frequencies (TF)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)  # Choose a suitable numFeatures
featurizedData = hashingTF.transform(wordsData)

# 3. Calculate Inverse Document Frequencies (IDF)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)


Py4JJavaError: An error occurred while calling o275.fit.
: java.lang.IllegalStateException: Haven't seen any document yet.
	at org.apache.spark.mllib.feature.IDF$DocumentFrequencyAggregator.idf(IDF.scala:135)
	at org.apache.spark.mllib.feature.IDF.fit(IDF.scala:55)
	at org.apache.spark.ml.feature.IDF.fit(IDF.scala:93)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)


TypeError: Invalid param value given for param "k". Could not convert 0.95 to int