In [None]:
# SparkContext represents the connection to a Spark cluster
from pyspark.context import SparkContext
# Configuration for a Spark application
from pyspark.conf import SparkConf
# The entry point to programming Spark with the Dataset and DataFrame API
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

# Build (or reuse) the Spark entry points for this notebook.
conf = SparkConf().setAppName("Project_session_3_SparkML")

# Hand `conf` to both factories: previously it was created but never passed on,
# so the application name above was silently ignored.
# If a context/session is already running in this kernel, getOrCreate() returns
# it as-is.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Notebook display settings: evaluate DataFrames eagerly when they are the last
# expression in a cell (OK for exploration, not great for performance), and
# allow up to 500 characters per cell value before truncating.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

In [None]:
# Extract the dataset archive. `-u` only writes files that are missing or older
# than the archived copy, so re-running this cell is cheap.
# NOTE(review): presumably this produces the ./dblp-ref/ directory read below — confirm.
!unzip -u dblp.v10.zip

In [None]:
# First pass over the data: load every JSON file under ./dblp-ref/ and let
# Spark infer the schema.
# NOTE(review): multiLine=True parses each file as one JSON document. If the
# files are newline-delimited JSON (one record per line), only the first record
# of each file may be read — confirm against the row count.
df = spark.read.option("multiLine", True).json("./dblp-ref/*.json")

In [None]:
# Inspect the inferred column names/types, then preview the first five rows.
df.printSchema()
df.show(n=5)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, LongType

# Explicit schema for the records: one (column name, Spark type) pair per
# field, every field nullable.
_string_list = ArrayType(StringType())
_field_specs = [
    ("abstract", StringType()),
    ("authors", _string_list),
    ("id", StringType()),
    ("n_citation", LongType()),
    ("references", _string_list),
    ("title", StringType()),
    ("venue", StringType()),
    ("year", LongType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _field_specs]
)

# Re-read the files with the declared schema instead of relying on inference.
# NOTE(review): multiLine=True treats each file as a single JSON document; if the
# files are newline-delimited JSON, only the first record per file may be
# loaded — confirm.
df = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json("./dblp-ref/*.json")
)


In [None]:
df.printSchema()  # Print column names/types to check they match the explicitly declared schema

In [None]:
# Preview the first five rows of the DataFrame.
df.show(5)

In [None]:
from pyspark.sql.functions import col
# from langdetect import detect

# Filter for (approximately) English documents.
#
# The previous filter compared the title to the literal string "en", which only
# keeps papers whose title is exactly "en" — i.e. effectively nothing. The
# declared schema has no language column to filter on, so we use a cheap
# heuristic instead: keep rows whose title consists solely of ASCII characters.
# Rows with a null title are dropped as well (rlike on null yields null).
#
# Caveat: this is only a proxy. It drops English titles containing accented or
# typographic characters and keeps non-English titles written in plain ASCII.
# Swap in a real language detector if precision matters.
ASCII_ONLY_PATTERN = r"^[\x00-\x7F]+$"
df_english = df.filter(col("title").rlike(ASCII_ONLY_PATTERN))

# Basic data exploration: summary statistics of the numeric columns.
df_english.describe(["year", "n_citation"]).show()


In [None]:
# Show the Python class of `df`.
type(df)

In [None]:
# Spark DataFrames interoperate with pandas: toPandas() converts the rows to a
# pandas DataFrame, so limit() is applied first to keep the result small.
df.limit(10).toPandas()

In [None]:
# # Define a function to detect language (you might want to optimize this for Spark)
# def detect_language(text):
#     try:
#         return detect(text)
#     except:
#         return "unknown"

# # Register the function as a Spark UDF
# from pyspark.sql.functions import udf
# detect_language_udf = udf(detect_language, StringType())

# # Filter for English documents based on either title or abstract
# df_english = df.filter(
#     (detect_language_udf(col("title")) == "en") | (detect_language_udf(col("abstract")) == "en")
# )

# # Optionally, drop the now-unnecessary "value" column
# #df_english = df_english.drop("value")

# # Display some sample English documents
# df_english.show(5, truncate=False)

In [None]:
# Show five titles in full; truncate=False stops show() from cutting long strings.
df.select("title").show(n=5, truncate=False)

In [None]:
# Preprocessing: tokenize the abstracts, remove stop words, strip punctuation

In [None]:
from pyspark.ml.feature import StopWordsRemover, Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.sql.functions import concat_ws

# Text preprocessing of the abstracts.
#
# Result: `preprocessed_df` with columns (id, title, filtered_lower), where
# `filtered_lower` is one lower-cased, space-separated string holding the
# abstract's tokens after stop-word and punctuation removal.

# 0. Drop rows without an abstract: Tokenizer cannot handle null input and
#    would fail the job when the column contains nulls.
abstracts_df = df_english.filter(col("abstract").isNotNull())

# 1. Tokenization (Tokenizer lower-cases the text and splits it on whitespace)
tokenizer = Tokenizer(inputCol="abstract", outputCol="words")
wordsData = tokenizer.transform(abstracts_df)

# 2. Remove Stop Words (default English stop-word list)
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredData = stopwordsRemover.transform(wordsData)

# 3. Remove Custom Stop Words (publisher / boilerplate vocabulary).
#    Matching is case-insensitive by default, so the capitalised entries still
#    match the lower-cased tokens.
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 'CZI', 'www']
stopwordsRemover_custom = StopWordsRemover(inputCol="filtered", outputCol="filtered_custom", stopWords=custom_stop_words)
filteredData_custom = stopwordsRemover_custom.transform(filteredData)

# 4. Remove Punctuation (Using Regex)
#    The tokens are joined back into one string and every punctuation character
#    is replaced by a space. Inside the character class `-`, `[` and `]` must be
#    escaped: the previous unescaped form created an accidental `)-[` range and
#    a nested/unterminated class instead of matching those literal characters.
PUNCTUATION_PATTERN = r"""[!()\-\[\]{};:'"\\,<>./?@#$%^&*_~]"""
removePunctuation = regexp_replace(concat_ws(" ", col("filtered_custom")), PUNCTUATION_PATTERN, " ")
filteredData_custom = filteredData_custom.withColumn("filtered_no_punc", removePunctuation)

# 5. Lowercase Conversion (tokens are already lower-cased by Tokenizer; kept so
#    the output column name and contract stay the same)
lowercase = lower(col("filtered_no_punc"))
filteredData_custom = filteredData_custom.withColumn("filtered_lower", lowercase)

# 6. Select relevant columns for further processing.
preprocessed_df = filteredData_custom.select("id", "title", "filtered_lower")

# Display the first 5 preprocessed abstracts
preprocessed_df.show(5, truncate=False)


In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# TF-IDF featurisation of the preprocessed abstracts.
#
# Result: `rescaledData`, i.e. `preprocessed_df` plus the columns `words`,
# `rawFeatures` (hashed term frequencies) and `features` (TF-IDF weights).

# RegexTokenizer is needed below; imported here so this cell does not depend on
# an import made in another cell.
from pyspark.ml.feature import RegexTokenizer

# 1. Tokenization
#    `filtered_lower` had punctuation replaced by spaces, so it can contain runs
#    of consecutive whitespace. Tokenizer splits on every single whitespace
#    character and would emit empty-string tokens for those runs, which then get
#    hashed like a real term. Splitting on `\s+` avoids the empty tokens.
tokenizer = RegexTokenizer(inputCol="filtered_lower", outputCol="words", pattern=r"\s+")
wordsData = tokenizer.transform(preprocessed_df)

# 2. Calculate Term Frequencies (TF) with the hashing trick.
#    numFeatures is the size of the hashed vector; larger values reduce hash
#    collisions at the cost of memory.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(wordsData)

# 3. Calculate Inverse Document Frequencies (IDF): fit on the whole corpus,
#    then rescale the raw term frequencies.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
