In [32]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, expr
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

In [33]:
# Spark configuration for this notebook; only the application name is set here.
conf = SparkConf().setAppName("Part 1 RDD")

# Obtain the context and session via getOrCreate instead of calling the
# constructors directly: only one SparkContext may be active per process, so
# constructing a second one (e.g. when this cell is re-run in a live kernel)
# raises an error. getOrCreate returns the active context if there is one and
# creates a new one from `conf` otherwise.
# NOTE: if a context is already running, its existing configuration wins and
# `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [34]:
# Load Data
# Read the review records into a Spark DataFrame (schema is inferred by the JSON reader).
file_path = "../data/reviews_devset.json"
reviews_df = spark.read.json(file_path)

# Load Stopwords
# The file is read as one stopword per line.
# - The encoding is given explicitly so the result does not depend on the
#   platform's locale default.
# - Surrounding whitespace is stripped and empty lines are skipped: the
#   tokenizer used later splits on whitespace, so an entry that is blank or
#   padded with spaces could never match a token anyway.
stopwords_path = "../data/stopwords.txt"
with open(stopwords_path, 'r', encoding="utf-8") as file:
    stopwords = [word for word in (line.strip() for line in file) if word]

In [35]:
# Preprocess Data
# Keep only the label and the review text, case-folding the text while selecting.
lowercased_df = reviews_df.select(
    "category",
    lower(col("reviewText")).alias("reviewText"),
)

# Tokenizer: the pattern enumerates what separates tokens (whitespace, digits
# and the listed punctuation/symbol characters). This relies on
# RegexTokenizer's default of treating the pattern as the gap between tokens.
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="words",
    pattern=r'\s+|\t+|\d+|[(){}\[\].!?,;:+=\-_"\'`~#@&*%€$§\/]+',
)

# Stopword filter: reads the token array from "words" and writes the
# remaining tokens to "filtered_words".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
    stopWords=stopwords,
)

# Run tokenization, then stopword removal, then drop every token that is a
# single character long (only tokens with length > 1 are kept).
reviews_df = (
    remover
    .transform(tokenizer.transform(lowercased_df))
    .withColumn("filtered_words", expr("filter(filtered_words, x -> length(x) > 1)"))
)

# Only the label and the cleaned token list are needed from here on.
data = reviews_df.select("category", "filtered_words")

In [36]:
# Preview the preprocessed data: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out here).
data.show(n=20, truncate=True)

# Shut down the SparkContext now that the notebook is done with it.
sc.stop()

+--------------------+--------------------+
|            category|      filtered_words|
+--------------------+--------------------+
|Patio_Lawn_and_Garde|[gift, husband, m...|
|Patio_Lawn_and_Garde|[nice, spreader, ...|
|Patio_Lawn_and_Garde|[metal, base, hos...|
|Patio_Lawn_and_Garde|[part, works, pre...|
|Patio_Lawn_and_Garde|[hose, supposed, ...|
|Patio_Lawn_and_Garde|[tool, works, cut...|
|Patio_Lawn_and_Garde|[typical, usable,...|
|Patio_Lawn_and_Garde|[excited, ditch, ...|
|Patio_Lawn_and_Garde|[purchased, leaf,...|
|Patio_Lawn_and_Garde|[manual, lawnmowe...|
|Patio_Lawn_and_Garde|[good, price, goo...|
|Patio_Lawn_and_Garde|[owned, flowtron,...|
|Patio_Lawn_and_Garde|[similar, family,...|
|Patio_Lawn_and_Garde|[birds, ate, blue...|
|Patio_Lawn_and_Garde|[bought, summer, ...|
|Patio_Lawn_and_Garde|[knew, mouse, bas...|
|Patio_Lawn_and_Garde|[worried, reading...|
|Patio_Lawn_and_Garde|[brand, long, tim...|
|Patio_Lawn_and_Garde|[current, model, ...|
|Patio_Lawn_and_Garde|[expected,