In [42]:
from pyspark import SparkContext, SparkConf
import json
import re

# Configuration for Spark
conf = SparkConf().setAppName("Part 1 RDD")
sc = SparkContext(conf=conf)

In [43]:
def preprocess(line):
    review = json.loads(line)
    category = review['category']
    review_text = review['reviewText']
    review_text = review_text.lower()
    words = re.split(r'\s+|\t+|\d+|[(){}\[\].!?,;:+=\-_"\'`~#@&*%€$§\/]+', review_text)
    filtered_words = [word for word in words if word not in stopwords and len(word) > 1]
    filtered_words = list(set(filtered_words))
    return category, filtered_words

In [44]:
def calculate_chi_square(category, word, category_word_count, document_word_count, category_line_count, document_line_count):
    A = category_word_count
    B = document_word_count[word] - A
    C = category_line_count[category] - A
    D = document_line_count - category_line_count[category] - (document_word_count[word] - A)

    dividend = (A * D - B * C) ** 2
    divisor = (A + B) * (A + C) * (B + D) * (C + D)
    if divisor == 0:
        return 0
    else:
        return dividend / divisor

In [45]:
def get_dictionary(top_terms):
    dictionary = {} 
    for index, word, chi_square in top_terms:
        dictionary[word] = dictionary.get(word,0) + chi_square
    return ' '.join(sorted(dictionary.keys()))

In [46]:
def get_output(category_group):
    category, words = category_group
    sorted_words = sorted(words, key=lambda x: x[2], reverse=True)
    top_terms = sorted_words[:75]
    dictionary = get_dictionary(top_terms)
    
    output_terms = []
    for index, word, chi_square in top_terms:
        output_terms.append(f"{word}:{chi_square}")
        
    output_string = f"{category} {' '. join(output_terms)}"
    return dictionary, output_string

In [47]:
# Load Data as RDD
file_path = "../data/reviews_devset.json"
reviews = sc.textFile(file_path)

# Load Stopwords
stopwords_path = "../data/stopwords.txt"
with open(stopwords_path, 'r') as file:
    stopwords = file.read().splitlines()

In [48]:
# Preprocess Data
reviews_rdd = reviews.map(preprocess).cache()
reviews.unpersist()

../data/reviews_devset.json MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [49]:
# Total lines
document_line_count = reviews_rdd.count()

In [50]:
# Lines per category
category_line_count = reviews_rdd.map(lambda x: (x[0], 1)).reduceByKey(lambda a, b: a + b)

In [51]:
# Transform the RDD to ((category, word), 1)
category_word_pairs = reviews_rdd.flatMap(lambda x: [((x[0], word), 1) for word in x[1]]).cache()
reviews_rdd.unpersist()

PythonRDD[2] at RDD at PythonRDD.scala:49

In [52]:
# Lines with word per category
category_word_count = category_word_pairs.reduceByKey(lambda a, b: a + b)

In [53]:
# Lines with word in document
document_word_count = category_word_pairs.map(lambda x: (x[0][1], 1)).reduceByKey(lambda a, b: a + b)

In [54]:
category_word_pairs.unpersist()

PythonRDD[8] at RDD at PythonRDD.scala:49

In [55]:
# Collecting data into local variables, so no RDDs inside of transformations
document_word_count_local = dict(document_word_count.collect())
category_line_count_local = dict(category_line_count.collect())

In [56]:
# Get Chi Square
chi_square_rdd = category_word_count.map(lambda cw: (
    cw[0][0],  # Category
    cw[0][1],  # Word
    calculate_chi_square(cw[0][0], cw[0][1], cw[1], document_word_count_local, category_line_count_local, document_line_count)
))

category_word_count.unpersist()
category_word_pairs.unpersist()
document_word_count.unpersist()
category_line_count.unpersist()

PythonRDD[18] at collect at <ipython-input-55-3d1492809c2e>:3

In [57]:
# Define output per category
category_rdd = chi_square_rdd.groupBy(lambda x: x[0])
complied_output = category_rdd.map(get_output).collect()

# Seperate Output
dictionary = [item[0] for item in complied_output]
categories = [item[1] for item in complied_output]

categories_sorted = sorted(categories, key=lambda x: x)

In [58]:
# Merge dictionary
merged_dictionary = sc.parallelize(dictionary)
unique_dictionary = merged_dictionary.flatMap(lambda x: x.split(" ")).distinct()
unique_dictionary = sorted(unique_dictionary.collect())

In [59]:
with open("output_rdd_devset.txt", "w") as output_file:
    for line in categories_sorted:
        output_file.write(line + "\n")

    output_file.write(" ".join(unique_dictionary) + "\n")

In [60]:
sc.stop()