In [1]:
from pyspark.sql import SparkSession
from pyspark.rdd import RDD
import re
import math
from collections import Counter


In [2]:
# Obtain the Spark session for this notebook (getOrCreate reuses an existing
# one) and keep a handle on its SparkContext for the RDD API used below.
spark = (
    SparkSession.builder
    .appName("DIC")
    .getOrCreate()
)
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.

25/05/10 22:05:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Load the review dev set (JSON) from HDFS into a DataFrame.
df = spark.read.json(
    "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
)

# Switch to the RDD of Rows and drop every review whose text is missing.
reviews = df.rdd.filter(lambda review: review["reviewText"] is not None)

In [None]:
# Sanity check of the load: print the inferred schema, then show the first
# five rows of the two columns the pipeline reads (category, reviewText).
df.printSchema()
df.select("category", "reviewText").show(n=5)


In [None]:
# Stop-word list: every line of stopwords.txt is collected to the driver,
# de-duplicated into a set and broadcast so tokenize() can read it on workers.
stopword_lines = sc.textFile("stopwords.txt").collect()
stopwords = sc.broadcast(set(stopword_lines))

# Token delimiter: any run of whitespace, digits or the listed punctuation /
# symbol characters.
TOKEN_REGEX = re.compile(r"[\s\t\d()\[\]{}.!?,;:+=\-_\"'`~#@&*%€$§\\/]+")


def tokenize(text):
    """Return the set of distinct terms found in ``text``.

    The text is lower-cased and split on ``TOKEN_REGEX``. Pieces of length
    0 or 1 and pieces present in the broadcast ``stopwords`` set are dropped.
    Because the result is a set, each term appears at most once per text.
    """
    stop = stopwords.value
    unique_terms = set()
    for piece in TOKEN_REGEX.split(text.lower()):
        if len(piece) <= 1 or piece in stop:
            continue
        unique_terms.add(piece)
    return unique_terms


In [None]:
# (1) (category, reviewText) for every review that has text.
# Cached because it is the source of both the term pairs in (2) and the
# per-category document counts in (5); without it each job re-reads the input.
cat_text = reviews.map(lambda row: (row["category"], row["reviewText"])).cache()

# (2) ((term, category), 1) -- one record per distinct term of each review
# (tokenize returns a set, so a term is counted at most once per review).
term_cat_pairs = cat_text.flatMap(lambda x: [((term, x[0]), 1) for term in tokenize(x[1])])

# (3) ((term, category), number of reviews in that category containing the term)
term_cat_counts = term_cat_pairs.reduceByKey(lambda a, b: a + b)

# (4) (term, iterable of (category, count)) -- all category counts of a term together
term_grouped = term_cat_counts.map(lambda x: (x[0][0], (x[0][1], x[1]))).groupByKey()

# (5) (category, number of reviews). The result is collected once and the
# overall total is derived from the collected dict, instead of triggering a
# second Spark job just to sum the same values.
docs_per_cat = cat_text.map(lambda x: (x[0], 1)).reduceByKey(lambda a, b: a + b)
docs_per_cat_local = dict(docs_per_cat.collect())
total_docs = sum(docs_per_cat_local.values())

docs_per_cat_bc = sc.broadcast(docs_per_cat_local)
total_docs_bc = sc.broadcast(total_docs)

# (6) (term, number of reviews containing the term). Summing the per-category
# counts from (3) yields the same totals as counting the raw pairs from (2),
# while aggregating one record per (term, category) rather than one per
# (term, review).
# NOTE: the whole vocabulary is collected to the driver and broadcast here.
global_term_counts = term_cat_counts.map(lambda x: (x[0][0], x[1])).reduceByKey(lambda a, b: a + b)
global_term_counts_bc = sc.broadcast(dict(global_term_counts.collect()))


In [None]:
# (7) 
def compute_chi(term, cat_counts_iter, docs_per_cat=None, total_docs=None):
    """Compute the chi-square score of ``term`` against every category.

    For each category the 2x2 contingency table is:
        A = documents in the category that contain the term
        B = documents outside the category that contain the term
        C = documents in the category that do not contain the term
        D = documents outside the category that do not contain the term
    and chi2 = N * (A*D - B*C)**2 / ((A+C) * (B+D) * (A+B) * (C+D)).

    Args:
        term: The term being scored.
        cat_counts_iter: Iterable of ``(category, count)`` pairs giving, per
            category, how many documents contain the term. Categories that
            are absent are treated as a count of 0.
        docs_per_cat: Optional mapping ``category -> number of documents``.
            Defaults to the value of the ``docs_per_cat_bc`` broadcast.
        total_docs: Optional total number of documents (N). Defaults to the
            value of the ``total_docs_bc`` broadcast.

    Returns:
        A list of ``((term, category), chi2)`` with one entry per category in
        ``docs_per_cat``; chi2 is 0.0 when the denominator is not positive.
    """
    cat_counts = dict(cat_counts_iter)
    if docs_per_cat is None:
        docs_per_cat = docs_per_cat_bc.value
    if total_docs is None:
        total_docs = total_docs_bc.value

    # The term's document frequency is the sum of its per-category counts, so
    # it is derived from the input rather than looked up in a driver-collected
    # table of the whole vocabulary.
    df_t = float(sum(cat_counts.values()))
    N = total_docs
    results = []

    for cat, docs_in_cat in docs_per_cat.items():
        A = float(cat_counts.get(cat, 0))
        B = df_t - A
        C = docs_in_cat - A
        D = N - A - B - C
        denom = (A + C) * (B + D) * (A + B) * (C + D)
        # A zero denominator means a row or column of the table is empty
        # (e.g. the term occurs in every document); score it as 0.
        chi2 = N * ((A * D - B * C) ** 2) / denom if denom > 0 else 0.0
        results.append(((term, cat), chi2))
    return results

# Score every (term, per-category counts) record with compute_chi and flatten
# the per-term lists into one RDD of ((term, category), chi2).
chi_squares = term_grouped.flatMap(
    lambda term_entry: compute_chi(term_entry[0], term_entry[1])
)
# Display ten scored pairs as a quick check.
chi_squares.take(10)

In [None]:
# (8)
by_category = chi_squares.map(lambda x: (x[0][1], (x[1], x[0][0])))
top75 = by_category.groupByKey().mapValues(
    lambda values: sorted(values, reverse=True)[:75]
).collect()

# (9)
formatted = []
for category, top_terms in sorted(top75):
    terms = " ".join([f"{term}:{score}" for score, term in top_terms])
    formatted.append(f"<{category}> {terms}")

# (10) 
all_terms = sorted(set(term for _, terms in top75 for _, term in terms))
print(all_terms[:10])

In [None]:
# (11) 
# (11) Write one line per category, followed by the merged term list on a
# final line (which is written without a trailing newline).
with open("output_rdd.txt", mode="w", encoding="utf-8") as out_file:
    out_file.writelines(entry + "\n" for entry in formatted)
    out_file.write(" ".join(all_terms))