# 1. Setup & Configuration

In [41]:
# 1. Imports and SparkContext setup
import json
import logging
from pyspark.sql import SparkSession

# If using a plain Python 3 kernel, bootstrap Spark; 
# if you’re on the “PySpark Python 3” kernel, spark/sc are already available.
spark = SparkSession.builder.appName("Part1_ChiSquare").getOrCreate()
sc = spark.sparkContext

# 2. Logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("DIC25_Part1")

# 3. Relative paths (project root)
REVIEWS_DEVSET_PATH   = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
STOPWORDS_PATH = "stopwords.txt"
OUTPUT_PATH    = "output_rdd.txt"

In [42]:
import os

print("CWD:", os.getcwd())
print("Here’s what lives in this folder:")
for fn in sorted(os.listdir(".")):
    print(" ", fn)


CWD: /home/e12433762/DIC2025_Ex2
Here’s what lives in this folder:
  .ipynb_checkpoints
  reviews_devset.json
  src
  stopwords.txt


In [43]:
# Load the provided stopword list into a Python set
def load_stopwords(path):
    with open(path, encoding="utf-8") as f:
        return set(w.strip() for w in f if w.strip())

stopwords = load_stopwords(STOPWORDS_PATH)
log.info("Loaded %d stopwords", len(stopwords))

INFO:DIC25_Part1:Loaded 591 stopwords


In [48]:
def get_reviews_rdd(path: str):
    """
    Read a JSON-Lines file into a DataFrame,
    select category & reviewText, drop nulls,
    then convert to RDD[(category, text)].
    """
    df = (
        spark.read           # ← default: each line is one JSON record
             .json(path)     # no .option("multiline","true")
             .select("category", "reviewText")
             .na.drop(subset=["category", "reviewText"])
    )

    return df.rdd.map(lambda row: (row["category"], row["reviewText"]))

reviews = get_reviews_rdd(REVIEWS_DEVSET_PATH)
log.info("Loaded %d reviews", reviews.count())

INFO:DIC25_Part1:Loaded 78829 reviews


# 2.Preprocessing Functions & RDD Creation

In [49]:
import re

# 3.1 Define the exact delimiters from preprocessing.py
DELIMITERS = (
    r'[\s\t\d\(\)\[\]\{\}\.!\?,;:+=\-_"]'
    r"|\'|`|~|#|@|&|%|\*|\\/|\u20AC|\$|\u00A7"
)
token_split_re = re.compile(DELIMITERS)

# 3.2 Preprocessing function
def preprocess_record(record):
    """
    Input:  (category, reviewText)
    Output: (category, [filtered tokens])
    """
    category, text = record
    # lowercase + split on all delimiters
    tokens = token_split_re.split(text.lower())
    # drop empty strings, stopwords, and tokens of length <= 1
    filtered = [t for t in tokens if t and t not in stopwords and len(t) > 1]
    return category, filtered

# 3.3 Apply to the reviews RDD
cat_tokens = reviews.map(preprocess_record)

# 3.4 Quick check & logging
log.info("Preprocessed %d reviews", cat_tokens.count())
for cat, toks in cat_tokens.take(5):
    log.info("%s → %d tokens", cat, len(toks))


INFO:DIC25_Part1:Preprocessed 78829 reviews
INFO:DIC25_Part1:Patio_Lawn_and_Garde → 31 tokens
INFO:DIC25_Part1:Patio_Lawn_and_Garde → 34 tokens
INFO:DIC25_Part1:Patio_Lawn_and_Garde → 32 tokens
INFO:DIC25_Part1:Patio_Lawn_and_Garde → 21 tokens
INFO:DIC25_Part1:Patio_Lawn_and_Garde → 16 tokens


# 4. Token‐Category Counts with Spark

In [50]:
from collections import Counter

def map_record_to_counts(records):
    """
    Mapper‐style local aggregation:
      Input:  iterator over (category, tokens) pairs
      Output: ((token, category), count) for each token in each record
    """
    for category, tokens in records:
        # local Counter to collapse duplicate tokens within this record
        local_counts = Counter((token, category) for token in tokens)
        for tok_cat, cnt in local_counts.items():
            yield tok_cat, cnt

# Compute ((token, category), total_count) across the corpus
token_cat_counts = (
    cat_tokens
      .mapPartitions(map_record_to_counts)       # local pre‐aggregation
      .reduceByKey(lambda a, b: a + b)          # global aggregation
)

# Inspect a sample of results
print("Sample token-category counts:")
for (token, category), count in token_cat_counts.take(10):
    print(f"{token!r} in {category!r} → {count}")

# (Optional) Persist or save results for later stages:
# token_cat_counts.saveAsTextFile("output/token_category_counts")

Sample token-category counts:
'amp' in 'CDs_and_Vinyl' → 93
'glad' in 'CDs_and_Vinyl' → 94
'studio' in 'CDs_and_Vinyl' → 104
'radio' in 'CDs_and_Vinyl' → 185
'recordings' in 'CDs_and_Vinyl' → 207
'section' in 'CDs_and_Vinyl' → 60
'shaw' in 'CDs_and_Vinyl' → 4
'cole' in 'CDs_and_Vinyl' → 22
'gordon' in 'CDs_and_Vinyl' → 22
'johnny' in 'CDs_and_Vinyl' → 36


# 5. Category Document Counts with Spark

In [52]:
# Given `reviews: RDD[(category, reviewText)]`, count how many reviews per category:
category_counts = (
    reviews
      .map(lambda cat_text: (cat_text[0], 1))       # emit (category, 1) per record
      .reduceByKey(lambda a, b: a + b)             # sum up per category
)

# Inspect the counts
print("Number of reviews per category:")
for category, count in category_counts.collect():
    print(f"{category!r}: {count}")

Number of reviews per category:
'Kindle_Store': 3205
'Electronic': 7825
'Movies_and_TV': 4607
'Tools_and_Home_Improvement': 1926
'Grocery_and_Gourmet_Food': 1297
'Apps_for_Android': 2638
'Book': 22507
'Toys_and_Game': 2253
'Office_Product': 1243
'Digital_Music': 836
'Automotive': 1374
'Beauty': 2023
'Patio_Lawn_and_Garde': 994
'Sports_and_Outdoor': 3269
'Musical_Instrument': 500
'CDs_and_Vinyl': 3749
'Clothing_Shoes_and_Jewelry': 5749
'Home_and_Kitche': 4254
'Cell_Phones_and_Accessorie': 3447
'Pet_Supplie': 1235
'Baby': 916
'Health_and_Personal_Care': 2982


# 6. Total Term Counts with Spark

In [53]:
from collections import Counter
from operator import add

def map_tokens_to_counts(records):
    """
    Local aggregation of token counts within each partition:
      Input:  iterator over (category, tokens) pairs
      Output: (token, count) for all tokens in that partition
    """
    # accumulate in a single Counter per partition
    partition_counter = Counter()
    for _, tokens in records:
        partition_counter.update(tokens)
    # emit partial sums
    for token, cnt in partition_counter.items():
        yield token, cnt

# 1) Local-combined counts per partition
partial_counts = cat_tokens.mapPartitions(map_tokens_to_counts)

# 2) Global aggregation like the combiner+reducer in MRTotalTermCounts
total_term_counts = partial_counts.reduceByKey(add)

# 3) Inspect a sample
print("Sample total term counts:")
for token, count in total_term_counts.take(10):
    print(f"{token!r} → {count}")


Sample total term counts:
'gift' → 1795
'husband' → 1564
'making' → 2070
'things' → 4269
'love' → 15659
'directions' → 429
'simple' → 1927
'make' → 7180
'raichlen' → 2
'recipes' → 933


# 7. Chi-Square Calculation & Top-K Terms per Category in Spark

In [54]:
def compute_chi2_top_terms(token_cat_counts_rdd, 
                           category_counts_rdd, 
                           total_term_counts_rdd, 
                           top_k=75):
    """
    Given:
      token_cat_counts_rdd: RDD[((token, category), A)]                # A = total times token appears in category
      category_counts_rdd:   RDD[(category, N_i)]                      # N_i = number of reviews in category
      total_term_counts_rdd: RDD[(token, n_j)]                         # n_j = total times token appears overall
    Returns:
      RDD[(category, "term1:chi2 term2:chi2 ...")] with top_k terms per category.
    """
    # 1) collect & broadcast the small side-tables
    category_reviews = category_counts_rdd.collectAsMap()  # {category: N_i}
    N = sum(category_reviews.values())                     # total number of reviews
    bc_cat_rev = sc.broadcast(category_reviews)
    
    term_totals = total_term_counts_rdd.collectAsMap()     # {term: n_j}
    bc_term_tot = sc.broadcast(term_totals)
    
    # 2) compute χ² for each ((term, category), A)
    def compute_chi2(kv):
        (term, category), A = kv
        N_i = bc_cat_rev.value.get(category, 0)
        n_j = bc_term_tot.value.get(term, 0)
        B = N_i - A
        C = n_j - A
        D = N - A - B - C
        num = (A * D - B * C) ** 2
        den = (A + B) * (C + D) * (A + C) * (B + D)
        chi2 = (N * num / den) if den != 0 else 0.0
        return category, (term, chi2)
    
    chi2_rdd = token_cat_counts_rdd.map(compute_chi2)
    
    # 3) group by category and pick top_k by χ²
    top_rdd = (
        chi2_rdd
          .groupByKey()
          .mapValues(lambda seq: sorted(seq, key=lambda x: -x[1])[:top_k])
          .mapValues(lambda top: " ".join(f"{t}:{c:.4f}" for t, c in top))
    )
    
    return top_rdd

# ───────── Usage ─────────

chi2_top75 = compute_chi2_top_terms(
    token_cat_counts,   # from step 4
    category_counts,    # from step 5
    total_term_counts,  # from step 7
    top_k=75
)

print("Sample χ² top-75 terms for a few categories:")
for category, term_list in chi2_top75.take(5):
    print(f"{category} → {term_list}")

Sample χ² top-75 terms for a few categories:
Kindle_Store → author:1325.7007 characters:1301.9683 series:896.3881 reading:704.1122 kindle:675.0471 romance:667.6760 short:645.0113 stories:573.5698 novella:539.4399 enjoyed:510.1735 cherri:401.2123 written:360.9085 loved:325.4744 alex:310.9019 wait:306.3657 ebook:288.4289 sex:287.0325 monash:283.1907 jae:283.0291 plot:269.1577 kade:260.3645 cernaq:259.5882 laryssa:259.5882 character:245.7995 kayla:244.4273 steamy:242.2387 forward:241.2606 mira:240.7279 persey:235.9863 love:218.9271 heroine:217.4329 garret:216.2850 leofgar:212.3849 tale:211.8060 relationship:207.5869 dani:204.0791 writing:200.5018 errors:195.8583 quality:195.0736 livie:189.3529 sophi:188.7842 sedora:188.7842 skeen:188.7842 wulfstan:188.7842 gabrielle:184.5518 bought:182.6530 aidan:181.2628 sexy:180.6496 hassel:171.6680 connor:170.1634 casey:168.0664 leopold:166.0370 sarina:165.1841 price:160.7013 naomi:160.4649 talbot:160.4649 zoe:160.2674 music:156.2964 braden:150.2092 zo

# 8. Merge chi-squared outputs

In [55]:
# Assume `chi2_top75` is an RDD[(category, term_list_str)]
# 1) Save the RDD as a single text file locally:
lines = chi2_top75.map(lambda kv: f"{json.dumps(kv[0])}\t{kv[1]}")
lines.coalesce(1).saveAsTextFile("output/merged_chi2_output.txt")

# 9. Final Formatting in Spark

In [56]:
# 1) Collect and sort the (category, term:χ²…) lines
cat_lines = chi2_top75.collect()  # e.g. [("Books", "author:123.4 ..."), ...]
cat_lines_sorted = sorted(cat_lines, key=lambda kv: kv[0])

# 2) Compute the set of all unique tokens
unique_terms = {
    term_score.split(":", 1)[0]
    for _, term_list in cat_lines_sorted
    for term_score in term_list.split()
}
unique_terms_sorted = sorted(unique_terms)

# 3) Write out exactly the same format as final_output.py
import os
os.makedirs("output", exist_ok=True)
with open("output/output.txt", "w", encoding="utf-8") as fout:
    # one category per line
    for category, term_list in cat_lines_sorted:
        fout.write(f"{category} {term_list}\n")
    # final line: all unique terms sorted alphabetically
    fout.write(" ".join(unique_terms_sorted) + "\n")

print("Wrote final formatted output to output/output.txt")


Wrote final formatted output to output/output.txt
