In [1]:
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, StopWordsRemover, HashingTF, IDF, ChiSqSelector, StringIndexer
from pyspark.ml.stat import ChiSquareTest
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Input locations of the review data (JSON).
# Judging by the file names, this is presumably the complete corpus ...
full_dataset = "/user/dic24_shared/amazon-reviews/full/reviewscombined.json"
# ... and this a smaller development sample; it is the one loaded below.
partial_dataset = "/user/dic24_shared/amazon-reviews/full/reviews_devset.json"


In [3]:
# Initialize Spark Session
spark = (
    SparkSession.builder
    .appName("TFIDF_ChiSq_Selection")
    .getOrCreate()
)

# Every later cell depends on `df`. If the read fails, report the error and
# re-raise so the notebook stops here instead of failing further down with a
# confusing NameError on `df`.
try:
    df = spark.read.json(partial_dataset)
except Exception as e:
    print(f"Error: {e}")
    raise

print("File read successfully.")

# Show the schema and some data
df.printSchema()
df.show(5)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
24/05/24 13:22:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/24 13:22:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/05/24 13:22:46 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/05/24 13:22:46 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/05/24 13:22:46 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/05/24 13:22:4

File read successfully.
root
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)

+----------+--------------------+-------+-------+--------------------+-----------+--------------+--------------------+-------------------+--------------+
|      asin|            category|helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|            summary|unixReviewTime|
+----------+--------------------+-------+-------+--------------------+-----------+--------------+--------------------+-------------------+--------------+
|0981850006|Patio_Lawn_and_Garde| [6, 7]|    5.0|This wa

In [4]:
# Step 1: Tokenization
# Split reviewText on whitespace, digits and punctuation (gaps=True means the
# pattern describes the separators, not the tokens).
#
# Inside the character class the hyphen is escaped (\-) so it is a literal
# delimiter. Left unescaped between '=' and '_' it forms the range '=' .. '_',
# which never matches '-' itself and only accidentally matches '[', ']', '>'
# and '^'. Those four are listed explicitly so text that was split on them
# before is still split the same way.
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="words",
    pattern=r'\s+|\t+|\d+|[(){}\[\].!?,;:+=\-_>^"\`~#@&*%€$§\\/]+',
    gaps=True,
)
tokenized = tokenizer.transform(df)

In [5]:
# Step 2: Case Folding and Stopword Removal
# Case folding needs no separate step: RegexTokenizer lower-cases its input by
# default (toLowercase=True), so the "words" column already holds lower-case
# tokens. Casting "words" to array<string> would not change the case of any
# token, so no such cast is performed here.

# Remove stopwords
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered = stopwords_remover.transform(tokenized)


In [6]:
# Step 3: TF-IDF Calculation
# Declare both estimators up front, then fit/apply them in order.
vectorizer = CountVectorizer(
    inputCol="filtered_words",
    outputCol="rawFeatures",
    vocabSize=20000,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

# Term counts per review over the fitted vocabulary.
vectorizer_model = vectorizer.fit(filtered)
featurizedData = vectorizer_model.transform(filtered)

# IDF rescaling of the raw counts.
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)


                                                                                

In [7]:
# Step 4: Convert Category to Numeric
# NOTE: with its default settings StringIndexer assigns indices by label
# frequency (most frequent first), not alphabetically.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexer_model = indexer.fit(rescaledData)
indexedData = indexer_model.transform(rescaledData)

                                                                                

In [8]:
# Function to calculate top 75 discriminative terms for each category
def get_top_terms_for_category(category_index, num_top_terms=75):
    """Return the terms most strongly associated with one category.

    Every review in ``indexedData`` is labelled 1.0 if its ``categoryIndex``
    equals ``category_index`` and 0.0 otherwise. A chi-square independence
    test is then run between each TF-IDF feature and that binary label.

    The test must see the full data set: restricted to rows of a single
    category the label is constant and the statistic cannot discriminate
    between terms.

    Args:
        category_index: Value of ``categoryIndex`` identifying the category.
        num_top_terms: Maximum number of terms to return (default 75).

    Returns:
        A list of ``(term, chi_square_statistic)`` tuples, sorted by
        statistic in descending order, with at most ``num_top_terms`` entries.
    """
    # One-vs-rest label for the requested category.
    labelled = indexedData.withColumn(
        "isCategory",
        (col("categoryIndex") == category_index).cast("double"),
    )

    # ChiSquareTest.test yields a single row; its "statistics" vector holds
    # one chi-square value per feature, i.e. per vocabulary entry.
    test_row = ChiSquareTest.test(labelled, "features", "isCategory").head()
    statistics = test_row["statistics"].toArray()

    vocabulary = vectorizer_model.vocabulary
    scored_terms = sorted(
        zip(vocabulary, (float(stat) for stat in statistics)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return scored_terms[:num_top_terms]

In [9]:
# Map each category name to the index StringIndexer actually assigned to it.
# The pairs are read back from indexedData rather than numbering the sorted
# names, because StringIndexer orders labels by frequency by default, so an
# alphabetical enumeration would pair names with the wrong index.
categories = indexedData.select("category", "categoryIndex").distinct().collect()
category_index_map = {
    row["category"]: row["categoryIndex"]
    for row in sorted(categories, key=lambda row: row["category"])
}

# Generate output for each category: "<category> term:value term:value ..."
category_output = []

for category, index in category_index_map.items():
    top_terms = get_top_terms_for_category(index)
    category_terms = f"{category} " + " ".join(f"{term}:{value}" for term, value in top_terms)
    category_output.append(category_terms)

                                                                                

In [10]:
# Generate merged dictionary: every term that appears in any category line.
# The first whitespace-separated field of a line is the category name, the
# rest are "term:value" entries.
all_terms = sorted({
    entry.split(":")[0]
    for line in category_output
    for entry in line.split()[1:]
})

# Tokens may contain non-ASCII characters, so the encoding is set explicitly
# instead of depending on the locale default.
with open("output_ds.txt", "w", encoding="utf-8") as f:
    for line in sorted(category_output):
        f.write(f"{line}\n")
    f.write(" ".join(all_terms) + "\n")

# Stop the Spark session
spark.stop()