In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, CountVectorizer,
    IDF, ChiSqSelector
)
from pyspark.sql.functions import col, lower

# Obtain the Spark session used by every later cell, registered under the
# application name "AmazonReviewsTFIDF".
spark = (
    SparkSession.builder
    .appName("AmazonReviewsTFIDF")
    .getOrCreate()
)


SLF4J: Class path contains multiple SLF4J bindings.

25/05/09 19:28:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load the review devset from HDFS and derive the two columns the ML pipeline
# reads: `reviewTextLower` (tokenizer input) and `label` (selector label).
input_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"

df = (
    spark.read.json(input_path)
    # Records without a review body would otherwise carry a null string into
    # the tokenizer stage; substitute "" so the row is kept and simply yields
    # no tokens. Rows that already have text are unaffected.
    .na.fill("", subset=["reviewText"])
    # Lowercased copy of the review text.
    .withColumn("reviewTextLower", lower(col("reviewText")))
    # The star rating (`overall`) is used as the label for chi-square
    # feature selection.
    # NOTE(review): the data also has a `category` column. If the selected
    # terms are meant to discriminate product categories rather than ratings,
    # the label should come from an indexed `category` instead - confirm.
    # A constant label is not a usable placeholder here: it would leave the
    # chi-square ranking with nothing to discriminate.
    .withColumn("label", col("overall"))
)

In [3]:
# Inspect the prepared DataFrame: its repr lists every column name with its
# type, including the derived `reviewTextLower` and `label` columns.
df

DataFrame[asin: string, category: string, helpful: array<bigint>, overall: double, reviewText: string, reviewTime: string, reviewerID: string, reviewerName: string, summary: string, unixReviewTime: bigint, reviewTextLower: string, label: double]

In [4]:
# Feature-extraction stages, declared in the order they run in the pipeline.

# Stage 0 - tokenization: break the review text apart on runs of whitespace,
# digits and the punctuation characters listed in the pattern. The input
# column was already lowercased when it was created; toLowercase=True simply
# restates that.
tokenizer = RegexTokenizer(
    pattern=r"""[\s\d()\[\]{}.!?,;:+=\-_"'`~#@&*%€$§\\/]+""",
    toLowercase=True,
    inputCol="reviewTextLower",
    outputCol="tokens",
)

# Stage 1 - stop-word filtering: no custom word list is passed, so the
# remover's built-in default list applies.
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

# Stage 2 - term counts: learns the vocabulary from the corpus and emits one
# count vector per review.
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="raw_features")

# Stage 3 - TF-IDF weighting of the raw counts; minDocFreq=5 is the minimum
# number of documents a term must occur in to be taken into account.
idf = IDF(minDocFreq=5, inputCol="raw_features", outputCol="features")

# Stage 4 - chi-square feature selection: keeps the 2000 top-ranked features
# with respect to the `label` column.
selector = ChiSqSelector(
    featuresCol="features",
    labelCol="label",
    outputCol="selected_features",
    numTopFeatures=2000,
)

# Assemble the pipeline. Keep this order stable: the fitted stages are later
# looked up by position (2 = count vectorizer, 4 = selector).
pipeline = Pipeline(
    stages=[
        tokenizer,   # 0
        remover,     # 1
        vectorizer,  # 2
        idf,         # 3
        selector,    # 4
    ]
)

In [5]:
# Train all pipeline stages on the prepared DataFrame.
model = pipeline.fit(df)

# Fitted stages keep the positions they had in the Pipeline definition:
# index 2 is the CountVectorizerModel, index 4 the ChiSqSelectorModel.
cv_model, selector_model = model.stages[2], model.stages[4]

# The selector reports the features it kept as indices into the count
# vectorizer's vocabulary, so the vocabulary list translates them to terms.
vocab = cv_model.vocabulary
top_indices = selector_model.selectedFeatures
top_terms = [vocab[feature_idx] for feature_idx in top_indices]

In [6]:
# Write the selected terms to a text file, one term per line.
# The path is relative, so the file lands in the kernel's working directory
# (switch to an HDFS writer if the result has to live on the cluster).
output_path = "output_ds.txt"

# UTF-8 is requested explicitly: without it open() falls back to the
# locale's encoding, and a term containing non-ASCII characters could then
# abort the write with UnicodeEncodeError.
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(f"{term}\n" for term in top_terms)

# Report the number of terms actually written instead of assuming the
# selector delivered its full quota.
print(f"Top {len(top_terms)} terms saved to {output_path}")

# Stop the Spark session now that the results are on disk.
spark.stop()


Top 2000 terms saved to output_ds.txt
