In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, CountVectorizer,
    IDF, ChiSqSelector
)
from pyspark.sql.functions import col, lower

# Create the Spark session used by every cell of this notebook
# (getOrCreate reuses an already-running session if there is one)
spark = (
    SparkSession.builder
    .appName("AmazonReviewsTFIDF")
    .getOrCreate()
)


SLF4J: Class path contains multiple SLF4J bindings.

25/05/09 19:28:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Read the review dev set from HDFS and derive the two helper columns
# the feature pipeline expects.
input_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
df = (
    spark.read.json(input_path)
    # lower-cased copy of the review body; this is the tokenizer's input column
    .withColumn("reviewTextLower", lower(col("reviewText")))
    # the `overall` score is used as the label (a constant such as lit(0.0)
    # would also do while labels are not yet meaningful)
    # NOTE(review): confirm `overall` is the intended target for the
    # chi-square selection and the classifier further down.
    .withColumn("label", col("overall"))
)

In [3]:
# Echo the DataFrame repr to check the schema, including the two columns added above
df

DataFrame[asin: string, category: string, helpful: array<bigint>, overall: double, reviewText: string, reviewTime: string, reviewerID: string, reviewerName: string, summary: string, unixReviewTime: bigint, reviewTextLower: string, label: double]

In [4]:
# Stage 1 - tokenization: the pattern lists the delimiter characters
# (whitespace, digits, brackets, punctuation, symbols) the text is split on.
tokenizer = RegexTokenizer(
    pattern = r"""[\s\d()\[\]{}.!?,;:+=\-_"'`~#@&*%€$§\\/]+""",
    toLowercase=True,
    inputCol="reviewTextLower",
    outputCol="tokens",
)

# Stage 2 - drop stop words from the token lists
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

# Stage 3 - learn the vocabulary from the corpus and emit term counts
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="raw_features")

# Stage 4 - re-weight the term counts to TF-IDF (minDocFreq=5)
idf = IDF(minDocFreq=5, inputCol="raw_features", outputCol="features")

# Stage 5 - keep the 2000 top features according to the chi-square test
# against the label column
selector = ChiSqSelector(
    featuresCol="features",
    labelCol="label",
    outputCol="selected_features",
    numTopFeatures=2000,
)

# Chain the stages; later cells index into the fitted model by these positions
feature_stages = [tokenizer, remover, vectorizer, idf, selector]
pipeline = Pipeline(stages=feature_stages)

In [5]:
# Train every stage of the feature pipeline on the loaded DataFrame
model = pipeline.fit(df)

# Fitted stages keep the order they were given to the Pipeline:
# position 2 is the CountVectorizerModel, position 4 the ChiSqSelectorModel
cv_model, selector_model = model.stages[2], model.stages[4]

vocab = cv_model.vocabulary
top_indices = selector_model.selectedFeatures

# Translate each selected feature index into its vocabulary term
top_terms = [vocab[idx] for idx in top_indices]

In [6]:
# Save top terms to output file, one term per line
output_path = "output_ds.txt"  # Or to HDFS if needed

# Write explicitly as UTF-8: review tokens may contain non-ASCII characters,
# and the platform default encoding is not guaranteed to be able to encode them.
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(f"{term}\n" for term in top_terms)

print(f"Top 2000 terms saved to {output_path}")

# NOTE: the Spark session is deliberately NOT stopped here. The Part 3 cells
# below still use `df` and fit further pipelines, which fails on a stopped
# session. Call spark.stop() only after the last cell that needs Spark.


Top 2000 terms saved to output_ds.txt


# Part 3

## Split

In [None]:
#two-step split: first 80/20 into (train+val)/test, then 75/25 of the 80% part,
#which yields 60% training, 20% validation and 20% test as the exercise asks
SPLIT_SEED = 42
train_val, test = df.randomSplit(weights=[0.8, 0.2], seed=SPLIT_SEED)
train, val = train_val.randomSplit(weights=[0.75, 0.25], seed=SPLIT_SEED)


## Classification pipeline and evaluation

In [None]:
#LinearSVC - SVM classifier with linear kernel
#OneVsRest - to handle multiclass classification by binary classifier
from pyspark.ml.classification import LinearSVC, OneVsRest

#to estimate performance using F1 as criterion
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#vector length normalization L2
from pyspark.ml.feature import Normalizer

In [None]:
#scale every selected-feature vector to unit L2 length (p=2.0)
normalizer = Normalizer(
    p=2.0,
    inputCol="selected_features",
    outputCol="norm_features",
)

#score predictions against the label column with the F1 metric
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)


## Grid search for parameter optimization

In [None]:
results = []

#Possible parameters for comparison (according to the exercise description):
#- standardization of training features (2 values): True or False
#- number of features: 2000 (given in the exercise), 500 (much heavier filtering)
#- regularization parameter (3 values): 0.1, 0.01 or 0.001
#- maximum number of iterations (2 values): 10 or 50
STANDARDIZATION_VALUES = [True, False]
NUM_FEATURES_VALUES = [2000, 500]
REG_PARAM_VALUES = [0.1, 0.01, 0.001]
MAX_ITER_VALUES = [10, 50]

#the feature stages (tokenizer ... normalizer) depend only on num_features, not on
#the SVM parameters, so they are fitted once per num_features value instead of once
#per grid point; the resulting feature vectors are cached and reused by every SVM fit
prepared = {}
for num_features in NUM_FEATURES_VALUES:
    #work on a copy so the shared `selector` stage is not mutated by the search
    grid_selector = selector.copy({selector.numTopFeatures: num_features})
    feature_pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, grid_selector, normalizer])
    feature_model = feature_pipeline.fit(train)

    #only the label and the normalized features are needed by the classifier and evaluator
    prepared[num_features] = (
        feature_model.transform(train).select("label", "norm_features").cache(),
        feature_model.transform(val).select("label", "norm_features").cache(),
    )

#iterating through the possible parameter combinations
#(same nesting order as the parameter list above, so `results` keeps a stable order)
for standardize in STANDARDIZATION_VALUES:
    for num_features in NUM_FEATURES_VALUES:
        train_features, val_features = prepared[num_features]

        for reg_param in REG_PARAM_VALUES:
            for max_iter in MAX_ITER_VALUES:
                #create the Linear SVM model with the current parameters and using the one vs all strategy
                svm = LinearSVC(
                    featuresCol="norm_features",
                    labelCol="label",
                    regParam=reg_param,
                    maxIter=max_iter,
                    standardization=standardize,
                )
                ovr = OneVsRest(classifier=svm, featuresCol="norm_features", labelCol="label")

                #train on the pre-computed train features
                ovr_model = ovr.fit(train_features)

                #evaluate on the pre-computed val features
                predictions = ovr_model.transform(val_features)
                f1 = evaluator.evaluate(predictions)

                #save results
                results.append({
                    "standardization": standardize,
                    "num_features": num_features,
                    "reg_param": reg_param,
                    "max_iter": max_iter,
                    "f1_score": f1
                })

#release the cached feature DataFrames once the search is finished
for train_features, val_features in prepared.values():
    train_features.unpersist()
    val_features.unpersist()


## Grid search results - best parameters

In [None]:
import pandas as pd

# Tabulate the grid-search records, highest F1 score first
results_df = pd.DataFrame(results).sort_values(by="f1_score", ascending=False)

print("Grid Search Results:")
print(results_df)

## Evaluate best model on test set

In [None]:
#re-train the best configuration from the grid search and score it on the test set
best_params = results_df.iloc[0]

#values pulled out of a mixed-dtype pandas row are NumPy scalars (e.g. numpy.bool_);
#pyspark's boolean param conversion only accepts the built-in bool, so every value
#is cast to the plain Python type before it is handed to Spark
best_num_features = int(best_params["num_features"])
best_reg_param = float(best_params["reg_param"])
best_max_iter = int(best_params["max_iter"])
best_standardization = bool(best_params["standardization"])

selector.setNumTopFeatures(best_num_features)

svm = LinearSVC(
    featuresCol="norm_features",
    labelCol="label",
    regParam=best_reg_param,
    maxIter=best_max_iter,
    standardization=best_standardization,
)
ovr = OneVsRest(classifier=svm, featuresCol="norm_features", labelCol="label")

final_pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, selector, normalizer, ovr])
final_model = final_pipeline.fit(train)

#predict and evaluate on test set
test_predictions = final_model.transform(test)
f1_test = evaluator.evaluate(test_predictions)

print("Best configuration F1 on test set:", f1_test)
