In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, CountVectorizer,
    IDF, ChiSqSelector
)
from pyspark.sql.functions import col, lower

# Create the Spark session for this notebook (or reuse one that is already running)
spark = (
    SparkSession.builder
    .appName("AmazonReviewsTFIDF")
    .getOrCreate()
)


SLF4J: Class path contains multiple SLF4J bindings.


25/05/10 19:25:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Location of the Amazon reviews development set on HDFS
input_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"

# Load the reviews and derive two helper columns in one chain:
#   reviewTextLower - lower-cased review text, so case variants of a word collapse into one token
#   label           - copy of the "overall" column, used as the label column by the later stages
df = (
    spark.read.json(input_path)
    .withColumn("reviewTextLower", lower(col("reviewText")))
    .withColumn("label", col("overall"))
)

In [3]:
# Display the DataFrame repr to check the column names and types after adding the helper columns
df

DataFrame[asin: string, category: string, helpful: array<bigint>, overall: double, reviewText: string, reviewTime: string, reviewerID: string, reviewerName: string, summary: string, unixReviewTime: bigint, reviewTextLower: string, label: double]

In [None]:
# Stage 1 - tokenization: the pattern matches runs of whitespace, digits and punctuation
tokenizer = RegexTokenizer(
    pattern=r"""[\s\d()\[\]{}.!?,;:+=\-_"'`~#@&*%€$§\\/]+""",
    inputCol="reviewTextLower",
    outputCol="tokens",
    toLowercase=True,
)

# Stage 2 - stop-word removal (no custom word list is passed)
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

# Stage 3 - term counts per review
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="raw_features")

# Stage 4 - TF-IDF weighting of the raw counts, with a minimum document frequency of 5
idf = IDF(minDocFreq=5, inputCol="raw_features", outputCol="features")

# Stage 5 - chi-square feature selection against the label column, keeping the top 2000 features
selector = ChiSqSelector(
    featuresCol="features",
    labelCol="label",
    outputCol="selected_features",
    numTopFeatures=2000,
)

# Chain the five stages in processing order
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, selector])

In [None]:
# Fit all pipeline stages on the full data set
model = pipeline.fit(df)

# Fitted stages needed to map selected feature indices back to words; the positions follow
# the stage order given to the Pipeline (0 tokenizer, 1 remover, 2 vectorizer, 3 idf, 4 selector)
cv_model, selector_model = model.stages[2], model.stages[4]

vocab = cv_model.vocabulary
top_indices = selector_model.selectedFeatures

# Translate every selected feature index into its vocabulary term
top_terms = [vocab[feature_idx] for feature_idx in top_indices]

In [None]:
# Write the selected terms to a local text file, one term per line
output_path = "output_ds.txt"
# Explicit UTF-8: the terms come from free-form review text and may contain non-ASCII characters,
# which the platform-default encoding is not guaranteed to be able to encode
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(term + "\n" for term in top_terms)

print(f"Top 2000 terms saved to {output_path}")

# Do not stop the Spark session here - it is still needed for Part 3 and is stopped at the very end
#spark.stop()


Top 2000 terms saved to output_ds.txt


In [7]:
# NOTE(review): looks like a leftover scratch/debug cell - no other cell uses its output; consider deleting it
print("hi")

hi


# Part 3

## Split

In [None]:
# During the grid search one split turned out to contain only a single class, so check how many
# samples each class has - here on the full data set, and again after splitting further below

from pyspark.sql.functions import count

# Number of reviews per distinct "label" value, most frequent first
label_counts = df.groupBy("label").agg(count("*").alias("count"))
label_counts.orderBy("count", ascending=False).show(50, truncate=False)

+-----+-----+
|label|count|
+-----+-----+
|5.0  |46957|
|4.0  |15239|
|3.0  |6644 |
|1.0  |6095 |
|2.0  |3894 |
+-----+-----+



In [None]:
from pyspark.ml.feature import StringIndexer

# Index the "label" values into an "indexedLabel" column, which OneVsRest uses as its label column.
# Drop any "indexedLabel" left over from an earlier run of this cell first: StringIndexer refuses to
# write to an output column that already exists, so without this the cell fails when re-executed.
# drop() is a no-op when the column is absent, so the first run behaves exactly as before.
df = df.drop("indexedLabel")
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df)
df = indexer.transform(df)

# The chi-square selector must score features against the indexed label from now on
selector.setParams(labelCol="indexedLabel")

# Two-step split into training (60%), validation (20%) and test (20%) sets, as the exercise requires:
# 80/20 first, then the 80% part is split 75/25 (0.8 * 0.75 = 0.6 and 0.8 * 0.25 = 0.2)
train_val, test = df.randomSplit([0.8, 0.2], seed=42)
train, val = train_val.randomSplit([0.75, 0.25], seed=42)


In [None]:
#training set
print("Training set class distribution:")
train.groupBy("indexedLabel").agg(count("*").alias("count")).orderBy("count", ascending=False).show(50, truncate=False)

#validation set
print("Validation set class distribution:")
val.groupBy("indexedLabel").agg(count("*").alias("count")).orderBy("count", ascending=False).show(50, truncate=False)

#test set
print("Test set class distribution:")
test.groupBy("indexedLabel").agg(count("*").alias("count")).orderBy("count", ascending=False).show(50, truncate=False)

#sample sizes are enough, so we might have a problem with OneVsRest identifying the classes -> indexing might help

Training set class distribution:
+------------+-----+
|indexedLabel|count|
+------------+-----+
|0.0         |28299|
|1.0         |9188 |
|2.0         |4027 |
|3.0         |3650 |
|4.0         |2326 |
+------------+-----+

Validation set class distribution:
+------------+-----+
|indexedLabel|count|
+------------+-----+
|0.0         |9354 |
|1.0         |2982 |
|2.0         |1328 |
|3.0         |1232 |
|4.0         |779  |
+------------+-----+

Test set class distribution:
+------------+-----+
|indexedLabel|count|
+------------+-----+
|0.0         |9304 |
|1.0         |3069 |
|2.0         |1289 |
|3.0         |1213 |
|4.0         |789  |
+------------+-----+



## Classification pipeline and evaluation

In [None]:
#LinearSVC - SVM classifier with linear kernel
#OneVsRest - to handle multiclass classification by binary classifier
from pyspark.ml.classification import LinearSVC, OneVsRest

#to estimate performance using F1 as criterion
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#vector length normalization L2
from pyspark.ml.feature import Normalizer

In [None]:
# To make the grid search more efficient, run the TF-IDF preprocessing only once and cache the result
vectorizer.setParams(outputCol="raw_features", vocabSize=5000)
idf.setParams(inputCol="raw_features", outputCol="features", minDocFreq=5)

# Preprocessing stages only (no feature selection); fitted on the training split
pre_stages = [tokenizer, remover, vectorizer, idf]
pre_pipeline = Pipeline(stages=pre_stages)
pre_model = pre_pipeline.fit(train)

# Apply the fitted preprocessing to the training and validation splits and mark both for caching
pre_train = pre_model.transform(train)
pre_train = pre_train.cache()
pre_val = pre_model.transform(val)
pre_val = pre_val.cache()

# Run an action on each so the cached data is materialized now rather than during the first grid-search run
pre_train.count()
pre_val.count()

15675

In [None]:
# L2 (p=2.0) vector-length normalization of the selected features
normalizer = Normalizer(
    p=2.0,
    inputCol="selected_features",
    outputCol="norm_features",
)

# F1 metric computed from the indexed label and the "prediction" column
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="indexedLabel",
    predictionCol="prediction",
)


## Grid search for parameter optimization

In [None]:
import itertools
import time
import pandas as pd

# Parameter values compared in the grid search (according to the exercise description):
# - standardization of training features (2 values): True or False
# - number of selected features (2 values): 2000 (given in the exercise) or 500 (much heavier filtering)
# - regularization parameter (3 values): 0.1, 0.01 or 0.001
# - maximum number of iterations (2 values): 10 or 50
standardization_values = [True, False]
num_features_values = [2000, 500]
reg_param_values = [0.1, 0.01, 0.001]
max_iter_values = [10, 50]

# itertools.product varies the last factor fastest, i.e. it visits the combinations in the same
# order as four nested for-loops would (standardization outermost, max_iter innermost)
param_grid = list(itertools.product(
    standardization_values, num_features_values, reg_param_values, max_iter_values
))
# Derived from the grid itself so the progress counter cannot drift from the value lists
total_runs = len(param_grid)

results = []

total_start_time = time.time()

# "\n" (not "\\n"): the escaped form printed a literal backslash-n instead of a blank line
print(f"Starting grid search with {total_runs} configurations...\n")

# Iterate over every parameter combination: fit on the cached training data, score on validation
for run_num, (standardize, num_features, reg_param, max_iter) in enumerate(param_grid, start=1):
    run_start = time.time()
    print(f"[{run_num}/{total_runs}] std={standardize}, features={num_features}, reg={reg_param}, iter={max_iter}")

    try:
        selector = ChiSqSelector(featuresCol="features", outputCol="selected_features", labelCol="indexedLabel", numTopFeatures=num_features)
        normalizer = Normalizer(inputCol="selected_features", outputCol="norm_features", p=2.0)

        # Linear SVM with the current parameters, wrapped in a one-vs-rest strategy for multiclass labels
        svm = LinearSVC(featuresCol="norm_features", labelCol="indexedLabel", regParam=reg_param, maxIter=max_iter, standardization=standardize)
        ovr = OneVsRest(classifier=svm, featuresCol="norm_features", labelCol="indexedLabel")

        # Feature selection, normalization and classifier on top of the already preprocessed data
        pipeline = Pipeline(stages=[selector, normalizer, ovr])

        # Train on the training set
        print("fitting model")
        model = pipeline.fit(pre_train)

        # Evaluate on the validation set
        print("predictions")
        predictions = model.transform(pre_val)
        print("evaluation")
        f1 = evaluator.evaluate(predictions)

        # Save the result of this configuration
        print("appending results")
        results.append({
            "standardization": standardize,
            "num_features": num_features,
            "reg_param": reg_param,
            "max_iter": max_iter,
            "f1_score": f1
        })

        duration = time.time() - run_start
        print(f" → F1 score: {f1:.4f} (took {duration:.1f}s)\n")

    except Exception as e:
        # Best effort: a failing configuration is reported and skipped so the remaining ones still run
        print(f"!! Skipping due to error: {e}")

total_time = time.time() - total_start_time
print(f"Total grid search duration: {total_time/60:.2f} minutes")


Starting grid search with 24 configurations...\n
[1/24] std=True, features=2000, reg=0.1, iter=10
fitting model
predictions
evaluation
appending results
 → F1 score: 0.5613 (took 37.3s)\n
[2/24] std=True, features=2000, reg=0.1, iter=50
fitting model
predictions
evaluation
appending results
 → F1 score: 0.5449 (took 60.6s)\n
[3/24] std=True, features=2000, reg=0.01, iter=10
fitting model
predictions
evaluation
appending results
 → F1 score: 0.5901 (took 24.1s)\n
[4/24] std=True, features=2000, reg=0.01, iter=50
fitting model
predictions
evaluation
appending results
 → F1 score: 0.5651 (took 56.5s)\n
[5/24] std=True, features=2000, reg=0.001, iter=10
fitting model
predictions
evaluation
appending results
 → F1 score: 0.5918 (took 22.8s)\n
[6/24] std=True, features=2000, reg=0.001, iter=50
fitting model
predictions
evaluation
appending results
 → F1 score: 0.5686 (took 53.8s)\n
[7/24] std=True, features=500, reg=0.1, iter=10
fitting model
predictions
evaluation
appending results
 → F1 sc

## Grid search results - best parameters

In [None]:
# Collect the grid-search records in a table, best F1 score first
results_df = pd.DataFrame(results).sort_values(by="f1_score", ascending=False)

print("Grid Search Results:")
print(results_df)

# Persist the full table (without the index column) next to the notebook
results_df.to_csv("grid_search_results.csv", index=False)
print("Grid search results saved to grid_search_results.csv")

Grid Search Results:
    standardization  num_features  reg_param  max_iter  f1_score
4              True          2000      0.001        10  0.591755
2              True          2000      0.010        10  0.590108
5              True          2000      0.001        50  0.568629
3              True          2000      0.010        50  0.565129
0              True          2000      0.100        10  0.561325
10             True           500      0.001        10  0.555844
11             True           500      0.001        50  0.555489
8              True           500      0.010        10  0.555182
9              True           500      0.010        50  0.548708
1              True          2000      0.100        50  0.544945
6              True           500      0.100        10  0.535852
7              True           500      0.100        50  0.531492
22            False           500      0.001        10  0.490120
23            False           500      0.001        50  0.484537
16  

## Evaluate best model on test set

In [None]:
# Best configuration = first row, because results_df is sorted by F1 score in descending order
best_params = results_df.iloc[0]

# The values are read back from a pandas row, so convert each one to the plain Python type the
# Spark parameter expects (previously only max_iter and standardization were converted)
selector.setNumTopFeatures(int(best_params["num_features"]))

svm = LinearSVC(
    featuresCol="norm_features",
    labelCol="indexedLabel",
    regParam=float(best_params["reg_param"]),
    maxIter=int(best_params["max_iter"]),
    standardization=bool(best_params["standardization"]),
)
ovr = OneVsRest(classifier=svm, featuresCol="norm_features", labelCol="indexedLabel")

# Full pipeline from raw text to prediction, fitted on the training split
final_pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, selector, normalizer, ovr])
final_model = final_pipeline.fit(train)

# Predict and evaluate on the held-out test set
test_predictions = final_model.transform(test)
f1_test = evaluator.evaluate(test_predictions)

print("Best configuration F1 on test set:", f1_test)

# "\n" (not "\\n"): the escaped form wrote a literal backslash-n, leaving the whole report on one line.
# Explicit UTF-8 so the file content does not depend on the platform-default encoding.
with open("final_model_results.txt", "w", encoding="utf-8") as f:
    f.write("Best Configuration:\n")
    f.write(str(best_params.to_dict()) + "\n")
    f.write(f"F1 Score on Test Set: {f1_test:.4f}\n")

print("Final model info saved to final_model_results.txt")


Best configuration F1 on test set: 0.5792334853581884
Final model info saved to final_model_results.txt


In [None]:
# All parts of the notebook are finished - stop the Spark session (deliberately kept alive after Part 1)
spark.stop()