In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, CountVectorizer, IDF,
    ChiSqSelector, StringIndexer, Normalizer
)
from pyspark.ml.classification import (LinearSVC, OneVsRest)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


Starting Spark:

In [2]:
# Attach to an already-running Spark session if there is one; otherwise
# create a new session under the application name "TFIDF_pipeline".
spark = (
    SparkSession.builder
    .appName("TFIDF_pipeline")
    .getOrCreate()
)


SLF4J: Class path contains multiple SLF4J bindings.

25/05/11 00:03:07 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Loading the dataset, stopwords, and delimiters:

In [3]:
# Regex character class listing every character treated as a token separator:
# whitespace, digits, brackets, punctuation and assorted symbols. The trailing
# "+" makes a run of consecutive separators match as a single delimiter.
delimiters = r"""[ \t\n\d\(\)\[\]\{\}\.\!\?,;:\+=\-_\"'`~#@&\*\%€\$§\\/]+"""

# HDFS location of the review dataset (JSON).
dataset_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"

In [4]:
# Load the reviews and keep only the two columns used downstream: the raw
# review text and the product category.
df = spark.read.json(dataset_path).select("reviewText", "category")

# Read the stopword list, one entry per line. The context manager closes the
# file handle deterministically instead of leaving it to the garbage
# collector, and the explicit encoding removes the dependency on the
# platform's default encoding.
with open("stopwords.txt", encoding="utf-8") as stopwords_file:
    stopwords = set(stopwords_file.read().splitlines())

Tokenization:

In [5]:
# Split each review into lowercase terms. `delimiters` is passed as the
# tokenizer's regex pattern; no `gaps` argument is given, so RegexTokenizer's
# default mode applies.
tokenizer = RegexTokenizer(
    pattern=delimiters,
    toLowercase=True,
    inputCol="reviewText",
    outputCol="terms",
)

Stopwords Removal:

In [6]:
# Filter the stopwords out of the tokenized reviews ("terms" -> "terms_clean").
remover = StopWordsRemover(
    stopWords=list(stopwords),  # the stopword set is handed over as a list
    inputCol="terms",
    outputCol="terms_clean",
)

TF-IDF Calculation:

In [7]:
# Term frequency: count how often each cleaned term occurs per review.
c_vec = CountVectorizer(outputCol="rawFeatures", inputCol="terms_clean")

# Inverse document frequency: re-weight the raw counts into the final
# "features" vector.
idf = IDF(outputCol="features", inputCol="rawFeatures")

Chi-square Selection:

In [8]:
# The category column holds strings; StringIndexer maps it to the numeric
# "label" column that the chi-square selector reads.
label_indexer = StringIndexer(outputCol="label", inputCol="category")

# Keep the 2000 top features chosen by the chi-square selector against the label.
selector = ChiSqSelector(
    labelCol="label",
    featuresCol="features",
    outputCol="selectedFeatures",
    numTopFeatures=2000,
)

Combining everything into Pipeline:

In [9]:
# Feature-extraction pipeline. The order matters: every stage consumes a
# column produced by an earlier one.
pipeline = Pipeline(stages=[tokenizer, remover, c_vec, idf, label_indexer, selector])

In [10]:
# Fit every pipeline stage on the full dataset.
model = pipeline.fit(df)

# Run the same data through the fitted stages to obtain the derived columns.
result = model.transform(df)

In [11]:
# Fitted stages keep the order of the pipeline definition: index 2 is the
# fitted CountVectorizer, index 5 the fitted chi-square selector.
cv_model, selector_model = model.stages[2], model.stages[5]

vocab = cv_model.vocabulary
selected_indices = selector_model.selectedFeatures

# Translate the selected feature indices back into their vocabulary terms.
selected_terms = [vocab[idx] for idx in selected_indices]

# Write the selected terms in sorted order, one per line.
with open("output_ds.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(f"{term}\n" for term in sorted(selected_terms))

# Part 3: Text Classification

In [None]:


# Scale every selected-feature vector to unit L2 norm.
normalizer = Normalizer(
    p=2.0,
    inputCol="selectedFeatures",
    outputCol="normalizedFeatures",
)

# LinearSVC is wrapped in OneVsRest so it can be applied to the
# multi-category label.
svm = LinearSVC(labelCol="label", featuresCol="normalizedFeatures")
ovr = OneVsRest(
    labelCol="label",
    featuresCol="normalizedFeatures",
    classifier=svm,
)

# Full classification pipeline: the feature-extraction stages followed by
# normalization and the classifier.
pipeline = Pipeline(
    stages=[tokenizer, remover, c_vec, idf, label_indexer, selector, normalizer, ovr]
)



In [None]:

# Hyper-parameter grid: 2 x 3 x 2 x 2 = 24 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(selector.numTopFeatures, [50, 2000])
    .addGrid(svm.regParam, [0.1, 0.01, 0.001])
    .addGrid(svm.standardization, [True, False])
    .addGrid(svm.maxIter, [10, 100])
    .build()
)

# Candidates are ranked by F1 score over the predicted labels.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)

# 70 % training / 20 % validation / 10 % test, seeded for reproducibility.
trainDF, valDF, testDF = df.randomSplit([0.7, 0.2, 0.1], seed=42)



In [None]:

# Manual grid search: fit one pipeline per hyper-parameter combination on the
# training split, score it on the validation split, and keep the best one.
bestModel = None
# Start below any attainable score so the first candidate is always accepted.
# Starting at 0.0 with a strict ">" would leave bestModel as None whenever
# every candidate scores exactly 0.0.
bestF1 = float("-inf")

for params in param_grid:
    # copy pipeline & apply this set of hyper-params
    model = pipeline.copy(params).fit(trainDF)
    f1 = evaluator.evaluate(model.transform(valDF))
    if f1 > bestF1:
        bestF1, bestModel = f1, model

# Fail with a clear message instead of an AttributeError on None below.
if bestModel is None:
    raise ValueError(
        "grid search produced no model: param_grid is empty or every F1 score was NaN"
    )

print("Best F1 on validation set:", bestF1)

# now test-set performance
testPreds = bestModel.transform(testDF)
testF1 = evaluator.evaluate(testPreds)
print("F1 on test set:", testF1)