In [16]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, StopWordsRemover, HashingTF, IDF, ChiSqSelector, CountVectorizer
from pyspark.ml.feature import Tokenizer, Normalizer, StringIndexer
from pyspark.sql.functions import col, lower

from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from pyspark.mllib.util import MLUtils

import time

In [17]:
# Input locations of the Amazon review JSON files.
# `full_dataset` points at the combined review file; presumably the complete
# corpus (inferred from the file name -- confirm before a long run).
full_dataset = "/user/dic24_shared/amazon-reviews/full/reviewscombined.json"
# `partial_dataset` is the file the notebook actually loads below; judging by
# its name it is a smaller development subset.
partial_dataset = "/user/dic24_shared/amazon-reviews/full/reviews_devset.json"


In [18]:
# Initialize the Spark session (reuses an already running one if present).
spark = SparkSession.builder \
    .appName("Text_Classification") \
    .getOrCreate()

# Load the review data. Only the read itself is guarded: if it fails we report
# the error and re-raise, because every later cell depends on `df` and would
# otherwise fail with a confusing NameError far away from the real cause.
try:
    df = spark.read.json(partial_dataset)
except Exception as e:
    print(f"Error: {e}")
    raise
else:
    print("File read successfully.")

    # Show the schema and some data
    df.printSchema()
    df.show(5)

File read successfully.
root
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)

+----------+--------------------+-------+-------+--------------------+-----------+--------------+--------------------+-------------------+--------------+
|      asin|            category|helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|            summary|unixReviewTime|
+----------+--------------------+-------+-------+--------------------+-----------+--------------+--------------------+-------------------+--------------+
|0981850006|Patio_Lawn_and_Garde| [6, 7]|    5.0|This wa

# Part 2: Datasets/DataFrames: Spark ML and Pipelines

In [19]:
# Case folding: replace the review text with its lower-cased form so that
# later token matching does not depend on capitalisation.
review_text_lowered = lower(col("reviewText"))
df = df.withColumn("reviewText", review_text_lowered)

In [None]:
# Converting category to a numeric label index
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

# Creating the pipeline:

# 1. Tokenization: split on whitespace, digits and the listed delimiter characters.
#    The hyphen is escaped on purpose. Unescaped, "=-_" inside the character
#    class is a *range* ('=' .. '_'), which made A-Z, '>', '[', ']' and '^'
#    delimiters while the hyphen itself was not one. The non-letter characters
#    that range used to cover are now listed explicitly, so the only change for
#    lower-cased text is that '-' finally splits tokens as written.
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="words",
    pattern=r'\s+|\t+|\d+|[(){}\[\].!?,;:+=\->_"\`~#@&*%€$§\\/^]+',
    gaps=True,
)

# 2. Stopwords removal
stopwords_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered_words")

# 3. TF-IDF calculation. Term frequencies come from a CountVectorizer (not
#    HashingTF, despite the variable name, which is kept so that other cells
#    referring to it keep working). CountVectorizer retains a vocabulary, which
#    is needed below to map feature indices back to terms.
hashingTF = CountVectorizer(inputCol=stopwords_remover.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")

# 4. Chi-square feature selection: keep the 2000 top features w.r.t. the label
selector = ChiSqSelector(numTopFeatures=2000, featuresCol=idf.getOutputCol(),
                         outputCol="selectedFeatures", labelCol=indexer.getOutputCol())


pipeline = Pipeline(stages=[indexer, tokenizer, stopwords_remover, hashingTF, idf, selector])


start_time = time.time()
model = pipeline.fit(df)
fit_time = time.time() - start_time
print(f'fit_time={fit_time}')

# Transform the data. DataFrame transformations are evaluated lazily, so this
# timing mostly reflects building the plan, not computing the features.
transform_start = time.time()
result = model.transform(df)
transform_time = time.time() - transform_start
print(f'transform_time={transform_time}')

# Extract the fitted CountVectorizer model from the pipeline
# (index 3 = position of the CountVectorizer in the `stages` list above).
count_vectorizer_model = model.stages[3]
vocab = count_vectorizer_model.vocabulary

# Get the selected feature indices from the fitted selector (last stage).
# These refer to the selector's input vector, i.e. to vocabulary positions.
# The indices stored inside the "selectedFeatures" column must NOT be used for
# this: those vectors are re-indexed to the reduced feature space, so looking
# them up in `vocab` yields unrelated terms (and requires collecting every row).
selector_model = model.stages[-1]
selected_indices = list(selector_model.selectedFeatures)

# Convert the indices to terms (one entry per selected feature, no duplicates)
selected_terms = [vocab[index] for index in selected_indices]

# Write the terms space-separated; explicit UTF-8 because the vocabulary may
# contain non-ASCII tokens and the platform default encoding may not.
with open("output_ds.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(selected_terms))
