In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, StopWordsRemover, HashingTF, IDF, ChiSqSelector, CountVectorizer
from pyspark.ml.feature import Tokenizer, Normalizer, StringIndexer
from pyspark.ml.feature import SQLTransformer
from pyspark.sql.functions import col, lower
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator




from pyspark.mllib.util import MLUtils

import time

In [2]:
# Input locations of the Amazon review JSON files. Both live in the same
# directory: a smaller development subset and the full combined file.
_reviews_dir = "/user/dic24_shared/amazon-reviews/full"
partial_dataset = _reviews_dir + "/reviews_devset.json"
full_dataset = _reviews_dir + "/reviewscombined.json"


In [3]:
# Initialize (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Text_Classification")
    .getOrCreate()
)

# Load the reviews, keeping only the two columns the rest of the notebook uses.
try:
    df = spark.read.json(full_dataset).select("category", "reviewText")
except Exception as e:
    # Report the failure, then re-raise. Swallowing it would leave `df`
    # undefined and make the next cell fail with a misleading NameError
    # instead of showing the real cause here.
    print(f"Error: {e}")
    raise

print("File read successfully.")
df.printSchema()
df.show(5)

SLF4J: Class path contains multiple SLF4J bindings.

File read successfully.
root
 |-- category: string (nullable = true)
 |-- reviewText: string (nullable = true)

+--------------------+--------------------+
|            category|          reviewText|
+--------------------+--------------------+
|Patio_Lawn_and_Garde|This was a gift f...|
|Patio_Lawn_and_Garde|My husband rarely...|
|Patio_Lawn_and_Garde|This guy knows hi...|
|Patio_Lawn_and_Garde|THIS CAN BE FOUND...|
|Patio_Lawn_and_Garde|Quite good partic...|
+--------------------+--------------------+
only showing top 5 rows



# Part 1: RDDs

# Part 2: DataFrame-based Spark ML feature pipeline

In [4]:
# Case folding: overwrite reviewText with its lower-cased form so that all
# later text processing operates on lower-case input.
review_text_lowercased = lower(col("reviewText"))
df = df.withColumn("reviewText", review_text_lowercased)

In [None]:
# Feature-selection pipeline over the review DataFrame `df`:
#   category -> numeric label, reviewText -> tokens -> stop-word filtering
#   -> TF-IDF -> chi-square selection of the 2000 top features.
# The selected terms are written, sorted and space-separated, to output_ds.txt.

# Converting category to numeric
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

# Creating the pipeline:

# 1. tokenization
# With gaps=True the pattern describes the *delimiters*: whitespace, digits and
# the punctuation characters listed in the character class.
# The hyphen inside the class is escaped on purpose. Written unescaped as
# `=-_` it forms the range U+003D..U+005F, which silently turns `>`, `^` and
# A-Z into delimiters and covers `[` / `]` only by accident. The brackets are
# now listed explicitly instead.
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="words",
    pattern=r'\s+|\t+|\d+|[()\[\]{}.!?,;:+=\-_"\`~#@&*%€$§\\/\']+',
    gaps=True,
)

# 2. Stopwords removal
stopwords_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered_words")

# 3. tf-idf calculation with CountVectorizer
# NOTE: despite its name, `hashingTF` holds a CountVectorizer. The name is kept
# so the commented-out HashingTF alternative below can be swapped in directly.
hashingTF = CountVectorizer(inputCol=stopwords_remover.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")

# # 4. TF-IDF calculation with HashingTF
# hashingTF = HashingTF(inputCol=stopwords_remover.getOutputCol(), outputCol="rawFeatures", numFeatures=20)
# idf = IDF(inputCol="rawFeatures", outputCol="features")

# 4. Chi-square
selector = ChiSqSelector(numTopFeatures=2000, featuresCol=idf.getOutputCol(),
                         outputCol="selectedFeatures", labelCol=indexer.getOutputCol())


# Keep the stage list in a variable so fitted stages can be looked up by
# position below without hard-coded indices.
stages = [indexer, tokenizer, stopwords_remover, hashingTF, idf, selector]
pipeline = Pipeline(stages=stages)


start_time = time.time()

# Fit the model
model = pipeline.fit(df)

fit_time = time.time() - start_time
print(f'fit_time={fit_time}')

# Apply the fitted pipeline to the same DataFrame it was fitted on.
# NOTE: transform() is lazy in Spark; without a subsequent action this timing
# covers building the query plan only, not the actual computation.
transform_start = time.time()
result = model.transform(df)

transform_time = time.time() - transform_start
print(f'transform_time={transform_time}')

# Extract the CountVectorizer model and ChiSqSelector model from the pipeline.
# The fitted PipelineModel keeps its stages in the same order as `stages`.
cv_model = model.stages[stages.index(hashingTF)]
selector_model = model.stages[stages.index(selector)]

# Get the vocabulary
vocab = cv_model.vocabulary

# Indices (into the feature vector / vocabulary) of the selected features
top2000 = selector_model.selectedFeatures

# Map indices to terms and sort them
selected_terms = sorted(vocab[index] for index in top2000)

# Write the terms space-separated, without a trailing separator. The encoding
# is explicit so non-ASCII terms do not depend on the machine's locale.
with open("output_ds.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(selected_terms))


In [None]:
# Sanity check: show how many terms were selected (expected to be at most the
# numTopFeatures value configured on the chi-square selector).
len(selected_terms)