## 1. Setup & Imports


In [11]:
import os
from pathlib import Path
import warnings
import zipfile

import rootutils

# Point Spark's local scratch space at a "spark-tmp" directory under the user's home.
spark_tmp_dir = Path.home() / "spark-tmp"
os.environ["SPARK_LOCAL_DIRS"] = str(spark_tmp_dir)

# Locate the project through its ".project-root" marker file, searching from the
# current working directory; pythonpath=True is what later lets `src.*` be imported.
rootutils.setup_root(
    Path.cwd(),
    indicator=".project-root",
    pythonpath=True,
)

# Use PROJECT_ROOT when it is set, otherwise fall back to the working directory.
project_root = os.environ.get("PROJECT_ROOT", Path.cwd())
ROOT_DIR = Path(project_root)
print(f"Project root: {ROOT_DIR}")

# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

Project root: /Users/olehyaiechnyk/PycharmProjects/amazon-reviews-analysis


In [12]:
import os
import subprocess

# Memory settings handed to the JVM that PySpark launches; they have to be in the
# environment before the Spark session is built in the next section.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--driver-memory 8g --conf spark.executor.memory=8g pyspark-shell"

# NOTE(review): machine-specific macOS path to a Temurin 17 JDK — adjust on other hosts.
JAVA17 = "/Library/Java/JavaVirtualMachines/temurin-17.jdk/Contents/Home"
os.environ["JAVA_HOME"] = JAVA17

# Put the JDK's bin directory at the front of PATH, but only when it is not
# already there: re-running this cell must not keep growing PATH, and an unset
# PATH must not raise KeyError.
java_bin = f"{JAVA17}/bin"
current_path = os.environ.get("PATH", "")
if current_path.split(os.pathsep)[0] != java_bin:
    os.environ["PATH"] = f"{java_bin}{os.pathsep}{current_path}" if current_path else java_bin

# `java -version` writes its banner to stderr, so that is the stream we print.
print(subprocess.run(["java", "-version"], capture_output=True, text=True).stderr)

openjdk version "17.0.15" 2025-04-15
OpenJDK Runtime Environment Temurin-17.0.15+6 (build 17.0.15+6)
OpenJDK 64-Bit Server VM Temurin-17.0.15+6 (build 17.0.15+6, mixed mode, sharing)



## 2. Initialize Spark


In [13]:
from src.amazon_reviews_analysis.utils import build_spark

# Build the session through the project's helper and keep a handle on its context.
spark = build_spark()
spark_context = spark.sparkContext

print("âœ“ Spark Session created successfully!")
print(f"Spark Version: {spark.version}")
print(f"Spark App Name: {spark_context.appName}")
print(f"Spark Master: {spark_context.master}")
print(f"Spark UI: {spark_context.uiWebUrl}")

âœ“ Spark Session created successfully!
Spark Version: 4.0.1
Spark App Name: AmazonReviews
Spark Master: local[*]
Spark UI: http://172.20.10.5:4041


## 3. Load Data


In [14]:
# Archive with the classification dataset and the directory it is unpacked into.
data_dir = ROOT_DIR / "data"
DATA_ZIP = data_dir / "classification_reviews.zip"
EXTRACT_DIR = data_dir / "classification"

# Unpack only when the target directory is absent; an existing directory is
# taken as proof that extraction already happened.
if EXTRACT_DIR.exists():
    print("âœ“ Data already extracted")
else:
    print(f"ðŸ“¦ Extracting {DATA_ZIP}...")
    with zipfile.ZipFile(DATA_ZIP, "r") as archive:
        archive.extractall(EXTRACT_DIR)
    print("âœ“ Extraction complete!")

print(f"\nData location: {EXTRACT_DIR}")

âœ“ Data already extracted

Data location: /Users/olehyaiechnyk/PycharmProjects/amazon-reviews-analysis/data/classification


In [15]:
# Load the parquet data found under the extraction directory.
parquet_path = str(EXTRACT_DIR)
df = spark.read.parquet(parquet_path)

# Row count first (this triggers a full scan), then the column list and schema.
total_records = df.count()
print(f"Total records: {total_records:,}")
print(f"\nColumns: {df.columns}")
df.printSchema()

Total records: 35,202,489

Columns: ['rating', 'title', 'text', 'verified_purchase', 'parent_asin', 'category_label', 'label']
root
 |-- rating: double (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- category_label: string (nullable = true)
 |-- label: integer (nullable = true)



In [16]:
# Peek at the first five rows, cutting string cells at 50 characters.
df.show(n=5, truncate=50)

+------+-------------------------------+--------------------------------------------------+-----------------+-----------+--------------+-----+
|rating|                          title|                                              text|verified_purchase|parent_asin|category_label|label|
+------+-------------------------------+--------------------------------------------------+-----------------+-----------+--------------+-----+
|   5.0|  Perfect for my granddaughters|               Just what my granddaughters wanted.|             true| B0771XZ99Y|        sports|    2|
|   3.0|             Pretty but Fragile|It makes an amusing popping sound but I thought...|             true| B0814BFFJH|        sports|    1|
|   5.0|                    Love these!|I had been searching for a while for some comfo...|             true| B08DXCXYK9|        sports|    2|
|   5.0|                     Five Stars|           They worked just as described. Thanks !|             true| B002QG1WJY|        sports|    2|

## 4. Data Exploration


In [17]:
# Class balance of the target (label: 0=negative, 1=neutral, 2=positive)
label_counts = df.groupBy("label").count()
label_counts.orderBy("label").show()



+-----+--------+
|label|   count|
+-----+--------+
|    0| 5372399|
|    1| 2451737|
|    2|27378353|
+-----+--------+



                                                                                

In [18]:
from pyspark.sql.functions import col, count, when, isnan

# Column names shared by the preprocessing, feature-engineering and inference cells.
TARGET_COL = "label"  # sentiment class: 0=negative, 1=neutral, 2=positive
TEXT_COL = "text"  # column holding the review text

## 5. Data Preprocessing


In [19]:
from pyspark.sql.functions import col

# The target already uses the 0/1/2 encoding, so the only change is the dtype:
# the label is cast to double for MLlib and written back under the name "label".
label_as_double = col(TARGET_COL).cast("double")
df_clean = df.withColumn("label", label_as_double)

clean_count = df_clean.count()
print(f"Clean dataset: {clean_count:,} records")
print("\nLabel distribution (0=negative, 1=neutral, 2=positive):")
clean_label_counts = df_clean.groupBy("label").count()
clean_label_counts.orderBy("label").show()

Clean dataset: 35,202,489 records

Label distribution (0=negative, 1=neutral, 2=positive):




+-----+--------+
|label|   count|
+-----+--------+
|  0.0| 5372399|
|  1.0| 2451737|
|  2.0|27378353|
+-----+--------+



                                                                                

In [20]:
# Seeded 80/20 split of the prepared data into training and test sets.
train_df, test_df = df_clean.randomSplit(weights=[0.8, 0.2], seed=42)

train_count = train_df.count()
print(f"Training set: {train_count:,} records")
test_count = test_df.count()
print(f"Test set: {test_count:,} records")

                                                                                

Training set: 28,158,683 records




Test set: 7,043,806 records


                                                                                

## 6. Feature Engineering Pipeline


In [21]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

# Stage 1: split the review text into a "words" column.
tokenizer = Tokenizer(
    inputCol=TEXT_COL,
    outputCol="words",
)

# Stage 2: drop stop words from the token list.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)

# Stage 3: hash the remaining tokens into a 10,000-dimensional term-frequency vector.
hashing_tf = HashingTF(
    inputCol="filtered_words",
    outputCol="raw_features",
    numFeatures=10000,
)

# Stage 4: rescale the term frequencies with IDF; the result is the "features" column.
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)

print("âœ“ Feature transformers defined")

âœ“ Feature transformers defined


## 7. Model Training


In [22]:
from pyspark.ml.classification import LinearSVC, OneVsRest

# Linear SVM base learner, wrapped in one-vs-rest to handle the three label values.
svm = LinearSVC(maxIter=100, regParam=0.1)
ovr = OneVsRest(classifier=svm)

# Feature stages followed by the classifier, in execution order.
stages = [tokenizer, remover, hashing_tf, idf, ovr]
pipeline = Pipeline(stages=stages)

stage_names = [type(stage).__name__ for stage in pipeline.getStages()]
print(f"Stages: {stage_names}")

Stages: ['Tokenizer', 'StopWordsRemover', 'HashingTF', 'IDF', 'OneVsRest']


In [23]:
# Fit on a seeded 30% sample (without replacement) of the training split.
# The sample gets its own name instead of overwriting `train_df`: rebinding
# `train_df` made this cell non-idempotent (a second run would train on 30% of
# 30%) and discarded the full training split.
TRAIN_SAMPLE_FRACTION = 0.3
train_sample_df = train_df.sample(withReplacement=False, fraction=TRAIN_SAMPLE_FRACTION, seed=42)

print("ðŸš€ Training model...")
model = pipeline.fit(train_sample_df)
print("âœ“ Training complete!")

ðŸš€ Training model...


25/12/03 12:01:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

âœ“ Training complete!


## 8. Model Evaluation


In [24]:
# Run the fitted pipeline over the test split and preview ten predictions
# next to the review text and the true label.
predictions = model.transform(test_df)

preview_columns = [TEXT_COL, "label", "prediction"]
predictions.select(*preview_columns).show(n=10, truncate=50)

[Stage 1606:>                                                       (0 + 1) / 1]

+--------------------------------------------------+-----+----------+
|                                              text|label|prediction|
+--------------------------------------------------+-----+----------+
|DO NOT BUY!!!! I thought I was buying just a se...|  0.0|       0.0|
|Not quite two years since purchasing this tread...|  0.0|       0.0|
|Product did not preform as advertised.<br />The...|  0.0|       2.0|
|Ordered this DONUT BEACH TOWEL..... Received a ...|  0.0|       0.0|
|I dislike that socks are expensive and there is...|  0.0|       0.0|
|Horrible!  Somehow, their definition of &#34;as...|  0.0|       0.0|
|We recently went on a two-week vacation camping...|  0.0|       0.0|
|Had to review it low. Box showed up, after very...|  0.0|       0.0|
|                      $17 over MSRP?!!! Seriously?|  0.0|       2.0|
|When you pay over $20 for a very simple 3" clea...|  0.0|       2.0|
+--------------------------------------------------+-----+----------+
only showing top 10 

                                                                                

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")


def _evaluate_metric(metric_name):
    """Switch the shared evaluator to `metric_name` and score `predictions` with it."""
    return evaluator.setMetricName(metric_name).evaluate(predictions)


# Each call below runs its own evaluation over `predictions`.
accuracy = _evaluate_metric("accuracy")
f1 = _evaluate_metric("f1")
precision = _evaluate_metric("weightedPrecision")
recall = _evaluate_metric("weightedRecall")

print("RESULTS")
print(f"Accuracy:           {accuracy:.4f}")
print(f"F1 Score:           {f1:.4f}")
print(f"Weighted Precision: {precision:.4f}")
print(f"Weighted Recall:    {recall:.4f}")



RESULTS
Accuracy:           0.8299
F1 Score:           0.7851
Weighted Precision: 0.7905
Weighted Recall:    0.8299


                                                                                

In [26]:
# Count every (true label, predicted label) pair; sorted so rows read label by label.
confusion_matrix = (
    predictions
    .groupBy("label", "prediction")
    .count()
    .orderBy("label", "prediction")
)
print("Confusion Matrix:")
confusion_matrix.show(n=25)

Confusion Matrix:




+-----+----------+-------+
|label|prediction|  count|
+-----+----------+-------+
|  0.0|       0.0| 435482|
|  0.0|       1.0|    433|
|  0.0|       2.0| 639963|
|  1.0|       0.0|  71746|
|  1.0|       1.0|    840|
|  1.0|       2.0| 418308|
|  2.0|       0.0|  66581|
|  2.0|       1.0|   1139|
|  2.0|       2.0|5409314|
+-----+----------+-------+



                                                                                

In [27]:
from pyspark.sql.functions import sum as spark_sum, when

# 1 when the prediction equals the true label, 0 otherwise.
is_correct = when(col("label") == col("prediction"), 1).otherwise(0)

# Grouped by TRUE label, so "accuracy" is the share of each class's rows that
# were predicted correctly.
per_class = (
    predictions
    .groupBy("label")
    .agg(
        count("*").alias("total"),
        spark_sum(is_correct).alias("correct"),
    )
    .withColumn("accuracy", col("correct") / col("total"))
)
per_class.orderBy("label").show()

                                                                                

+-----+-------+-------+--------------------+
|label|  total|correct|            accuracy|
+-----+-------+-------+--------------------+
|  0.0|1075878| 435482| 0.40476894220348403|
|  1.0| 490894|    840|0.001711163713551...|
|  2.0|5477034|5409314|  0.9876356436713739|
+-----+-------+-------+--------------------+



## 9. Save Model


In [28]:
# Destination for the fitted pipeline, under the project's models directory.
MODEL_DIR = ROOT_DIR.joinpath("models", "spark_svm_classifier")

# overwrite() replaces any model previously saved at this location.
model_writer = model.write().overwrite()
model_writer.save(str(MODEL_DIR))

print(f"âœ“ Model saved to {MODEL_DIR}")

                                                                                

âœ“ Model saved to /Users/olehyaiechnyk/PycharmProjects/amazon-reviews-analysis/models/spark_svm_classifier


## 10. Quick Inference Test


In [29]:
from pyspark.ml import PipelineModel

# Reload the pipeline from disk so the check exercises the saved artifact,
# not the in-memory `model`.
loaded_model = PipelineModel.load(str(MODEL_DIR))

# Three hand-written reviews, one per row of a single-column DataFrame.
sample_texts = [
    "This product is amazing! Best purchase I've ever made.",
    "Terrible quality, broke after one day. Don't buy!",
    "It's okay, nothing special but does the job.",
]
sample_rows = [(review_text,) for review_text in sample_texts]
sample_data = spark.createDataFrame(sample_rows, [TEXT_COL])

sample_predictions = loaded_model.transform(sample_data)

print("Sample Predictions:")
sample_predictions.select(TEXT_COL, "prediction").show(truncate=60)

Sample Predictions:
+------------------------------------------------------+----------+
|                                                  text|prediction|
+------------------------------------------------------+----------+
|This product is amazing! Best purchase I've ever made.|       2.0|
|     Terrible quality, broke after one day. Don't buy!|       0.0|
|          It's okay, nothing special but does the job.|       2.0|
+------------------------------------------------------+----------+



In [30]:
# Shut down the Spark session created in section 2; cells that use `spark`
# cannot be re-run after this without building the session again.
spark.stop()