## 1. Setup & Imports


In [1]:
import os
from pathlib import Path
import warnings
import zipfile

import rootutils

# Let rootutils locate the project (marked by a `.project-root` file) starting
# from the current working directory.
# NOTE(review): `pythonpath=True` presumably makes the root importable so that
# `src.*` imports resolve - confirm against the rootutils documentation.
rootutils.setup_root(
    Path.cwd(),
    indicator=".project-root",
    pythonpath=True,
)

# Prefer the PROJECT_ROOT environment variable; use the working directory
# when it is not set.
ROOT_DIR = Path(os.getenv("PROJECT_ROOT", Path.cwd()))
print(f"Project root: {ROOT_DIR}")

# Silence every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

Project root: /Users/denys.koval/Labs/projects/amazon-reviews-analysis


## 2. Initialize Spark


In [None]:
from src.amazon_reviews_analysis.utils import build_spark

# Create the Spark session through the project helper, then report the
# session details.
spark = build_spark()
spark_context = spark.sparkContext

print("âœ“ Spark Session created successfully!")
print(f"Spark Version: {spark.version}")
print(f"Spark App Name: {spark_context.appName}")
print(f"Spark Master: {spark_context.master}")
print(f"Spark UI: {spark_context.uiWebUrl}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/02 23:22:04 WARN Utils: Your hostname, LT-W-7826.local, resolves to a loopback address: 127.0.0.1; using 192.168.31.164 instead (on interface en0)
25/12/02 23:22:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/02 23:22:04 WARN Utils: Your hostname, LT-W-7826.local, resolves to a loopback address: 127.0.0.1; using 192.168.31.164 instead (on interface en0)
25/12/02 23:22:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use

âœ“ Spark Session created successfully!
Spark Version: 4.0.1
Spark App Name: AmazonReviews
Spark Master: local[*]
Spark UI: http://192.168.31.164:4040


## 3. Load Data


In [None]:
# Locations of the zipped dataset and of its unpacked contents.
DATA_ZIP = ROOT_DIR.joinpath("data/classification/classification_reviews.zip")
EXTRACT_DIR = ROOT_DIR.joinpath("data/classification/extracted")

# Unpack the archive only once. Only the existence of the target directory is
# checked, not whether its contents are complete.
if EXTRACT_DIR.exists():
    print("âœ“ Data already extracted")
else:
    print(f"ðŸ“¦ Extracting {DATA_ZIP}...")
    with zipfile.ZipFile(DATA_ZIP, mode="r") as archive:
        archive.extractall(path=EXTRACT_DIR)
    print("âœ“ Extraction complete!")

print(f"\nData location: {EXTRACT_DIR}")

âœ“ Data already extracted

Data location: /Users/denys.koval/Labs/projects/amazon-reviews-analysis/data/classification/extracted


In [None]:
# Read the extracted directory as a parquet dataset.
df = spark.read.format("parquet").load(str(EXTRACT_DIR))

# Size, column names and schema of the raw dataset.
total_records = df.count()
print(f"Total records: {total_records:,}")
print(f"\nColumns: {df.columns}")
df.printSchema()

                                                                                

Total records: 35,202,489

Columns: ['rating', 'title', 'text', 'verified_purchase', 'parent_asin', 'category_label', 'label']
root
 |-- rating: double (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- category_label: string (nullable = true)
 |-- label: integer (nullable = true)



In [None]:
# Preview the first five rows, cutting long string values at 50 characters.
df.show(n=5, truncate=50)

+------+-------------------------------+--------------------------------------------------+-----------------+-----------+--------------+-----+
|rating|                          title|                                              text|verified_purchase|parent_asin|category_label|label|
+------+-------------------------------+--------------------------------------------------+-----------------+-----------+--------------+-----+
|   5.0|  Perfect for my granddaughters|               Just what my granddaughters wanted.|             true| B0771XZ99Y|        sports|    2|
|   3.0|             Pretty but Fragile|It makes an amusing popping sound but I thought...|             true| B0814BFFJH|        sports|    1|
|   5.0|                    Love these!|I had been searching for a while for some comfo...|             true| B08DXCXYK9|        sports|    2|
|   5.0|                     Five Stars|           They worked just as described. Thanks !|             true| B002QG1WJY|        sports|    2|

## 4. Data Exploration


In [None]:
# Class balance of the target column (label: 0=negative, 1=neutral, 2=positive).
label_counts = df.groupBy("label").count()
label_counts.orderBy("label").show()



+-----+--------+
|label|   count|
+-----+--------+
|    0| 5372399|
|    1| 2451737|
|    2|27378353|
+-----+--------+



                                                                                

In [None]:
from pyspark.sql.functions import col, count, when, isnan

# Names of the model input column and of the target column.
TEXT_COL = "text"
TARGET_COL = "label"  # 0=negative, 1=neutral, 2=positive

# Count missing values in the two columns the model depends on. `when` yields
# 1 only for null cells and `count` skips nulls, so each output column holds
# the number of null entries in the input column of the same name.
# `isnan` is not applied: the schema shows `text` is a string and `label` an
# integer, so NaN cannot occur in either.
null_counts = df.select(
    [count(when(col(name).isNull(), 1)).alias(name) for name in (TEXT_COL, TARGET_COL)]
)
null_counts.show()



+----+-----+
|text|label|
+----+-----+
|   0|    0|
+----+-----+



                                                                                

## 5. Data Preprocessing


In [None]:
from pyspark.sql.functions import col

# The target already holds 0/1/2, so the only preparation is a cast to double
# for MLlib. No rows are filtered here.
label_as_double = col(TARGET_COL).cast("double")
df_clean = df.withColumn("label", label_as_double)

clean_count = df_clean.count()
print(f"Clean dataset: {clean_count:,} records")
print("\nLabel distribution (0=negative, 1=neutral, 2=positive):")
df_clean.groupBy("label").count().orderBy("label").show()

Clean dataset: 35,202,489 records

Label distribution (0=negative, 1=neutral, 2=positive):
+-----+--------+
|label|   count|
+-----+--------+
|  0.0| 5372399|
|  1.0| 2451737|
|  2.0|27378353|
+-----+--------+

+-----+--------+
|label|   count|
+-----+--------+
|  0.0| 5372399|
|  1.0| 2451737|
|  2.0|27378353|
+-----+--------+



In [9]:
# Random 80/20 train/test split with a fixed seed.
train_df, test_df = df_clean.randomSplit(weights=[0.8, 0.2], seed=42)

train_count = train_df.count()
print(f"Training set: {train_count:,} records")
test_count = test_df.count()
print(f"Test set: {test_count:,} records")

                                                                                

Training set: 28,158,683 records




Test set: 7,043,806 records


                                                                                

## 6. Feature Engineering Pipeline


In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

# Text -> tokens -> tokens without stop words -> hashed term frequencies
# (10,000 buckets) -> TF-IDF vector in the "features" column.
tokenizer = Tokenizer(
    inputCol=TEXT_COL,
    outputCol="words",
)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)
hashing_tf = HashingTF(
    numFeatures=10000,
    inputCol="filtered_words",
    outputCol="raw_features",
)
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)

print("âœ“ Feature transformers defined")

âœ“ Feature transformers defined


25/12/02 23:23:05 WARN StopWordsRemover: Default locale set was [en_UA]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


## 7. Model Training


In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression trained on the TF-IDF vectors.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100, regParam=0.01)

# Chain the feature transformers and the classifier so they are fitted and
# applied as a single unit.
pipeline = Pipeline(stages=[tokenizer, remover, hashing_tf, idf, lr])

# Confirmation line shown in the recorded output of this cell; it mirrors the
# status messages printed by the other cells.
print("âœ“ Pipeline defined")
print(f"Stages: {[stage.__class__.__name__ for stage in pipeline.getStages()]}")

âœ“ Pipeline defined
Stages: ['Tokenizer', 'StopWordsRemover', 'HashingTF', 'IDF', 'LogisticRegression']


In [12]:
# Fit every pipeline stage on the training split.
print("ðŸš€ Training model...")
model = pipeline.fit(dataset=train_df)
print("âœ“ Training complete!")

ðŸš€ Training model...


25/12/02 23:35:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/12/02 23:35:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

âœ“ Training complete!


## 8. Model Evaluation


In [13]:
# Score the held-out split with the fitted pipeline.
predictions = model.transform(test_df)

# Show a few reviews next to their true label, predicted label and class
# probabilities.
preview_columns = [TEXT_COL, "label", "prediction", "probability"]
predictions.select(*preview_columns).show(n=10, truncate=50)

[Stage 147:>                                                        (0 + 1) / 1]

+--------------------------------------------------+-----+----------+--------------------------------------------------+
|                                              text|label|prediction|                                       probability|
+--------------------------------------------------+-----+----------+--------------------------------------------------+
|DO NOT BUY!!!! I thought I was buying just a se...|  0.0|       0.0|[0.7779339377289416,0.09088079519457837,0.13118...|
|Not quite two years since purchasing this tread...|  0.0|       0.0|[0.9300297060449276,0.03207961970025403,0.03789...|
|Product did not preform as advertised.<br />The...|  0.0|       2.0|[0.1691622521435565,0.10854435878478087,0.72229...|
|Ordered this DONUT BEACH TOWEL..... Received a ...|  0.0|       0.0|[0.9756340349218156,0.015514565947463744,0.0088...|
|I dislike that socks are expensive and there is...|  0.0|       0.0|[0.707144940457979,0.10893299883598173,0.183922...|
|Horrible!  Somehow, their defin

                                                                                

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# A single evaluator serves every metric: the metric name is overridden per
# call through the param map passed to `evaluate`, which replaces the four
# near-identical evaluator objects that differed only in `metricName`.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

metric_values = {
    metric_name: evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
}

# Keep the individual result names available to later cells.
accuracy = metric_values["accuracy"]
f1 = metric_values["f1"]
precision = metric_values["weightedPrecision"]
recall = metric_values["weightedRecall"]

print("RESULTS")
print(f"Accuracy:           {accuracy:.4f}")
print(f"F1 Score:           {f1:.4f}")
print(f"Weighted Precision: {precision:.4f}")
print(f"Weighted Recall:    {recall:.4f}")



RESULTS
Accuracy:           0.8461
F1 Score:           0.8158
Weighted Precision: 0.8119
Weighted Recall:    0.8461


                                                                                

In [15]:
# Count test rows for every (true label, predicted label) pair.
confusion_matrix = (
    predictions.groupBy("label", "prediction")
    .count()
    .orderBy("label", "prediction")
)

print("Confusion Matrix:")
confusion_matrix.show(n=25)

Confusion Matrix:




+-----+----------+-------+
|label|prediction|  count|
+-----+----------+-------+
|  0.0|       0.0| 589687|
|  0.0|       1.0|  15482|
|  0.0|       2.0| 470709|
|  1.0|       0.0| 104550|
|  1.0|       1.0|  25901|
|  1.0|       2.0| 360443|
|  2.0|       0.0| 105373|
|  2.0|       1.0|  27266|
|  2.0|       2.0|5344395|
+-----+----------+-------+



                                                                                

In [16]:
from pyspark.sql.functions import sum as spark_sum, when

# 1 when the prediction matches the true label, 0 otherwise.
is_correct = when(col("label") == col("prediction"), 1).otherwise(0)

# For each true label: number of rows, number predicted correctly, and their
# ratio (the share of that class the model gets right).
per_class = (
    predictions.groupBy("label")
    .agg(
        count("*").alias("total"),
        spark_sum(is_correct).alias("correct"),
    )
    .withColumn("accuracy", col("correct") / col("total"))
)
per_class.orderBy("label").show()



+-----+-------+-------+-------------------+
|label|  total|correct|           accuracy|
+-----+-------+-------+-------------------+
|  0.0|1075878| 589687|  0.548098390337938|
|  1.0| 490894|  25901|0.05276291826748748|
|  2.0|5477034|5344395| 0.9757826955246215|
+-----+-------+-------+-------------------+



                                                                                

## 9. Save Model


In [17]:
# Destination of the fitted pipeline inside the project tree.
MODEL_DIR = ROOT_DIR.joinpath("models", "spark_lr_classifier")

# Persist the model, replacing anything already stored at that path.
model_writer = model.write().overwrite()
model_writer.save(str(MODEL_DIR))

print(f"âœ“ Model saved to {MODEL_DIR}")

âœ“ Model saved to /Users/denys.koval/Labs/projects/amazon-reviews-analysis/models/spark_lr_classifier


## 10. Quick Inference Test


In [18]:
from pyspark.ml import PipelineModel

# Reload the pipeline from disk to confirm that the saved artefact is usable.
loaded_model = PipelineModel.load(str(MODEL_DIR))

# Three hand-written reviews, one per row, in a single text column.
sample_rows = [
    ("This product is amazing! Best purchase I've ever made.",),
    ("Terrible quality, broke after one day. Don't buy!",),
    ("It's okay, nothing special but does the job.",),
]
sample_data = spark.createDataFrame(sample_rows, schema=[TEXT_COL])

sample_predictions = loaded_model.transform(sample_data)

print("Sample Predictions:")
sample_predictions.select(TEXT_COL, "prediction").show(truncate=60)

25/12/02 23:58:04 WARN StopWordsRemover: Default locale set was [en_UA]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
25/12/02 23:58:05 WARN StopWordsRemover: Default locale set was [en_UA]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
25/12/02 23:58:05 WARN StopWordsRemover: Default locale set was [en_UA]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


Sample Predictions:


                                                                                

+------------------------------------------------------+----------+
|                                                  text|prediction|
+------------------------------------------------------+----------+
|This product is amazing! Best purchase I've ever made.|       2.0|
|     Terrible quality, broke after one day. Don't buy!|       0.0|
|          It's okay, nothing special but does the job.|       2.0|
+------------------------------------------------------+----------+



In [19]:
# Shut down the Spark session now that the notebook has finished; any later
# cell that needs `spark` must create a new session first.
spark.stop()