# Amazon Reviews 2023 - Category EDA

Reusable exploratory data analysis for any category. Update `CATEGORY_NAME` variable.

Analyzes: schema, data quality (nulls, duplicates, outliers), statistics, distributions.


## 1. Import Libraries


In [1]:
import os
from pathlib import Path
import warnings

import rootutils


rootutils.setup_root(Path.cwd(), indicator=".project-root", pythonpath=True)

ROOT_DIR = Path(os.environ.get("PROJECT_ROOT", Path.cwd()))

REVIEWS_PATH = ROOT_DIR / "data/raw/review_categories/Musical_Instruments.jsonl"
METADATA_PATH = ROOT_DIR / "data/raw/meta_categories/meta_Musical_Instruments.jsonl"

warnings.filterwarnings("ignore")

## Initialize Spark


In [2]:
from amazon_reviews_analysis.utils import build_spark


spark = build_spark()

print("✓ Spark Session created successfully!")
print(f"Spark Version: {spark.version}")
print(f"Spark App Name: {spark.sparkContext.appName}")
print(f"Spark Master: {spark.sparkContext.master}")
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/01 18:11:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/01 18:11:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


✓ Spark Session created successfully!
Spark Version: 4.0.1
Spark App Name: AmazonReviews
Spark Master: local[*]
Spark UI: http://IdeaPad-Pro-5-14AHP9:4041


---

# PART A: METADATA

## Load Metadata


In [3]:
from amazon_reviews_analysis.utils import load_metadata


print(f"📂 Metadata: {METADATA_PATH}")

metadata_df = load_metadata(spark, METADATA_PATH)
print(f"Total records: {metadata_df.count():,}")

📂 Metadata: /home/max/projects/nulp/amazon-reviews-analysis/data/raw/meta_categories/meta_Musical_Instruments.jsonl


[Stage 0:>                                                        (0 + 16) / 16]

Total records: 213,593


                                                                                

## Schema & Structure


In [4]:
print("SCHEMA")
print("=" * 80)
metadata_df.printSchema()

print(f"\nColumns: {len(metadata_df.columns)}")
for idx, col_name in enumerate(metadata_df.columns, 1):
    print(f"{idx:2d}. {col_name}")


SCHEMA
root
 |-- author: struct (nullable = true)
 |    |-- about: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- avatar: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- bought_together: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- details: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- features: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- images: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- hi_res: string (nullable = true)
 |    |    |-- large: string (nullable = true)
 |    |    |-- thumb: string (nullable = true)
 |    |    |-- variant: string (nullable = true)
 |-- main_category: string (

## Sample Data


In [5]:
metadata_df.show(5, truncate=50)

+------+--------------+---------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-------------------+-----------+-----+-------------+---------+--------+--------------------------------------------------+--------------------------------------------------+
|author|average_rating|bought_together|                                        categories|                                       description|                                           details|                                          features|                                            images|      main_category|parent_asin|price|rating_number|    store|subtitle|                                             title|                                            videos|
+------+--------------+---------------+-------------------------

---

# PART B: REVIEWS

## Load Reviews


In [6]:
from amazon_reviews_analysis.utils import load_metadata

print(f"📂 Metadata: {REVIEWS_PATH}")

reviews_df = load_metadata(spark, REVIEWS_PATH)
print(f"Total records: {reviews_df.count():,}")

📂 Metadata: /home/max/projects/nulp/amazon-reviews-analysis/data/raw/review_categories/Musical_Instruments.jsonl
Total records: 3,017,439


## Schema & Structure


In [7]:
print("SCHEMA")
print("=" * 80)
reviews_df.printSchema()

print(f"\nColumns: {len(reviews_df.columns)}")
for idx, col_name in enumerate(reviews_df.columns, 1):
    print(f"{idx:2d}. {col_name}")

SCHEMA
root
 |-- author: struct (nullable = true)
 |    |-- about: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- avatar: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- bought_together: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- details: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- features: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- images: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- hi_res: string (nullable = true)
 |    |    |-- large: string (nullable = true)
 |    |    |-- thumb: string (nullable = true)
 |    |    |-- variant: string (nullable = true)
 |-- main_category: string (

## Sample Data


In [8]:
reviews_df.show(5, truncate=50)

+------+--------------+---------------+----------+-----------+-------+--------+------+-------------+-----------+-----+-------------+-----+--------+------------------------------------------------+------+
|author|average_rating|bought_together|categories|description|details|features|images|main_category|parent_asin|price|rating_number|store|subtitle|                                           title|videos|
+------+--------------+---------------+----------+-----------+-------+--------+------+-------------+-----------+-----+-------------+-----+--------+------------------------------------------------+------+
|  NULL|          NULL|           NULL|      NULL|       NULL|   NULL|    NULL|    []|         NULL| B003LPTAYI| NULL|         NULL| NULL|    NULL|                                      Five Stars|  NULL|
|  NULL|          NULL|           NULL|      NULL|       NULL|   NULL|    NULL|    []|         NULL| B06XP6TDVY| NULL|         NULL| NULL|    NULL|nice sound.  pedal failed after less 