# Industrial and Scientific Category - Data Exploration

**Category**: Industrial and Scientific


## 1. Import Libraries


In [1]:
import os
from pathlib import Path
import warnings

import rootutils


rootutils.setup_root(Path.cwd(), indicator=".project-root", pythonpath=True)

ROOT_DIR = Path(os.environ.get("PROJECT_ROOT", Path.cwd()))
print(ROOT_DIR)


/Users/andriimyrosh/Projects/amazon-reviews-analysis


In [2]:
import os
from pathlib import Path
import warnings

import rootutils


rootutils.setup_root(Path.cwd(), indicator=".project-root", pythonpath=True)

ROOT_DIR = Path(os.environ.get("PROJECT_ROOT", Path.cwd()))

REVIEWS_PATH = ROOT_DIR / "data/raw/review_categories/Industrial_and_Scientific.jsonl"
METADATA_PATH = ROOT_DIR / "data/raw/meta_categories/meta_Industrial_and_Scientific.jsonl"

warnings.filterwarnings("ignore")


## Initialize Spark


In [3]:
import os
import subprocess

# CRITICAL: Set JAVA_HOME BEFORE importing Spark
# This must be done in the notebook, not just in terminal

# Try multiple methods to find Java
java_home = None

# Method 1: Check if already set
if os.environ.get('JAVA_HOME'):
    java_home = os.environ['JAVA_HOME']
    print(f"‚úì JAVA_HOME already set to: {java_home}")
else:
    # Method 2: Try Homebrew
    try:
        brew_prefix = subprocess.check_output(
            ['brew', '--prefix', 'openjdk@17'], 
            text=True,
            stderr=subprocess.DEVNULL
        ).strip()
        if os.path.exists(brew_prefix):
            java_home = brew_prefix
            os.environ['JAVA_HOME'] = java_home
            print(f"‚úì Found Java via Homebrew: {java_home}")
    except:
        pass
    
    # Method 3: Try common locations
    if not java_home:
        common_paths = [
            '/opt/homebrew/opt/openjdk@17',
            '/usr/local/opt/openjdk@17',
            '/Library/Java/JavaVirtualMachines/temurin-17.jdk/Contents/Home',
            '/Library/Java/JavaVirtualMachines/jdk-17.jdk/Contents/Home'
        ]
        for path in common_paths:
            if os.path.exists(path):
                java_home = path
                os.environ['JAVA_HOME'] = java_home
                print(f"‚úì Found Java at: {java_home}")
                break
    
    # Method 4: Use /usr/libexec/java_home (macOS)
    if not java_home:
        try:
            java_home = subprocess.check_output(
                ['/usr/libexec/java_home', '-v', '17'],
                text=True,
                stderr=subprocess.DEVNULL
            ).strip()
            os.environ['JAVA_HOME'] = java_home
            print(f"‚úì Found Java via java_home: {java_home}")
        except:
            pass

# Verify Java is accessible
if java_home:
    java_bin = os.path.join(java_home, 'bin', 'java')
    if os.path.exists(java_bin):
        try:
            result = subprocess.run(
                [java_bin, '-version'],
                capture_output=True,
                text=True,
                stderr=subprocess.STDOUT,
                timeout=5
            )
            print(f"‚úì Java verification successful")
            print(f"  Version: {result.stdout.split(chr(10))[0] if result.stdout else 'N/A'}")
        except Exception as e:
            print(f"‚ö†Ô∏è  Could not verify Java: {e}")
    else:
        print(f"‚ö†Ô∏è  Java binary not found at: {java_bin}")
else:
    print("‚ùå ERROR: Could not find Java installation!")
    print("Please install Java 17: brew install openjdk@17")
    print("Or set JAVA_HOME manually in this cell:")
    print("  os.environ['JAVA_HOME'] = '/path/to/java'")

# Also add to PATH
if java_home:
    java_bin_dir = os.path.join(java_home, 'bin')
    current_path = os.environ.get('PATH', '')
    if java_bin_dir not in current_path:
        os.environ['PATH'] = f"{java_bin_dir}:{current_path}"
        print(f"‚úì Added Java to PATH")

‚úì Found Java via Homebrew: /opt/homebrew/opt/openjdk@17
‚ö†Ô∏è  Could not verify Java: stdout and stderr arguments may not be used with capture_output.
‚úì Added Java to PATH


In [4]:
from amazon_reviews_analysis.utils import build_spark

spark = build_spark()

print("‚úì Spark Session created successfully!")
print(f"Spark Version: {spark.version}")
print(f"Spark App Name: {spark.sparkContext.appName}")
print(f"Spark Master: {spark.sparkContext.master}")
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/12 11:26:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


‚úì Spark Session created successfully!
Spark Version: 4.0.1
Spark App Name: AmazonReviews
Spark Master: local[*]
Spark UI: http://ip-192-168-0-101.eu-west-1.compute.internal:4040


---

# PART A: METADATA

## Load Metadata


In [5]:
from src.amazon_reviews_analysis.utils import load_metadata


print(f"üìÇ Metadata: {METADATA_PATH}")

metadata_df = load_metadata(spark, METADATA_PATH)
print(f"Total records: {metadata_df.count():,}")

üìÇ Metadata: /Users/andriimyrosh/Projects/amazon-reviews-analysis/data/raw/meta_categories/meta_Industrial_and_Scientific.jsonl


[Stage 0:>                                                        (0 + 12) / 12]

Total records: 427,564


                                                                                

## Schema & Structure


In [6]:
print("SCHEMA")
print("=" * 80)
metadata_df.printSchema()

print(f"\nColumns: {len(metadata_df.columns)}")
for idx, col_name in enumerate(metadata_df.columns, 1):
    print(f"{idx:2d}. {col_name}")


SCHEMA
root
 |-- author: struct (nullable = true)
 |    |-- about: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- avatar: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- bought_together: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- details: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- features: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- images: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- hi_res: string (nullable = true)
 |    |    |-- large: string (nullable = true)
 |    |    |-- thumb: string (nullable = true)
 |    |    |-- variant: string (nullable = true)
 |-- main_category: string (

## Sample Data


In [7]:
metadata_df.show(5, truncate=50)


+------+--------------+---------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+------------------------+-----------+-----+-------------+-----------------------------+--------+--------------------------------------------------+--------------------------------------------------+
|author|average_rating|bought_together|                                        categories|                                       description|                                           details|                                          features|                                            images|           main_category|parent_asin|price|rating_number|                        store|subtitle|                                             title|                                            videos|
+------+------

The table describes the range of products from Amazon (from the Industrial and Scientific category), including ratings, specifications, brands, images, etc.


---

# PART B: REVIEWS

## Load Reviews


In [8]:
from src.amazon_reviews_analysis.utils import load_reviews

print(f"üìÇ Reviews: {REVIEWS_PATH}")

reviews_df = load_reviews(spark, REVIEWS_PATH)
print(f"Total records: {reviews_df.count():,}")


üìÇ Reviews: /Users/andriimyrosh/Projects/amazon-reviews-analysis/data/raw/review_categories/Industrial_and_Scientific.jsonl




Total records: 5,183,005


                                                                                

## Schema & Structure


In [9]:
print("SCHEMA")
print("=" * 80)
reviews_df.printSchema()

print(f"\nColumns: {len(reviews_df.columns)}")
for idx, col_name in enumerate(reviews_df.columns, 1):
    print(f"{idx:2d}. {col_name}")


SCHEMA
root
 |-- asin: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- images: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- attachment_type: string (nullable = true)
 |    |    |-- large_image_url: string (nullable = true)
 |    |    |-- medium_image_url: string (nullable = true)
 |    |    |-- small_image_url: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)


Columns: 10
 1. asin
 2. helpful_vote
 3. images
 4. parent_asin
 5. rating
 6. text
 7. timestamp
 8. title
 9. user_id
10. verified_purchase


## Sample Data


In [10]:
reviews_df.show(5, truncate=50)


+----------+------------+------+-----------+------+--------------------------------------------------+-------------+----------------------------+----------------------------+-----------------+
|      asin|helpful_vote|images|parent_asin|rating|                                              text|    timestamp|                       title|                     user_id|verified_purchase|
+----------+------------+------+-----------+------+--------------------------------------------------+-------------+----------------------------+----------------------------+-----------------+
|B08C7HDF1F|           3|    []| B0BX2672L8|   5.0|These masks are great even though there is no '...|1676602453163|    Best value for the money|AG2L7H23R5LLKDKLBEF2Q3L2MVDA|             true|
|B07BT4YLHT|           1|    []| B07BT4YLHT|   5.0|These scissors are so good they got stolen by o...|1671844170434|                   TOO good.|AG2L7H23R5LLKDKLBEF2Q3L2MVDA|             true|
|B06XY65HCX|           0|    []| B0

The table contains information about user experiences, ratings, and comments about products. This data can be used to analyze sentiment, identify popular products, and correlate ratings with review text.


# DATA EXPLORATION


### MISSING VALUES


In [11]:
from pyspark.sql.functions import col, sum, when, size

print("=== METADATA: Missing Values Analysis ===")
total_meta = metadata_df.count()

# % missed data in each column
nulls = metadata_df.select([
    (sum(when(col(c).isNull(), 1).otherwise(0)) / total_meta).alias(c)
    for c in metadata_df.columns
])

nulls.show()

metadata_df.select([
    (sum(when(size(col(c)) == 0, 1).otherwise(0)) / total_meta).alias(f"{c}_empty")
    for c in ["categories", "images", "description", "features"]
]).show()


=== METADATA: Missing Values Analysis ===


                                                                                

+------------------+--------------+---------------+----------+-----------+-------+--------+------+-------------------+-----------+-----------------+-------------+--------------------+------------------+-----+------+
|            author|average_rating|bought_together|categories|description|details|features|images|      main_category|parent_asin|            price|rating_number|               store|          subtitle|title|videos|
+------------------+--------------+---------------+----------+-----------+-------+--------+------+-------------------+-----------+-----------------+-------------+--------------------+------------------+-----+------+
|0.9999181409098988|           0.0|            1.0|       0.0|        0.0|    0.0|     0.0|   0.0|0.04743851212917832|        0.0|0.478522513588609|          0.0|0.008073645115117269|0.9997988605214658|  0.0|   0.0|
+------------------+--------------+---------------+----------+-----------+-------+--------+------+-------------------+-----------+------

[Stage 14:>                                                       (0 + 12) / 12]

+-------------------+--------------------+------------------+-------------------+
|   categories_empty|        images_empty| description_empty|     features_empty|
+-------------------+--------------------+------------------+-------------------+
|0.08344247878680151|7.952025895538446E-5|0.3339991205994892|0.21496664826786166|
+-------------------+--------------------+------------------+-------------------+



                                                                                

METADATA: metadata_df table has a significant number of gaps, especially in columns related to content (description, features, categories) and price.


In [12]:
print("=== REVIEWS: Missing Values Analysis ===")

total_reviews = reviews_df.count()

nulls = reviews_df.select([
    (sum(when(col(c).isNull(), 1).otherwise(0)) / total_reviews).alias(c)
    for c in reviews_df.columns
])
nulls.show()


=== REVIEWS: Missing Values Analysis ===




+----+------------+------+-----------+------+----+---------+-----+-------+-----------------+
|asin|helpful_vote|images|parent_asin|rating|text|timestamp|title|user_id|verified_purchase|
+----+------------+------+-----------+------+----+---------+-----+-------+-----------------+
| 0.0|         0.0|   0.0|        0.0|   0.0| 0.0|      0.0|  0.0|    0.0|              0.0|
+----+------------+------+-----------+------+----+---------+-----+-------+-----------------+



                                                                                

REVIEWS: reviews_df has high data quality ‚Äî all fields are filled in. This means the data can be used for further analysis without any problems.


### BASIC STATISTICS


In [13]:
metadata_df.describe().show()


25/11/12 11:27:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-----------------+---------------+--------------+--------------------+------------------+-----------------+--------------------+--------------------+--------------------+
|summary|   average_rating|bought_together| main_category|         parent_asin|             price|    rating_number|               store|            subtitle|               title|
+-------+-----------------+---------------+--------------+--------------------+------------------+-----------------+--------------------+--------------------+--------------------+
|  count|           427564|              0|        407281|              427564|            222965|           427564|              424112|                  86|              427564|
|   mean|4.216573893031269|           NULL|          NULL|2.6673319940277777E9|60.794731433590826|95.00709835252734|4.880818106728346E10|                NULL|                 NaN|
| stddev|0.846767308326146|           NULL|          NULL|2.8000093148834596E9|471.50593680709363|87

                                                                                

METADATA: metadata_df shows that the products are mostly positively rated, but the spread in prices and number of ratings is large.
The price ranges indicate the presence of both budget and premium brands.


In [14]:
reviews_df.describe().show()




+-------+--------------------+-----------------+-------------------+------------------+--------------------+--------------------+-------+--------------------+
|summary|                asin|     helpful_vote|        parent_asin|            rating|                text|           timestamp|  title|             user_id|
+-------+--------------------+-----------------+-------------------+------------------+--------------------+--------------------+-------+--------------------+
|  count|             5183005|          5183005|            5183005|           5183005|             5183005|             5183005|5183005|             5183005|
|   mean|2.1785267420744963E9|0.932035180363515|2.142899837661592E9| 4.183227297677699|3.104128449519329...|1.564998718237724...|    NaN|                NULL|
| stddev| 2.224654697769768E9| 9.97075258843753| 2.21629437101025E9|1.3768442328925246|5.403323856320201E20| 8.31608136135859E10|    NaN|                NULL|
|    min|          0072823275|               -

                                                                                

REVIEWS: Most users leave positive reviews, but there are a number of low ratings. The review texts are of varying lengths, sometimes even emojis.


The average rating for both products and reviews is ‚âà 4, meaning positive ratings prevail.

The data varies by price, popularity, and publication period.


### EXPLORATION OF SOME COLUMNS


In [15]:
reviews_df.groupBy("rating").count().orderBy("rating").show()




+------+-------+
|rating|  count|
+------+-------+
|   1.0| 584133|
|   2.0| 234763|
|   3.0| 315792|
|   4.0| 560932|
|   5.0|3487385|
+------+-------+



                                                                                

In [16]:
from pyspark.sql.functions import col, sum as spark_sum
from pyspark.sql.window import Window

rating_counts = (
    reviews_df.groupBy("rating")
    .count()
    .orderBy("rating")
)

# Display rating distribution table
print("Rating Distribution:")
print("=" * 50)
rating_counts.show()

# Calculate percentages for data cleaning decisions
total_reviews = reviews_df.count()
print(f"\nRating Distribution (Percentages & Cumulative):")
print("=" * 50)
rating_pct = rating_counts.withColumn(
    "percentage", 
    (col("count") / total_reviews * 100).cast("decimal(5,2)")
).withColumn(
    "cumulative_pct",
    (spark_sum(col("count")).over(
        Window.orderBy("rating").rowsBetween(Window.unboundedPreceding, Window.currentRow)
    ) / total_reviews * 100).cast("decimal(5,2)")
)
rating_pct.show()

# Text-based visualization for quick assessment
print("\nRating Distribution (Visual - 1 block = 50k reviews):")
print("=" * 50)
for row in rating_counts.collect():
    rating = int(row['rating'])
    count = row['count']
    pct = (count / total_reviews * 100)
    bar = '‚ñà' * (count // 50000)  # Scale: 1 block = 50k reviews
    print(f"{rating}.0: {bar:20s} {count:>10,} ({pct:>5.1f}%)")

# Data cleaning insights
print("\nüìä Data Cleaning Insights:")
print("=" * 50)
low_ratings = rating_counts.filter(col("rating") <= 2.0).agg(spark_sum("count").alias("low_count")).collect()[0]["low_count"]
high_ratings = rating_counts.filter(col("rating") >= 4.0).agg(spark_sum("count").alias("high_count")).collect()[0]["high_count"]
print(f"Low ratings (1-2 stars): {low_ratings:,} ({(low_ratings/total_reviews*100):.1f}%)")
print(f"High ratings (4-5 stars): {high_ratings:,} ({(high_ratings/total_reviews*100):.1f}%)")
print(f"Rating bias: {(high_ratings/total_reviews*100):.1f}% are 4-5 stars (typical for review data)")
print(f"\nüí° Cleaning Decision: Keep all ratings as they represent genuine user feedback.")

Rating Distribution:


                                                                                

+------+-------+
|rating|  count|
+------+-------+
|   1.0| 584133|
|   2.0| 234763|
|   3.0| 315792|
|   4.0| 560932|
|   5.0|3487385|
+------+-------+



25/11/12 11:27:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/12 11:27:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/12 11:27:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.



Rating Distribution (Percentages & Cumulative):


25/11/12 11:27:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/12 11:27:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/12 11:27:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/12 11:27:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/12 11:27:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/12 11:27:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+-------+----------+--------------+
|rating|  count|percentage|cumulative_pct|
+------+-------+----------+--------------+
|   1.0| 584133|     11.27|         11.27|
|   2.0| 234763|      4.53|         15.80|
|   3.0| 315792|      6.09|         21.89|
|   4.0| 560932|     10.82|         32.71|
|   5.0|3487385|     67.29|        100.00|
+------+-------+----------+--------------+


Rating Distribution (Visual - 1 block = 50k reviews):


                                                                                

1.0: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà             584,133 ( 11.3%)
2.0: ‚ñà‚ñà‚ñà‚ñà                    234,763 (  4.5%)
3.0: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                  315,792 (  6.1%)
4.0: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà             560,932 ( 10.8%)
5.0: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  3,487,385 ( 67.3%)

üìä Data Cleaning Insights:




Low ratings (1-2 stars): 818,896 (15.8%)
High ratings (4-5 stars): 4,048,317 (78.1%)
Rating bias: 78.1% are 4-5 stars (typical for review data)

üí° Cleaning Decision: Keep all ratings as they represent genuine user feedback.


                                                                                

In [17]:
reviews_df.agg({'rating': 'avg'}).show()



+-----------------+
|      avg(rating)|
+-----------------+
|4.183227297677699|
+-----------------+



                                                                                

Most users give the highest rating ‚Äî 5.0.

This indicates either general user satisfaction or a bias in ratings (people are more likely to leave reviews when they are very satisfied).

The fewest reviews have a rating of 2.0. Low ratings are less common, which is also a typical effect ‚Äî dissatisfied users sometimes do not leave a review.

The distribution is asymmetric ‚Äî clearly biased towards high ratings.


In [18]:
reviews_df.groupBy("verified_purchase").count().show()




+-----------------+-------+
|verified_purchase|  count|
+-----------------+-------+
|             true|4909835|
|            false| 273170|
+-----------------+-------+



                                                                                

Most reviews (‚âà90%) are verified by purchase, i.e. left by users who actually purchased the product.

Only about 10% are unverified, which could mean:

1. the user left the review without purchasing through another platform;

2. or potentially less reliable reviews (bots, advertising, subjective opinions).


In [19]:
from pyspark.sql.functions import length

reviews_df.select(length("text").alias("text_length")).describe().show() # description of review text lengths




+-------+------------------+
|summary|       text_length|
+-------+------------------+
|  count|           5183005|
|   mean|176.26756852443708|
| stddev| 283.7600258038013|
|    min|                 0|
|    max|             33276|
+-------+------------------+



                                                                                

Average length ~173 characters ‚Üí most users leave short, concise comments

Large standard deviation (~247) ‚Üí text length varies greatly - there are both very short and extremely long reviews.


In [20]:
reviews_df.groupBy("user_id", "asin", "timestamp").count().orderBy("count", ascending=False).show(5) # potential duplicate or suspicious entries


[Stage 81:>                                                       (0 + 12) / 13]

+--------------------+----------+-------------+-----+
|             user_id|      asin|    timestamp|count|
+--------------------+----------+-------------+-----+
|AGALPU5ARZEK75CGK...|B07SYB2BFW|1616840817793|   27|
|AHK2K4QJG2LOFDO24...|B07S8K4F5J|1642106258544|   11|
|AHJETSJDQNQDIRL66...|B01LXYM03A|1551424666757|   10|
|AG7QXEUHBSFEUUW46...|B01C5QR4HS|1490123759000|   10|
|AFFKTOSWUZCSSHHMT...|B076KNYCZ6|1626279666858|   10|
+--------------------+----------+-------------+-----+
only showing top 5 rows


                                                                                

The same user left several (up to 10) identical entries about the same product at the same time.


In [21]:
from pyspark.sql.functions import count, col, sum as spark_sum

# Detailed duplicate analysis for cleaning decisions
duplicates = reviews_df.groupBy("user_id", "asin", "timestamp").count().filter("count > 1")
duplicate_count = duplicates.count()

print(f"Duplicate groups found: {duplicate_count:,}")

# Show examples of duplicates
if duplicate_count > 0:
    print("\nSample duplicate entries:")
    print("=" * 60)
    duplicates.orderBy(col("count").desc()).show(10, truncate=False)
    
    # Analyze duplicate patterns
    max_duplicates = duplicates.agg({"count": "max"}).collect()[0]["max(count)"]
    avg_duplicates = duplicates.agg({"count": "avg"}).collect()[0]["avg(count)"]
    print(f"\nDuplicate Statistics:")
    print(f"  Maximum duplicates in one group: {max_duplicates}")
    print(f"  Average duplicates per group: {avg_duplicates:.2f}")
    
    # Estimate total duplicate records
    total_duplicate_records = duplicates.agg(spark_sum("count")).collect()[0]["sum(count)"]
    total_reviews = reviews_df.count()
    print(f"  Estimated duplicate records: {total_duplicate_records:,}")
    print(f"  Percentage of data that are duplicates: {(total_duplicate_records/total_reviews*100):.2f}%")
    print(f"\nüí° Cleaning Decision: Remove duplicates using window function (keep first occurrence).")
else:
    print("‚úì No duplicates found!")

                                                                                

Duplicate groups found: 45,098

Sample duplicate entries:


                                                                                

+----------------------------+----------+-------------+-----+
|user_id                     |asin      |timestamp    |count|
+----------------------------+----------+-------------+-----+
|AGALPU5ARZEK75CGKYELK232AHGA|B07SYB2BFW|1616840817793|27   |
|AHK2K4QJG2LOFDO24VTVMLH3SDKA|B07S8K4F5J|1642106258544|11   |
|AFFKTOSWUZCSSHHMTMYPMQJTUAKQ|B076KNYCZ6|1626279666858|10   |
|AG7QXEUHBSFEUUW46T7ECEDJSNHQ|B01C5QR4HS|1490123759000|10   |
|AHJETSJDQNQDIRL66CV5LO26UNUQ|B01LXYM03A|1551424666757|10   |
|AEP3ESK7PUBKZGHW3HZKAWHQWW5A|B01AGQ3RVQ|1617291783217|9    |
|AFI6SGYRSOTESKN26JO53BGBZB6A|B0B1HVR5LL|1670853016471|9    |
|AGRTAIXLOKGMMKT77XPBHUQ3Y33A|B00BUF0YLO|1603564758066|9    |
|AHYVEMREIHVJYUXFD5HPAJ5FWMLA|B0B19W7SY8|1686431145567|9    |
|AEAP7AJTQY65MVRHKPP2MMUYDU6Q|B07MQBQ7Y5|1681771589179|9    |
+----------------------------+----------+-------------+-----+
only showing top 10 rows


                                                                                


Duplicate Statistics:
  Maximum duplicates in one group: 27
  Average duplicates per group: 2.18




  Estimated duplicate records: 98,090
  Percentage of data that are duplicates: 1.89%

üí° Cleaning Decision: Remove duplicates using window function (keep first occurrence).


                                                                                

In [22]:
from pyspark.sql.functions import approx_count_distinct, count as spark_count

# Unique counts for data quality assessment
unique_stats = reviews_df.select(
    approx_count_distinct("asin").alias("unique_products"),
    approx_count_distinct("user_id").alias("unique_users"),
    approx_count_distinct("parent_asin").alias("unique_parent_products")
).collect()[0]

total_reviews = reviews_df.count()

print("Unique Entity Counts:")
print("=" * 50)
print(f"Unique products (asin):        {unique_stats['unique_products']:,}")
print(f"Unique parent products:       {unique_stats['unique_parent_products']:,}")
print(f"Unique users:                  {unique_stats['unique_users']:,}")
print(f"Total reviews:                 {total_reviews:,}")
print()
print("Average Statistics:")
print("=" * 50)
print(f"Reviews per product:            {(total_reviews/unique_stats['unique_products']):.2f}")
print(f"Reviews per parent product:     {(total_reviews/unique_stats['unique_parent_products']):.2f}")
print(f"Reviews per user:               {(total_reviews/unique_stats['unique_users']):.2f}")
print(f"\nüí° Cleaning Decision: These ratios help identify potential data quality issues.")




Unique Entity Counts:
Unique products (asin):        566,640
Unique parent products:       433,651
Unique users:                  3,603,690
Total reviews:                 5,183,005

Average Statistics:
Reviews per product:            9.15
Reviews per parent product:     11.95
Reviews per user:               1.44

üí° Cleaning Decision: These ratios help identify potential data quality issues.


                                                                                

If we compare it to the total number of reviews, we can conclude the average reviews per user and per product.


In [23]:
print("=== Price Analysis ===")
# Note: price is stored as string, need to cast for ordering
# Use try_cast to handle invalid price values (returns NULL instead of error)
metadata_df.filter(col("price").isNotNull()) \
    .withColumn("price_double", col("price").try_cast("double")) \
    .filter(col("price_double").isNotNull()) \
    .select("price", "title", "average_rating") \
    .orderBy(col("price_double").desc()) \
    .show(10, truncate=50)


from pyspark.sql.functions import when
# Cast price to double before comparison (price is stored as string)
# Use try_cast to handle invalid price values gracefully
price_ranges = metadata_df.filter(col("price").isNotNull()) \
    .withColumn("price_double", col("price").try_cast("double")) \
    .filter(col("price_double").isNotNull()) \
    .withColumn("price_range",
        when(col("price_double") < 10, "Budget (<$10)")
        .when(col("price_double") < 25, "Mid ($10-25)")
        .when(col("price_double") < 50, "Premium ($25-50)")
        .otherwise("Luxury (>$50)")
    ) \
    .groupBy("price_range").count() \
    .orderBy("count", ascending=False)

print("\nPrice Range Distribution:")
price_ranges.show()

=== Price Analysis ===


                                                                                

+---------+--------------------------------------------------+--------------+
|    price|                                             title|average_rating|
+---------+--------------------------------------------------+--------------+
|129266.64|Senco 08S250W592 2-1/2" x #8 Duraspin Collated ...|           3.5|
|124024.12|Merit Glue Bond Refill for 350-RP UNSCORED, Alu...|           4.0|
|  49999.0|JG MAKER Industrial SLA 3D Printer JG-A600 Larg...|           5.0|
| 32999.99|On/Go One COVID-19 Rapid Antigen Home Test, 1 P...|           5.0|
| 30769.65|                 Weight Set(20), 50 kg-1 g, ASTM 1|           5.0|
| 25666.25|     SPX 2 JAWPULLER, 100 TON, Universal (PH1002J)|           5.0|
| 21385.47|GOLEHS Osmium (Os) Density Cube, Laboratory-Gra...|           1.0|
| 15334.52|Starrett 123Z-72 Vernier Caliper, Steel, Nib St...|           3.0|
| 15183.99|OTC 1854 100-Ton Capacity Shop Press with Elect...|           5.0|
| 14530.99|Fluke Networks DSX2-8000 CableAnalyzer Copper C...|  

[Stage 119:>                                                      (0 + 12) / 12]

+----------------+-----+
|     price_range|count|
+----------------+-----+
|    Mid ($10-25)|83634|
|   Budget (<$10)|54165|
|   Luxury (>$50)|44951|
|Premium ($25-50)|40193|
+----------------+-----+



                                                                                

The data covers a very wide price range, from budget to luxury goods.

The most popular price segment varies by category.

A significant proportion of products across different price ranges.


In [24]:
from pyspark.sql.functions import avg, count

# Cast price to double before comparison (price is stored as string)
# Use try_cast to handle invalid price values gracefully
price_stats = metadata_df.filter(col("price").isNotNull()) \
    .withColumn("price_double", col("price").try_cast("double")) \
    .filter(col("price_double").isNotNull()) \
    .withColumn("price_range",
        when(col("price_double") < 10, "Budget (<$10)")
        .when(col("price_double") < 25, "Mid ($10-25)")
        .when(col("price_double") < 50, "Premium ($25-50)")
        .otherwise("Luxury (>$50)")
    ) \
    .groupBy("price_range") \
    .agg(
        count("*").alias("product_count"),
        avg("average_rating").alias("avg_rating")
    ) \
    .orderBy("avg_rating", ascending=False)

print("\nPrice Range Statistics (Average Rating by Price Range):")
price_stats.show()


Price Range Statistics (Average Rating by Price Range):


[Stage 122:>                                                      (0 + 12) / 12]

+----------------+-------------+------------------+
|     price_range|product_count|        avg_rating|
+----------------+-------------+------------------+
|Premium ($25-50)|        40193|4.3402831338790415|
|    Mid ($10-25)|        83634|4.3276215414783366|
|   Budget (<$10)|        54165| 4.280635096464511|
|   Luxury (>$50)|        44951| 4.266677048341532|
+----------------+-------------+------------------+



                                                                                

In [25]:
from pyspark.sql.functions import corr, col

# Calculate correlations using Spark SQL (for data cleaning decisions)
print("Correlation Analysis (for data quality assessment):")
print("=" * 60)

# Filter out nulls and cast price to double (price is stored as string)
metadata_with_numeric = metadata_df.filter(
    col("average_rating").isNotNull() &
    col("rating_number").isNotNull() &
    col("price").isNotNull()
).withColumn("price_double", col("price").try_cast("double"))

# Filter out any rows where price casting failed (invalid price values)
metadata_with_numeric = metadata_with_numeric.filter(col("price_double").isNotNull())

total_with_all_numeric = metadata_with_numeric.count()
total_metadata = metadata_df.count()
print(f"Products with all numeric fields: {total_with_all_numeric:,} / {total_metadata:,} ({(total_with_all_numeric/total_metadata*100):.1f}%)")
print()

# Calculate pairwise correlations (using the cast price column)
correlations = metadata_with_numeric.select(
    corr("average_rating", "rating_number").alias("avg_rating_vs_rating_number"),
    corr("average_rating", "price_double").alias("avg_rating_vs_price"),
    corr("rating_number", "price_double").alias("rating_number_vs_price")
).collect()[0]

print("Correlation Matrix:")
print("                  | Rating Number | Price")
print("------------------|---------------|--------")
print(f"Average Rating    | {correlations['avg_rating_vs_rating_number']:>13.4f} | {correlations['avg_rating_vs_price']:>6.4f}")
print(f"Rating Number     | {1.0:>13.4f} | {correlations['rating_number_vs_price']:>6.4f}")
print(f"Price             | {correlations['rating_number_vs_price']:>13.4f} | {1.0:>6.4f}")

print("\nüìä Data Cleaning Insights:")
print("=" * 60)
print(f"‚Ä¢ Average Rating vs Rating Number: {correlations['avg_rating_vs_rating_number']:.4f}")
print("  ‚Üí Low correlation suggests ratings are independent of review volume")
print(f"‚Ä¢ Average Rating vs Price: {correlations['avg_rating_vs_price']:.4f}")
print("  ‚Üí Low correlation suggests price doesn't strongly predict rating")
print(f"‚Ä¢ Rating Number vs Price: {correlations['rating_number_vs_price']:.4f}")
print("  ‚Üí Indicates relationship between product popularity and price")
print(f"\nüí° Cleaning Decision: All correlations are reasonable. No data quality issues detected.")


Correlation Analysis (for data quality assessment):


                                                                                

Products with all numeric fields: 222,943 / 427,564 (52.1%)

Correlation Matrix:
                  | Rating Number | Price
------------------|---------------|--------
Average Rating    |        0.0317 | -0.0202
Rating Number     |        1.0000 | -0.0078
Price             |       -0.0078 | 1.0000

üìä Data Cleaning Insights:
‚Ä¢ Average Rating vs Rating Number: 0.0317
  ‚Üí Low correlation suggests ratings are independent of review volume
‚Ä¢ Average Rating vs Price: -0.0202
  ‚Üí Low correlation suggests price doesn't strongly predict rating
‚Ä¢ Rating Number vs Price: -0.0078
  ‚Üí Indicates relationship between product popularity and price

üí° Cleaning Decision: All correlations are reasonable. No data quality issues detected.


                                                                                

The average rating is almost independent of price and number of reviews.

The conclusions of the previous analysis on price_range are confirmed: more expensive products receive slightly better ratings, but the overall dependence is weak.


## CONCLUSIONS

### Data quality and completeness

1. There are many missing or empty values ‚Äã‚Äãin metadata_df.

2. There are no missing values ‚Äã‚Äãin reviews_df, the data is more structured and complete.

### Basic statistics on goods

1. Average product rating varies by category.

2. Price ranges vary significantly.

3. main_category ‚Äî Industrial and Scientific

4. Data covers a wide range of prices and ratings.

### Reviews analysis

1. Number of reviews varies by category.

2. Most reviews are 5 stars, many are positive, fewer are negative (1‚Äì2 stars).

3. Verified purchase: ~90% confirmed by purchase ‚Üí most reviews are reliable.

4. Text length: average varies, maximum can be very long ‚Üí high variability, there are short and extremely long reviews.

5. Repeated entries by the same user for the same product at the same time may be detected ‚Üí duplicate entries need to be cleaned.

6. Reviews are generally reliable, positive, but there are anomalies (repeated entries, empty texts) that should be removed before in-depth analysis.

### Price analysis

1. More expensive products may receive slightly higher ratings, but the overall relationship is weak.

2. The bulk of products vary by category.

3. The correlation between price and rating is typically low ‚Üí price is not a strong factor for evaluation.


# DATA PREPARATION


In [26]:
print("=" * 70)
print("DATA CLEANING - REVIEWS")
print("=" * 70)

print("\n1. Checking for duplicates...")
original_review_count = reviews_df.count()
print(f"Original count: {original_review_count:,}")

duplicates = reviews_df.groupBy("user_id", "asin", "timestamp") \
    .count() \
    .filter(col("count") > 1) \
    .orderBy(col("count").desc())

duplicate_groups = duplicates.count()
print(f"Duplicate groups found: {duplicate_groups:,}")

if duplicate_groups > 0:
    # Show top duplicate patterns
    print("\nTop 10 duplicate patterns:")
    duplicates.show(10, truncate=False)
    
    # Calculate impact
    total_duplicate_records = duplicates.agg(spark_sum("count")).collect()[0]["sum(count)"]
    print(f"Total duplicate records: {total_duplicate_records:,}")
    print(f"Impact: {(total_duplicate_records/original_review_count*100):.2f}% of data")


DATA CLEANING - REVIEWS

1. Checking for duplicates...


                                                                                

Original count: 5,183,005


                                                                                

Duplicate groups found: 45,098

Top 10 duplicate patterns:


                                                                                

+----------------------------+----------+-------------+-----+
|user_id                     |asin      |timestamp    |count|
+----------------------------+----------+-------------+-----+
|AGALPU5ARZEK75CGKYELK232AHGA|B07SYB2BFW|1616840817793|27   |
|AHK2K4QJG2LOFDO24VTVMLH3SDKA|B07S8K4F5J|1642106258544|11   |
|AHJETSJDQNQDIRL66CV5LO26UNUQ|B01LXYM03A|1551424666757|10   |
|AG7QXEUHBSFEUUW46T7ECEDJSNHQ|B01C5QR4HS|1490123759000|10   |
|AFFKTOSWUZCSSHHMTMYPMQJTUAKQ|B076KNYCZ6|1626279666858|10   |
|AHYVEMREIHVJYUXFD5HPAJ5FWMLA|B0BLYW7YKH|1693071873887|9    |
|AEP3ESK7PUBKZGHW3HZKAWHQWW5A|B01AGQ3RVQ|1617291783217|9    |
|AGRTAIXLOKGMMKT77XPBHUQ3Y33A|B00BUF0YLO|1603564758066|9    |
|AFI6SGYRSOTESKN26JO53BGBZB6A|B0B1HVR5LL|1670853016471|9    |
|AEAP7AJTQY65MVRHKPP2MMUYDU6Q|B07MQBQ7Y5|1681771589179|9    |
+----------------------------+----------+-------------+-----+
only showing top 10 rows




Total duplicate records: 98,090
Impact: 1.89% of data


                                                                                

In [27]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

print("\n2. Removing duplicates (keeping first occurrence)...")

window_spec = Window.partitionBy("user_id", "asin", "timestamp") \
                    .orderBy("rating")

reviews_cleaned = reviews_df.withColumn(
    "row_num",
    row_number().over(window_spec)
).filter(col("row_num") == 1).drop("row_num")

cleaned_count = reviews_cleaned.count()
removed = original_review_count - cleaned_count
print(f"After removing duplicates: {cleaned_count:,}")
print(f"Removed: {removed:,} duplicate records ({(removed/original_review_count*100):.2f}%)")



2. Removing duplicates (keeping first occurrence)...


[Stage 154:>                                                      (0 + 12) / 13]

After removing duplicates: 5,130,013
Removed: 52,992 duplicate records (1.02%)


                                                                                

In [28]:
from pyspark.sql.functions import col, length, trim

print("\n3. Checking for empty/invalid reviews...")

empty_text = reviews_cleaned.filter(
    (col("text").isNull()) |
    (length(trim(col("text"))) == 0)
).count()

print(f"Reviews with empty text: {empty_text:,} ({(empty_text/cleaned_count*100):.2f}%)")

# Show sample empty reviews before removing
if empty_text > 0:
    print("\nSample empty reviews:")
    reviews_cleaned.filter(
        (col("text").isNull()) |
        (length(trim(col("text"))) == 0)
    ).select("asin", "rating", "title", "text").show(5, truncate=False)

reviews_cleaned = reviews_cleaned.filter(
    (col("text").isNotNull()) &
    (length(trim(col("text"))) > 0)
)

final_review_count = reviews_cleaned.count()
removed_empty = cleaned_count - final_review_count
print(f"\nAfter removing empty texts: {final_review_count:,}")
print(f"Removed: {removed_empty:,} empty reviews ({(removed_empty/cleaned_count*100):.2f}%)")



3. Checking for empty/invalid reviews...


                                                                                

Reviews with empty text: 5,086 (0.10%)

Sample empty reviews:


                                                                                

+----------+------+---------------------------------------------------------------------------+----+
|asin      |rating|title                                                                      |text|
+----------+------+---------------------------------------------------------------------------+----+
|B07PVVJWHK|1.0   |Used                                                                       |    |
|B07TCD93NK|5.0   |Perfect for my classroom!                                                  |    |
|B00AWRR662|5.0   |Was vey easy to use and very strong                                        |    |
|B094J2B328|5.0   |This alone helped me a lot during my move out. Thanks                      |    |
|B09B9F4KVZ|1.0   |This is the 2nd pair of shears that broke from my pack in not even a month.|    |
+----------+------+---------------------------------------------------------------------------+----+
only showing top 5 rows





After removing empty texts: 5,124,927
Removed: 5,086 empty reviews (0.10%)


                                                                                

In [29]:
print("\n4. Checking rating validity...")

invalid_ratings = reviews_cleaned.filter(
    (col("rating") < 1.0) |
    (col("rating") > 5.0)
).count()

print(f"Invalid ratings (outside 1-5 range): {invalid_ratings}")

if invalid_ratings > 0:
    print("\n‚ö†Ô∏è  WARNING: Found invalid ratings!")
    reviews_cleaned.filter(
        (col("rating") < 1.0) |
        (col("rating") > 5.0)
    ).select("asin", "rating", "title").show(10, truncate=False)
    print("\nüí° Cleaning Decision: These should be removed or corrected.")
else:
    print("‚úì All ratings are within valid range (1.0 - 5.0)")



4. Checking rating validity...




Invalid ratings (outside 1-5 range): 0
‚úì All ratings are within valid range (1.0 - 5.0)


                                                                                

In [30]:
print("\n" + "=" * 70)
print("DATA CLEANING - METADATA")
print("=" * 70)

print("\n1. Checking metadata duplicates...")
original_meta_count = metadata_df.count()
print(f"Original count: {original_meta_count:,}")

metadata_cleaned = metadata_df.dropDuplicates(["parent_asin"])

cleaned_meta_count = metadata_cleaned.count()
removed_meta = original_meta_count - cleaned_meta_count
print(f"After removing duplicates: {cleaned_meta_count:,}")
print(f"Removed: {removed_meta:,} duplicate products ({(removed_meta/original_meta_count*100):.2f}%)")



DATA CLEANING - METADATA

1. Checking metadata duplicates...


                                                                                

Original count: 427,564


[Stage 182:>                                                      (0 + 12) / 12]

After removing duplicates: 427,564
Removed: 0 duplicate products (0.00%)


                                                                                

In [31]:
print("\n2. Checking products without title...")

no_title = metadata_cleaned.filter(
    (col("title").isNull()) |
    (length(trim(col("title"))) == 0)
).count()

print(f"Products without title: {no_title} ({(no_title/cleaned_meta_count*100):.2f}%)")

if no_title > 0:
    print("\nSample products without title:")
    metadata_cleaned.filter(
        (col("title").isNull()) |
        (length(trim(col("title"))) == 0)
    ).select("parent_asin", "title", "average_rating").show(5, truncate=False)
    print("\nüí° Cleaning Decision: Remove products without titles as they're not useful.")
else:
    print("‚úì All products have titles")



2. Checking products without title...


                                                                                

Products without title: 25 (0.01%)

Sample products without title:


[Stage 194:>                                                      (0 + 12) / 12]

+-----------+-----+--------------+
|parent_asin|title|average_rating|
+-----------+-----+--------------+
|B087722LS3 |     |3.6           |
|B003XEY1DW |     |5.0           |
|B01BFJQS64 |     |4.4           |
|B0073B6RCY |     |5.0           |
|B00A8OQ71E |     |4.1           |
+-----------+-----+--------------+
only showing top 5 rows

üí° Cleaning Decision: Remove products without titles as they're not useful.


                                                                                

In [32]:
metadata_cleaned = metadata_cleaned.filter(
    (col("title").isNotNull()) &
    (length(trim(col("title"))) > 0)
)


In [33]:
print("\n3. Checking price anomalies...")

# Check for very low prices (potential data quality issues)
very_low_prices = metadata_cleaned.filter(
    col("price").isNotNull() & 
    (col("price").try_cast("double") < 0.1)
).count()

print(f"Products with price < $0.10: {very_low_prices}")

# Show price distribution
price_stats = metadata_cleaned.filter(col("price").isNotNull()) \
    .select(
        col("price").try_cast("double").alias("price_num")
    ) \
    .describe()

print("\nPrice Statistics:")
price_stats.show()

# Show lowest and highest prices
print("\nLowest prices (potential anomalies):")
metadata_cleaned.filter(col("price").isNotNull()) \
    .select("price", "title", "average_rating") \
    .orderBy(col("price").try_cast("double")) \
    .show(10, truncate=50)

print("\nHighest prices:")
metadata_cleaned.filter(col("price").isNotNull()) \
    .select("price", "title", "average_rating") \
    .orderBy(col("price").try_cast("double").desc()) \
    .show(10, truncate=50)

print("\nüí° Cleaning Decision: Review extreme prices manually. Very low prices (<$0.10) may be data errors.")



3. Checking price anomalies...


                                                                                

Products with price < $0.10: 6

Price Statistics:


                                                                                

+-------+-----------------+
|summary|        price_num|
+-------+-----------------+
|  count|           222931|
|   mean|60.79625920127254|
| stddev|471.5184303124185|
|    min|             0.01|
|    max|        129266.64|
+-------+-----------------+


Lowest prices (potential anomalies):


                                                                                

+----------+--------------------------------------------------+--------------+
|     price|                                             title|average_rating|
+----------+--------------------------------------------------+--------------+
|         ‚Äî|Pinniped Projects: Articulating Seal and Sea Li...|           5.0|
|         ‚Äî|               HUF Mechanics Pullover Hoodie Black|           5.0|
|from 29.99|  Understanding Colorectal Cancer Anatomical Chart|           5.0|
|         ‚Äî|Anno Womens Bleach Friendly Surgical Cap Scrub ...|           2.9|
|         ‚Äî|HobbyKing Compact 20A Watt Meter and Servo Powe...|           5.0|
|         ‚Äî|  1/24 „Ç™„Éº„Éä„Éº„Ç∫24 No.12 '73 „ÉÄ„ÉÉ„Ç∏„ÉÅ„É£„É¨„É≥„Ç∏„É£„Éº|           4.7|
|         ‚Äî|Chemistry Equations & Answers Laminate Referenc...|           5.0|
| from 9.98|        Handbook, DOT Hazmat Requirements, English|           4.7|
|         ‚Äî|ACLS (Advanced Cardiac Life Support) Survival C...|           4.3|
|         ‚Äî|La Biblic



+---------+--------------------------------------------------+--------------+
|    price|                                             title|average_rating|
+---------+--------------------------------------------------+--------------+
|129266.64|Senco 08S250W592 2-1/2" x #8 Duraspin Collated ...|           3.5|
|124024.12|Merit Glue Bond Refill for 350-RP UNSCORED, Alu...|           4.0|
|  49999.0|JG MAKER Industrial SLA 3D Printer JG-A600 Larg...|           5.0|
| 32999.99|On/Go One COVID-19 Rapid Antigen Home Test, 1 P...|           5.0|
| 30769.65|                 Weight Set(20), 50 kg-1 g, ASTM 1|           5.0|
| 25666.25|     SPX 2 JAWPULLER, 100 TON, Universal (PH1002J)|           5.0|
| 21385.47|GOLEHS Osmium (Os) Density Cube, Laboratory-Gra...|           1.0|
| 15334.52|Starrett 123Z-72 Vernier Caliper, Steel, Nib St...|           3.0|
| 15183.99|OTC 1854 100-Ton Capacity Shop Press with Elect...|           5.0|
| 14530.99|Fluke Networks DSX2-8000 CableAnalyzer Copper C...|  

                                                                                

In [34]:
print("\n4. Checking invalid average ratings in metadata...")

invalid_avg = metadata_cleaned.filter(
    (col("average_rating") < 0) |
    (col("average_rating") > 5)
).count()

print(f"Invalid average ratings: {invalid_avg}")

if invalid_avg > 0:
    print("\n‚ö†Ô∏è  WARNING: Found invalid average ratings!")
    metadata_cleaned.filter(
        (col("average_rating") < 0) |
        (col("average_rating") > 5)
    ).select("parent_asin", "title", "average_rating", "rating_number").show(10, truncate=False)
    print("\nüí° Cleaning Decision: These should be removed or corrected.")
else:
    print("‚úì All average ratings are within valid range (0.0 - 5.0)")
    
# Additional check: products with 0 rating_number but non-null average_rating
inconsistent_ratings = metadata_cleaned.filter(
    (col("rating_number") == 0) &
    col("average_rating").isNotNull()
).count()
print(f"\nProducts with 0 rating_number but non-null average_rating: {inconsistent_ratings}")
if inconsistent_ratings > 0:
    print("üí° Cleaning Decision: These may need review - rating_number should match average_rating presence.")



4. Checking invalid average ratings in metadata...


                                                                                

Invalid average ratings: 0
‚úì All average ratings are within valid range (0.0 - 5.0)


[Stage 223:>                                                      (0 + 12) / 12]


Products with 0 rating_number but non-null average_rating: 0


                                                                                

In [35]:
print("\n" + "=" * 70)
print("SAVING CLEANED DATA")
print("=" * 70)

# Ensure cleaned directories exist
cleaned_reviews_dir = ROOT_DIR / "data/cleaned/review_categories"
cleaned_meta_dir = ROOT_DIR / "data/cleaned/meta_categories"
cleaned_reviews_dir.mkdir(parents=True, exist_ok=True)
cleaned_meta_dir.mkdir(parents=True, exist_ok=True)

CLEANED_REVIEWS_PATH = cleaned_reviews_dir / "industrial_and_scientific_reviews_cleaned.parquet"
CLEANED_METADATA_PATH = cleaned_meta_dir / "industrial_and_scientific_metadata_cleaned.parquet"

print(f"\nSaving cleaned reviews to: {CLEANED_REVIEWS_PATH}")
reviews_cleaned.write.mode("overwrite").parquet(str(CLEANED_REVIEWS_PATH))
print("‚úì Reviews saved successfully")

print(f"\nSaving cleaned metadata to: {CLEANED_METADATA_PATH}")
metadata_cleaned.write.mode("overwrite").parquet(str(CLEANED_METADATA_PATH))
print("‚úì Metadata saved successfully")

print("\n" + "=" * 70)
print("FINAL CLEANING STATISTICS")
print("=" * 70)

reviews_removed = original_review_count - final_review_count
meta_removed = original_meta_count - cleaned_meta_count
total_removed = reviews_removed + meta_removed

print(f"\nREVIEWS:")
print(f"  Original:     {original_review_count:>12,}")
print(f"  Cleaned:      {final_review_count:>12,}")
print(f"  Removed:      {reviews_removed:>12,} ({(reviews_removed/original_review_count*100):.2f}%)")

print(f"\nMETADATA:")
print(f"  Original:     {original_meta_count:>12,}")
print(f"  Cleaned:      {cleaned_meta_count:>12,}")
print(f"  Removed:      {meta_removed:>12,} ({(meta_removed/original_meta_count*100):.2f}%)")

print(f"\nTOTAL:")
print(f"  Records removed: {total_removed:>12,}")
print(f"  Data retention:   {(100 - (total_removed/(original_review_count + original_meta_count)*100)):>11.2f}%")

print("\n" + "=" * 70)
print("‚úì Data cleaning completed successfully!")
print("=" * 70)



SAVING CLEANED DATA

Saving cleaned reviews to: /Users/andriimyrosh/Projects/amazon-reviews-analysis/data/cleaned/review_categories/industrial_and_scientific_reviews_cleaned.parquet


                                                                                

‚úì Reviews saved successfully

Saving cleaned metadata to: /Users/andriimyrosh/Projects/amazon-reviews-analysis/data/cleaned/meta_categories/industrial_and_scientific_metadata_cleaned.parquet




‚úì Metadata saved successfully

FINAL CLEANING STATISTICS

REVIEWS:
  Original:        5,183,005
  Cleaned:         5,124,927
  Removed:            58,078 (1.12%)

METADATA:
  Original:          427,564
  Cleaned:           427,564
  Removed:                 0 (0.00%)

TOTAL:
  Records removed:       58,078
  Data retention:         98.96%

‚úì Data cleaning completed successfully!


                                                                                