In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct
from bs4 import BeautifulSoup

In [None]:
# Directories holding the review files and the product-metadata files. The
# paths are relative, so they resolve against the notebook's working directory.
AMAZON_REVIEWS = 'amazon_reviews'
AMAZON_METADATA = 'amazon_metadata'

# Reuse the active SparkSession if there is one, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

Number of `.gz` review files: 2738

Number of `.gz` metadata files: 1503

In [3]:
# os.listdir returns directory entries in arbitrary order. These lists are
# sliced with [:n] when the files are read, so sort them to make the selected
# subset identical on every run and on every machine.
review_files = sorted(os.path.join(AMAZON_REVIEWS, name) for name in os.listdir(AMAZON_REVIEWS))
metadata_files = sorted(os.path.join(AMAZON_METADATA, name) for name in os.listdir(AMAZON_METADATA))

# print(f'Number review files: {len(review_files)}')
# print(f'Number metadata files: {len(metadata_files)}')

In [None]:
# Number of files to load from each list. These match the file counts reported
# earlier in the notebook, so the slices below select every file; lower them to
# work on a subset.
num_review_files = 2738
num_metadata_files = 1503

# Load the selected JSON files into Spark DataFrames (no explicit schema is given).
df_reviews = spark.read.json(review_files[:num_review_files])
df_metadata = spark.read.json(metadata_files[:num_metadata_files])

Number of reviews: 139,832,500

Number of metadata records: 15,023,059

In [5]:
# Row counts of df_reviews and df_metadata, hard-coded (presumably from an earlier
# run of the commented-out count() calls below) so the full pass over the data
# does not have to be repeated. Uncomment the count() lines to recompute them.
num_reviews = 139832500
num_metadata = 15023059
# num_reviews = df_reviews.count()
# num_metadata = df_metadata.count()

# print(f'Number of reviews: {num_reviews}')
# print(f'Number of metadata: {num_metadata}')

In [6]:
# df_reviews.show()
# List the column names of the reviews DataFrame.
print(df_reviews.columns)

['asin', 'image', 'overall', 'reviewText', 'reviewerID', 'reviewerName', 'style', 'summary', 'unixReviewTime', 'verified', 'vote']


In [7]:
# df_metadata.show()
# List the column names of the metadata DataFrame.
print(df_metadata.columns)

['also_buy', 'also_view', 'asin', 'brand', 'category', 'date', 'description', 'details', 'feature', 'fit', 'image', 'main_cat', 'price', 'rank', 'similar_item', 'tech1', 'tech2', 'title']


## Explore reviews

Check for duplicate reviews. The `reviewerName` column is excluded from the comparison.

In [35]:
# Two reviews count as duplicates when they agree on every column listed here,
# i.e. on all review columns except reviewerName.
selected_colums = ['asin', 'image', 'overall', 'reviewText', 'reviewerID', 'style', 'summary', 'unixReviewTime', 'verified', 'vote']
df_reviews_new = df_reviews.dropDuplicates(selected_colums)
# num_unique_reviews = df_reviews_new.count()
# Hard-coded row count of df_reviews_new (presumably the result of the
# commented-out count() above); must be refreshed if the input data changes.
num_unique_reviews = 138482432

There are 138,482,432 unique reviews and 1,350,068 duplicates.

In [9]:
# num_duplications = num_reviews - num_unique_reviews
# print(f"There are {num_unique_reviews} unique products id, {num_duplications} duplications.")

There are 138482432 unique products id, 1350068 duplications.


## Explore metadata

There are multiple entries for the same product id.

The column values of entries that share the same product id are identical.

In [10]:
# df_metadata.select(countDistinct('asin')).show()
# df_metadata.distinct().count() # Compare all columns, slower, same result.

Remove duplicates

In [31]:
# Keep a single metadata row per product id (asin).
selected_colums = ['asin']
df_metadata_new = df_metadata.dropDuplicates(selected_colums)
# num_unique_metadata = df_metadata_new.count()
# Hard-coded row count of df_metadata_new (presumably the result of the
# commented-out count() above); must be refreshed if the input data changes.
num_unique_metadata = 14741571

There are 14,741,571 unique product ids and 281,488 duplicates.

In [12]:
# num_duplications = num_metadata - num_unique_metadata
# print(f"There are {num_unique_metadata} unique products id, {num_duplications} duplications.")

## Query to get number of reviews by category

In [15]:
# Pair every deduplicated review with the metadata row of the same product
# (join on asin, default join type) and keep only the product id and the
# product's main category.
df_reviews_cat = (
    df_reviews_new
    .join(df_metadata_new, on=df_reviews_new.asin == df_metadata_new.asin)
    .select(df_reviews_new.asin, df_metadata_new.main_cat)
)

In [None]:
# Count the joined reviews per main category, collect the grouped rows to the
# driver, and index them as {main_cat: number of reviews}.
cat_count_rows = df_reviews_cat.groupBy('main_cat').count().collect()
cat_count = dict((row['main_cat'], row['count']) for row in cat_count_rows)

In [83]:
def _normalize_category(key):
    """Return the cleaned-up label for one raw ``main_cat`` value."""
    if not isinstance(key, str):
        # A null main_cat comes back from Spark as None; leave such keys as-is
        # instead of crashing on .startswith.
        return key
    if key.startswith('<'):
        # The category is an HTML snippet: use the alt text of its <img> tag.
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (and to avoid bs4's parser warning).
        img = BeautifulSoup(key, 'html.parser').img
        alt = img.get('alt') if img is not None else None
        # No <img alt="..."> found: keep the raw value rather than raising.
        return alt.title() if alt is not None else key
    # Plain text: only the HTML-escaped ampersand needs decoding.
    return key.replace('&amp;', '&')


def get_cat_count_proc(cat_count):
    """Merge per-category review counts whose raw labels name the same category.

    Raw ``main_cat`` values come in three shapes: plain text, text containing
    ``&amp;``, and HTML snippets whose ``<img>`` alt text holds the name. Each
    key is normalised and the counts of keys that normalise to the same label
    are summed.

    Args:
        cat_count: mapping of raw category label -> number of reviews.

    Returns:
        A new dict mapping normalised label -> summed number of reviews. The
        total of the values equals the total of ``cat_count``'s values.
    """
    cat_count_proc = {}

    for raw_key, count in cat_count.items():
        label = _normalize_category(raw_key)
        # Always accumulate: a plain key may arrive after an escaped/HTML key
        # that already normalised to the same label, and must not overwrite it.
        cat_count_proc[label] = cat_count_proc.get(label, 0) + count

    return cat_count_proc

# Normalise the raw category labels, then check the invariant that merging
# labels neither gained nor lost any reviews.
cat_count_proc = get_cat_count_proc(cat_count)
assert sum(cat_count.values()) == sum(cat_count_proc.values())

In [89]:
# Show the categories ranked by number of reviews, largest first.
dict(
    sorted(
        cat_count_proc.items(),
        key=lambda item: item[1],
        reverse=True,
    )
)

{'Books': 48073131,
 'Amazon Fashion': 21367039,
 'Amazon Home': 11169687,
 'Sports & Outdoors': 6398024,
 'Buy a Kindle': 6085927,
 'Movies & TV': 5727439,
 'Tools & Home Improvement': 4544184,
 'Cell Phones & Accessories': 3838068,
 'Digital Music': 3641863,
 'Automotive': 3551905,
 'Toys & Games': 2891662,
 'Grocery': 2252131,
 'Office Products': 2226526,
 'Pet Supplies': 2106744,
 'All Electronics': 2083045,
 'Computers': 1671578,
 'Arts, Crafts & Sewing': 1437428,
 'Home Audio & Theater': 1385618,
 'Video Games': 1300090,
 'Camera & Photo': 1033416,
 'Industrial & Scientific': 832065,
 'Health & Personal Care': 753424,
 'Luxury Beauty': 465270,
 'Prime Pantry': 450280,
 '': 405417,
 'All Beauty': 364554,
 'Software': 312193,
 'Musical Instruments': 287174,
 'Baby': 274627,
 'Car Electronics': 208936,
 'Audible Audiobooks': 146471,
 'Amazon Devices': 138597,
 'Gift Cards': 100165,
 'Vehicles': 60559,
 'Appliances': 52204,
 'Portable Audio & Accessories': 42937,
 'Alexa Skills': 230

In [36]:
# cat_count holds the per-category row counts of the review/metadata join, so
# its total is the number of deduplicated reviews that matched a product.
num_reviews_with_product_metadata = sum(cat_count.values())
# The remaining deduplicated reviews had no metadata row with the same asin.
num_reviews_without_product_metadata = num_unique_reviews - num_reviews_with_product_metadata

print(f'Number of reviews with product metadata: {num_reviews_with_product_metadata}')
print(f'Number of reviews without product metadata: {num_reviews_without_product_metadata}')

Number of reviews with product metadata: 137759876
Number of reviews without product metadata: 722556
