In [3]:
import ast
import gzip
import os
from collections import defaultdict

import wget
from tqdm.notebook import tqdm

In [5]:
base_url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/"
reviews_path = "reviews_Beauty.json.gz"
meta_path = "meta_Beauty.json.gz"

# Fetch each raw archive into the working directory, skipping any file that
# already exists so that re-running the cell does not download it again.
for archive_name in (reviews_path, meta_path):
    if os.path.exists(archive_name):
        continue
    wget.download(base_url + archive_name)

100% [........................................................................] 99156148 / 99156148

In [7]:
# Filtering thresholds used by the preprocessing cell.
# Items with fewer than ITEM_FREQ_MIN reviews are dropped.
ITEM_FREQ_MIN: int = 5
# Users with fewer than REVIEWS_REMOVE_LESS_THAN reviews are dropped.
REVIEWS_REMOVE_LESS_THAN: int = 5

# Output files written by the preprocessing cell.
out_path: str = "Beauty.txt"  # "<user id> <item id>" pairs, one interaction per line
id_to_title_map_path: str = "Beauty-titles.txt"  # item id -> quoted product title
id_to_asin_map_path: str = "Beauty-id_to_asin.txt"  # item id -> ASIN
train_item_freq_path: str = "Beauty-train_item_freq.txt"  # item id -> count in the train split

In [11]:
# load items data
# Builds `items`: ASIN -> cleaned title, for every metadata record whose title
# has at least two characters. `skipped` counts records with a missing or
# too-short title.
#
# Every line is parsed as a Python literal. ast.literal_eval is used instead of
# eval() because the file comes from the network: literal_eval only accepts
# literals (dicts, lists, strings, numbers, ...) and raises on anything else,
# whereas eval() would execute whatever expression a line contains.
items = dict()
skipped = 0
# Text mode with explicit UTF-8: literal_eval requires str, not bytes.
with gzip.open(meta_path, "rt", encoding="utf-8") as f:
    for line in tqdm(f):
        json_obj = ast.literal_eval(line)
        asin = json_obj['asin']
        if 'title' not in json_obj:
            skipped += 1
            continue
        # Normalise the title so it fits on one line inside double quotes
        # (that is how the titles file is written later in this cell).
        title = json_obj['title'].replace("\"", "'")
        title = title.replace("\n", " ")
        if len(title) >= 2:
            items[asin] = title
        else:
            skipped += 1

print(f"Found {len(items)} items")
print(f"Skipped {skipped} items without title")

# load reviews data
# Builds `reviews`: reviewer id -> list of (ASIN, unix review time), and
# `item_freq`: ASIN -> number of reviews. Reviews whose ASIN has no entry in
# `items` are counted in `skipped` and otherwise ignored.
#
# Lines are parsed with ast.literal_eval rather than eval() so that content of
# the downloaded file can never be executed as code.
reviews = defaultdict(list)
item_freq = defaultdict(int)
skipped = 0
# Text mode with explicit UTF-8: literal_eval requires str, not bytes.
with gzip.open(reviews_path, "rt", encoding="utf-8") as f:
    for line in tqdm(f):
        json_obj = ast.literal_eval(line)
        user_id = json_obj['reviewerID']
        asin = json_obj['asin']
        timestamp = json_obj['unixReviewTime']
        if asin in items:
            reviews[user_id].append((asin, int(timestamp)))
            item_freq[asin] += 1
        else:
            skipped += 1

print(f"Found {len(reviews)} users")
print(f"Found {sum(item_freq.values())} reviews")
print(f"Skipepd {skipped} item reviews without metadata")

# Keep only items with at least ITEM_FREQ_MIN reviews, ordered by ASIN.
item_freq = dict(sorted(
    (asin, count) for asin, count in item_freq.items() if count >= ITEM_FREQ_MIN
))

# remove user with less than K reviews
# NOTE(review): the user threshold is applied to the review count *before*
# infrequent items are stripped, so a surviving user can end up with fewer
# than REVIEWS_REMOVE_LESS_THAN reviews - confirm this is intended.
removed_users_less_than = 0
removed_users_item_less_than = 0
removed_items = 0
updated_items = set()  # ASINs that still appear in at least one kept review
for user_id in list(reviews.keys()):  # snapshot: entries are deleted while looping
    user_reviews = reviews[user_id]
    if len(user_reviews) < REVIEWS_REMOVE_LESS_THAN:
        del reviews[user_id]
        removed_users_less_than += 1
        continue

    kept_reviews = [review for review in user_reviews if review[0] in item_freq]
    removed_items += len(user_reviews) - len(kept_reviews)
    if not kept_reviews:
        # Every item this user reviewed was below the item-frequency threshold.
        del reviews[user_id]
        removed_users_item_less_than += 1
        continue

    reviews[user_id] = kept_reviews
    updated_items.update(review[0] for review in kept_reviews)
print(f"Removed {removed_items} reviews of items that appear less than {ITEM_FREQ_MIN} in total")
print(f"Removed {removed_users_less_than} users with less than {REVIEWS_REMOVE_LESS_THAN} actions")
# These users were dropped because of the item threshold, so report ITEM_FREQ_MIN
# (the original interpolated the user threshold here).
print(f"Removed {removed_users_item_less_than} users with only item count less than {ITEM_FREQ_MIN}")

# calculate item frequency again
# The pre-filter counts stay available as `original_item_freq`; `item_freq` is
# rebuilt from the reviews that survived filtering and ordered by ASIN.
original_item_freq = item_freq
recounted = defaultdict(int)
for rating_list in reviews.values():
    for reviewed_asin, _review_time in rating_list:
        recounted[reviewed_asin] += 1

item_freq = dict(sorted(recounted.items()))
print(f"Total of {sum(item_freq.values())} reviews")

# remove "unused" items
# Restrict `items`, `item_freq` and `original_item_freq` to the ASINs that
# still appear in at least one kept review.
#
# The insertion order of `new_items` later defines the numeric item ids, so the
# ASINs are visited in sorted order. Iterating the set directly would give an
# order that depends on string hash randomization, i.e. different item ids on
# every kernel restart.
new_items = {}
new_item_freq = {}
new_original_item_freq = {}
for asin in tqdm(sorted(updated_items)):
    new_items[asin] = items[asin]
    new_item_freq[asin] = item_freq[asin]
    new_original_item_freq[asin] = original_item_freq[asin]
print(f"Removed {len(items) - len(new_items)} items that are not been reviewd")
item_freq = new_item_freq
items = new_items
original_item_freq = new_original_item_freq


# Summary table of the filtered dataset: one header row, one value row.
total_reviews = sum(map(len, reviews.values()))
print()
print("Items   Reviews   Users")
print(f"{len(items):<4}   {total_reviews:<7}   {len(reviews):<5}")

# fix user id
# Original reviewer id -> consecutive 0-based integer, in `reviews` iteration order.
user_id_mapping = {
    original_user_id: index for index, original_user_id in enumerate(reviews)
}

# fix items ids
# ASIN -> consecutive 0-based integer, in `items` iteration order.
item_id_mapping = {asin: index for index, asin in enumerate(items)}

# Per-item occurrence counts for the train / validation / test portions of each
# user's chronologically ordered history: the last item counts towards test,
# the second to last towards validation. Histories shorter than three items
# count entirely towards train.
train_item_freq = dict.fromkeys(item_freq, 0)
val_item_freq = dict.fromkeys(item_freq, 0)
test_item_freq = dict.fromkeys(item_freq, 0)
for rating_list in reviews.values():
    by_time = sorted(rating_list, key=lambda review: review[1])
    chronological = [review[0] for review in by_time]
    if len(chronological) >= 3:
        # NOTE(review): the slice starts at index 1, so the user's earliest item
        # is not counted in the train frequencies (while short histories above
        # count every item) - confirm this is intended rather than [:-2].
        train_list = chronological[1:-2]
        val_item_freq[chronological[-2]] += 1
        test_item_freq[chronological[-1]] += 1
    else:
        train_list = chronological
    for asin in train_list:
        train_item_freq[asin] += 1
 

with open(out_path, "w") as f:
    for user_id, rating_list in reviews.items():
        sorted_list = sorted(rating_list, key=lambda t: t[1])
        for asin, timestamp in sorted_list:
            f.write(f"{user_id_mapping[user_id] + 1} {item_id_mapping[asin] + 1}\n") # start user id from 1 to match original SASRec paper,reserve the 0 index for padding

with open(id_to_title_map_path, "w") as f:
    for asin, title in items.items():
        f.write(f'{item_id_mapping[asin]} "{title}"\n')

with open(id_to_asin_map_path, "w") as f:
    for asin, item_id in item_id_mapping.items():
        f.write(f'{item_id} {asin}\n')

with open(train_item_freq_path, "w") as f:
    for asin, count in train_item_freq.items():
        f.write(f'{item_id_mapping[asin]} {count}\n')

0it [00:00, ?it/s]

Found 258757 items
Skipped 447 items without title


0it [00:00, ?it/s]

Found 1208831 users
Found 2020639 reviews
Skipepd 2431 item reviews without metadata
Removed 74754 reviews of items that appear less than 5 in total
Removed 1156529 users with less than 5 actions
Removed 169 users with only item count less than 5
Total of 394348 reviews


  0%|          | 0/57226 [00:00<?, ?it/s]

Removed 201531 items that are not been reviewd

Items   Reviews   Users
57226   394348    52133
