In [None]:
import os
import json
import statistics
from collections import Counter

In [None]:
# Create the data/02_postprocessing directory and its sample/ subdirectory.
# makedirs builds missing parents, so one call covers both; exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race.
os.makedirs("data/02_postprocessing/sample", exist_ok=True)

In [None]:
# Dataset switch: when True, the cells below read from and write to the
# "sample/" subdirectory of the data folders; when False they use the files
# directly under data/01_cleaned and data/02_postprocessing.
SAMPLE = False

## Rating Standardization

In [None]:
# [!] 4 minutes to load
def load_json_lines(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # text encoding; blank lines are skipped instead of failing in json.loads.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


reviews_original = load_json_lines(f"data/01_cleaned/{'sample/' if SAMPLE else ''}reviews.json")
users_original = load_json_lines(f"data/01_cleaned/{'sample/' if SAMPLE else ''}users.json")

In [None]:
# Group the reviews by author. Each user_id maps to a dict holding the user's
# full review records ("reviews") and the matching (review_id, stars) pairs
# ("ratings"), both in the order the reviews were loaded.
user_reviews_dict = {}
for record in reviews_original:
    author = record["user_id"]
    rating_pair = (record["review_id"], record["stars"])
    bucket = user_reviews_dict.setdefault(author, {"reviews": [], "ratings": []})
    bucket["reviews"].append(record)
    bucket["ratings"].append(rating_pair)

In [None]:
# Helps choose the minimum number of reviews a user must have before their
# ratings are z-score normalized on their own.
#
# The cell displays the share of all reviews in the dataset that were written
# by users with at least `cutoff` reviews.

cutoff = 5
num_reviews = [len(entry["reviews"]) for entry in user_reviews_dict.values()]
review_count = Counter(num_reviews)
# One tuple per distinct review count:
# (reviews per user, number of such users, total reviews they contribute)
review_counter_agg = [
    (per_user, n_users, per_user * n_users)
    for per_user, n_users in review_count.items()
]
eligible_reviews = sum(total for per_user, _, total in review_counter_agg if per_user >= cutoff)
all_reviews = sum(total for _, _, total in review_counter_agg)
ratio = eligible_reviews / all_reviews
ratio

In [None]:
# Split users into two groups and z-score their ratings:
#   * users_standardized: at least `cutoff` reviews AND at least two distinct
#     star values, so the user's own standard deviation is non-zero; ratings
#     are standardized against that user's mean / std dev.
#   * users_not_standardized: everyone else; their ratings are pooled and
#     standardized against the pool's mean / std dev.
users_not_standardized = []
users_standardized = []
for user_id, user_reviews in user_reviews_dict.items():
    # Extract the star values once instead of re-zipping the pairs repeatedly.
    user_stars = [value for _, value in user_reviews["ratings"]]
    if len(user_stars) >= cutoff and len(set(user_stars)) >= 2:
        users_standardized.append(user_id)
        user_reviews["avg_rating"] = statistics.mean(user_stars)
        user_reviews["std_dev"] = statistics.stdev(user_stars)
        user_reviews["standardized_ratings"] = [
            (review_id, (value - user_reviews["avg_rating"]) / user_reviews["std_dev"])
            for review_id, value in user_reviews["ratings"]
        ]
    else:
        users_not_standardized.append(user_id)
        # Remove per-user statistics left behind by an earlier run of this cell
        # (e.g. with a smaller cutoff); otherwise this user's ratings would be
        # collected as both user-standardized and pool-standardized.
        for stale_key in ("avg_rating", "std_dev", "standardized_ratings"):
            user_reviews.pop(stale_key, None)

# standardize all other ratings with respect to each other
all_ratings = [
    rating
    for pooled_user_id in users_not_standardized
    for rating in user_reviews_dict[pooled_user_id]["ratings"]
]
pool_values = [value for _, value in all_ratings]
# statistics.mean needs at least one value and statistics.stdev at least two;
# guard both so a tiny or empty pool (possible on the sample data) does not crash.
mean = statistics.mean(pool_values) if pool_values else None
std_dev = statistics.stdev(pool_values) if len(pool_values) >= 2 else 0.0
if std_dev > 0:
    pool_standardized_ratings = [
        (review_id, (value - mean) / std_dev) for review_id, value in all_ratings
    ]
else:
    # No spread in the pool: the z-score is undefined, so place every pooled
    # rating at the mean (0.0) rather than dividing by zero.
    pool_standardized_ratings = [(review_id, 0.0) for review_id, _ in all_ratings]

# Gather all ratings standardized by user (same order as users_standardized)
user_standardized_ratings = [
    rating
    for standardized_user_id in users_standardized
    for rating in user_reviews_dict[standardized_user_id]["standardized_ratings"]
]

In [None]:
# Summary of the split. The labels spell out both membership conditions: a user
# is standardized individually only with at least `cutoff` reviews AND at least
# two distinct ratings; all remaining users are standardized as one pool.
print(f"Number of users: {len(user_reviews_dict)}")
print(f"Number of users standardized individually (at least {cutoff} reviews and at least 2 distinct ratings): {len(users_standardized)}")
print(f"Number of users standardized as a pool (fewer than {cutoff} reviews or no rating variance): {len(users_not_standardized)}")
print(f"Number of ratings that were user-standardized: {len(user_standardized_ratings)}")
print(f"Number of ratings that were pool-standardized: {len(pool_standardized_ratings)}")

In [None]:
# Index the loaded reviews by review_id, then store each computed z-score on
# its review under "standardized_rating". The indexed values are the same dict
# objects held in reviews_original, so those records gain the key as well.
reviews = {}
for record in reviews_original:
    reviews[record["review_id"]] = record
for review_id, z_score in user_standardized_ratings + pool_standardized_ratings:
    reviews[review_id]["standardized_rating"] = z_score

In [None]:
# Write every review as one JSON object per line. Mode "w" already truncates an
# existing file, so a single open replaces the separate truncate-then-append.
with open(f"data/02_postprocessing/{'sample/' if SAMPLE else ''}reviews.json", "w", encoding="utf-8") as f:
    for review in reviews.values():
        f.write(json.dumps(review) + "\n")