In [None]:
import os
import json
import statistics
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import ssl

# Workaround for environments where certificate verification of the NLTK
# download fails: HTTPS certificate verification is switched off while the
# NLTK resources are fetched.
# NOTE(review): disabling verification is a security trade-off, so the override
# is limited to the two downloads below and the previous context factory is
# restored afterwards instead of staying active for the whole kernel session.
_verified_https_context = getattr(ssl, "_create_default_https_context", None)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl build has no unverified-context helper; keep default verification.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    # Resources used later by stopwords.words(...) and word_tokenize(...).
    # NOTE(review): recent NLTK releases may additionally need "punkt_tab" for
    # word_tokenize -- confirm against the installed NLTK version.
    nltk.download("stopwords")
    nltk.download("punkt")
finally:
    # Re-enable normal certificate verification for anything that runs later.
    if _verified_https_context is not None:
        ssl._create_default_https_context = _verified_https_context

In [None]:
# Create the data/02_postprocessing directory and its sample sub-directory if
# they do not exist. os.makedirs creates missing parent directories, and
# exist_ok=True makes the call a no-op when the directories are already there
# (no separate existence check, so no check-then-create race).
os.makedirs("data/02_postprocessing/sample", exist_ok=True)

In [None]:
def silent_remove(filename):
    """Delete ``filename`` on a best-effort basis.

    Any ``OSError`` raised by the deletion (for example because the file does
    not exist) is swallowed, so the call never fails. Always returns ``None``.
    """
    try:
        os.unlink(filename)
    except OSError:
        # Best effort: there is nothing to clean up, or it cannot be removed.
        return None

This notebook is designed so that it can be run on the sample data or the full data just by switching one variable.

In [None]:
# Set this to True to run the notebook on the sample data
# Set this to False to run the notebook on the full data (takes much longer)
# When True, the data paths used below point into a "sample" sub-directory.
SAMPLE = False

## Ratings

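This section normalizes all ratings (technically, it standardizes them to z-scores). Users with enough ratings are standardized against their own mean and standard deviation; all remaining ratings are standardized together as one pool.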

In [None]:
# [!] 4 minutes to load
# Both inputs are read line by line, one JSON document per line.
ratings_path = f"data/01_cleaned/{'sample/' if SAMPLE else ''}ratings.json"
users_path = f"data/01_cleaned/{'sample/' if SAMPLE else ''}users.json"

with open(ratings_path) as ratings_file:
    ratings_original = list(map(json.loads, ratings_file))

with open(users_path) as users_file:
    users_original = list(map(json.loads, users_file))

In [None]:
# Group the ratings by the user who wrote them. For every user_id we keep the
# full rating objects ("rating_objs") and compact (rating_id, stars) pairs
# ("ratings").
user_ratings_dict = {}
for rating in ratings_original:
    id_and_stars = (rating["rating_id"], rating["stars"])
    per_user = user_ratings_dict.setdefault(
        rating["user_id"], {"rating_objs": [], "ratings": []}
    )
    per_user["rating_objs"].append(rating)
    per_user["ratings"].append(id_and_stars)

In [None]:
# Helps choose the cut-off: the minimum number of ratings a user must have
# before their ratings are z-score normalized against their own statistics.
#
# The displayed value is the share of all ratings in the dataset that belong
# to users with at least `cutoff` ratings.

cutoff = 5
num_ratings = [len(user["ratings"]) for user in user_ratings_dict.values()]
rating_count = Counter(num_ratings)
# (ratings per user, number of such users, total ratings contributed by them)
rating_counter_agg = [
    (per_user, n_users, per_user * n_users)
    for per_user, n_users in rating_count.items()
]
ratings_at_or_above_cutoff = sum(
    total for per_user, _, total in rating_counter_agg if per_user >= cutoff
)
ratings_overall = sum(total for _, _, total in rating_counter_agg)
ratio = ratings_at_or_above_cutoff / ratings_overall
ratio

In [None]:
# Split users into those standardized against their own mean / standard
# deviation and those left for the pooled standardization.
users_not_standardized = []
users_standardized = []
for user_id, user_ratings in user_ratings_dict.items():
    stars = [pair[1] for pair in user_ratings["ratings"]]

    # A user needs at least `cutoff` ratings and at least two distinct star
    # values; with identical values the standard deviation would be zero.
    if len(stars) < cutoff or len(set(stars)) < 2:
        users_not_standardized.append(user_id)
        continue

    users_standardized.append(user_id)
    user_mean = statistics.mean(stars)
    user_std = statistics.stdev(stars)
    user_ratings["avg_rating"] = user_mean
    user_ratings["std_dev"] = user_std
    user_ratings["standardized_ratings"] = [
        (rating_id, (value - user_mean) / user_std)
        for rating_id, value in user_ratings["ratings"]
    ]

# Gather all ratings standardized by user as (rating_id, z-score) pairs
user_standardized_ratings = [
    rating
    for user_ratings in user_ratings_dict.values()
    if "standardized_ratings" in user_ratings
    for rating in user_ratings["standardized_ratings"]
]

In [None]:
# Standardize all other ratings (those of users that were not standardized
# individually) with respect to each other, as a single pool.
all_ratings = [
    rating
    for user_id in users_not_standardized
    for rating in user_ratings_dict[user_id]["ratings"]
]
pool_stars = [stars for _, stars in all_ratings]

# Guard the degenerate pools (likely only on small samples):
#   * empty pool          -> nothing to standardize, mean/std_dev fall back to 0.0
#   * a single rating     -> statistics.stdev needs at least two data points
#   * identical ratings   -> std_dev is 0, so the z-score is undefined
# In the last two cases every pooled rating gets a standardized value of 0.0
# instead of raising an exception.
mean = statistics.mean(pool_stars) if pool_stars else 0.0
std_dev = statistics.stdev(pool_stars) if len(pool_stars) >= 2 else 0.0
pool_standardized_ratings = [
    (rating_id, (stars - mean) / std_dev if std_dev else 0.0)
    for rating_id, stars in all_ratings
]

In [None]:
# Summary of the standardization split. A user is standardized individually
# only with at least `cutoff` ratings AND at least two distinct star values;
# every other user goes to the pool, so the labels describe both conditions.
print(f"Number of users: {len(user_ratings_dict)}")
print(
    f"Number of users standardized individually "
    f"(at least {cutoff} ratings and 2+ distinct star values): {len(users_standardized)}"
)
print(
    f"Number of users standardized with the pool "
    f"(fewer than {cutoff} ratings or identical star values): {len(users_not_standardized)}"
)
print(f"Number of ratings that were user-standardized: {len(user_standardized_ratings)}")
print(f"Number of ratings that were pool-standardized: {len(pool_standardized_ratings)}")

In [None]:
# Index the original rating objects by rating_id, then store each computed
# z-score on its rating object under "standardized_rating".
ratings = {}
for rating in ratings_original:
    ratings[rating["rating_id"]] = rating

for rating_id, z_score in user_standardized_ratings + pool_standardized_ratings:
    ratings[rating_id]["standardized_rating"] = z_score

In [None]:
# Write the ratings (now carrying "standardized_rating") as one JSON object per
# line. Opening with "w" truncates any previous file, so a separate
# truncate-then-append step is not needed.
ratings_output_path = f"data/02_postprocessing/{'sample/' if SAMPLE else ''}ratings.json"
with open(ratings_output_path, "w") as f:
    for rating in ratings.values():
        f.write(json.dumps(rating) + "\n")

## Businesses

### Categories >> Type, Keywords

In [None]:
# Read the cleaned businesses file: one JSON document per line.
businesses_input_path = f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.json"
with open(businesses_input_path) as businesses_file:
    businesses = list(map(json.loads, businesses_file))

In [None]:
# Count how often each category occurs across the businesses. The "categories"
# value is split on ", "; businesses with a missing or empty value are skipped.
category_counts = Counter()
for business in businesses:
    raw_categories = business.get("categories", '')
    if raw_categories:
        category_counts.update(raw_categories.split(', '))

# Re-bind category_counts to (category, count) pairs, most frequent first
category_counts = category_counts.most_common()

# Names of the five most frequent categories
top_categories = [name for name, _ in category_counts[:5]]

# Display the top 5 most frequent categories
top_categories

In [None]:
# Keep only the businesses that have at least one of the top 5 categories.
# Each kept business gets a new "type" attribute holding the first of its own
# categories that is a top category; the result is keyed by business_id.
businesses_dict = {}
for business in businesses:
    raw_categories = business.get("categories", '')
    if not raw_categories:
        continue
    matched_type = next(
        (name for name in raw_categories.split(', ') if name in top_categories),
        None,
    )
    if matched_type is not None:
        business["type"] = matched_type
        businesses_dict[business["business_id"]] = business

In [None]:
# Replace the cleaned businesses file with the filtered businesses (each now
# carrying "type"), one JSON object per line.
# NOTE: this overwrites the input file in data/01_cleaned in place; the
# keyword-extraction cell reads the filtered businesses back from this path.
# A single path is used for the whole step, and "w" truncates the old file, so
# no separate remove-then-append is needed.
businesses_output_path = f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.json"
with open(businesses_output_path, "w") as f:
    for business in businesses_dict.values():
        f.write(json.dumps(business) + "\n")

### Keyword Extraction

In [None]:
stop_words = set(stopwords.words("english"))

# Punctuation tokens that are never kept as keywords.
PUNCTUATION_TOKENS = {",", "&", "-", "(", ")", ".", "'", "!", "?", ":", ";", "[", "]", "/"}

# How many businesses to print at the end of this cell. Printing every business
# with all of its review keywords floods the notebook output on the full data.
# Set to None to print every business.
KEYWORD_PREVIEW_COUNT = 5


def extract_keywords(text):
    """Return the tokens of ``text`` that are neither stop words nor punctuation.

    Args:
        text: String to tokenize. ``None`` or an empty string yields ``[]``.

    Returns:
        List of tokens in their original order and original casing.
    """
    return [
        word
        for word in word_tokenize(text or "")
        # The NLTK stop-word list is lowercase, so compare case-insensitively;
        # otherwise capitalized stop words such as "The" are kept as keywords.
        if word.lower() not in stop_words and word not in PUNCTUATION_TOKENS
    ]


# read business JSON file
with open(f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.json") as f:
    businesses = [json.loads(line) for line in f]

# Extract keywords from each business entry for the category and name attributes
business_keywords = {
    business["business_id"]: {
        # "categories" may be missing or null; extract_keywords treats that as empty
        "category": extract_keywords(business.get("categories")),
        "name": extract_keywords(business["name"]),
    }
    for business in businesses
}

# read review JSON file
# NOTE(review): this notebook does not write reviews.json into
# data/02_postprocessing in the cells shown here -- presumably another step
# produces it; confirm it exists before running this cell.
with open(f"data/02_postprocessing/{'sample/' if SAMPLE else ''}reviews.json") as f:
    reviews = [json.loads(line) for line in f]

# Extract keywords from the text attribute of each review
review_keywords = {
    review["review_id"]: extract_keywords(review["text"]) for review in reviews
}

# Create a dictionary of business_id to a list of review_ids
business_review_dict = {}
for review in reviews:
    business_review_dict.setdefault(review["business_id"], []).append(review["review_id"])

# Combine keywords from all reviews, category, and name for each business.
# Some businesses do not have any reviews, so only the category and name
# keywords are combined for those businesses.
business_keywords_combined = {}
for business in businesses:
    business_id = business["business_id"]
    combined = []
    for review_id in business_review_dict.get(business_id, []):
        combined += review_keywords[review_id]
    combined += business_keywords[business_id]["category"]
    combined += business_keywords[business_id]["name"]
    business_keywords_combined[business_id] = combined

# print the keywords with the business name for a preview of the businesses
for business in businesses[:KEYWORD_PREVIEW_COUNT]:
    print(business["name"] + ": ")
    print(business_keywords_combined[business["business_id"]])
    print("\n")


