In [30]:
import os
import json
import statistics
from collections import Counter
import sys
!{sys.executable} -m pip install nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download("stopwords")
nltk.download("punkt")





[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anthonywang64/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anthonywang64/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Create the data/02_postprocessing direction if it does not exist
if not os.path.exists("data/02_postprocessing"):
    os.makedirs("data/02_postprocessing")
# Create the data/02_postprocessing/sample directory if it does not exist
if not os.path.exists("data/02_postprocessing/sample"):
    os.makedirs("data/02_postprocessing/sample")

In [21]:
SAMPLE = True

## Rating Standardization

In [22]:
# [!] 4 minutes to load
with open(f"data/01_cleaned/{'sample/' if SAMPLE else ''}reviews.json") as f:
    reviews_original = [json.loads(line) for line in f]
with open(f"data/01_cleaned/{'sample/' if SAMPLE else ''}users.json") as f:
    users_original = [json.loads(line) for line in f]

In [23]:
user_reviews_dict = {}
for review in reviews_original:
    user_id = review["user_id"]
    if user_id not in user_reviews_dict.keys():
        user_reviews_dict[user_id] = {"reviews": [review], "ratings": [(review["review_id"], review["stars"])]}
    else:
        user_reviews_dict[user_id]["reviews"].append(review)
        user_reviews_dict[user_id]["ratings"].append((review["review_id"], review["stars"]))

In [24]:
# Used to determine the cut off for how many reviews a user should have before we
# apply z-score normalization

# This outputs the ratio between the number of reviews that would be standardized
# and all reviews in the dataset

cutoff = 5
num_reviews = [len(user["reviews"]) for user in user_reviews_dict.values()]
review_count = Counter(num_reviews)
review_counter_agg = [(tup[0], tup[1], tup[0] * tup[1]) for tup in review_count.items()]
ratio = sum([tup[2] for tup in review_counter_agg if tup[0] >= cutoff]) / sum([tup[2] for tup in review_counter_agg])
ratio

0.03509198592314955

In [25]:
users_not_standardized = []
users_standardized = []
for user_id, user_reviews in user_reviews_dict.items():
    if len(user_reviews["ratings"]) >= cutoff and len(set(list(zip(*user_reviews["ratings"]))[1])) >= 2:
        users_standardized.append(user_id)
        user_reviews["avg_rating"] = statistics.mean(list(zip(*user_reviews["ratings"]))[1])
        user_reviews["std_dev"] = statistics.stdev(list(zip(*user_reviews["ratings"]))[1])
        user_reviews["standardized_ratings"] = [(rating[0], (rating[1] - user_reviews["avg_rating"]) / user_reviews["std_dev"]) for rating in user_reviews["ratings"]]
    else:
        users_not_standardized.append(user_id)
        
# standardize all other ratings with respect to each other
all_ratings = [rating for user_reviews in [user_reviews_dict[user_id] for user_id in users_not_standardized] for rating in user_reviews["ratings"]]
mean = statistics.mean(list(zip(*all_ratings))[1])
std_dev = statistics.stdev(list(zip(*all_ratings))[1])
pool_standardized_ratings = [(rating[0], (rating[1] - mean) / std_dev) for rating in all_ratings]

# Gather all ratings standardized by user
user_standardized_ratings = [rating for user_reviews in [user_review for user_review in user_reviews_dict.values() if "standardized_ratings" in user_review.keys()] for rating in user_reviews["standardized_ratings"]]

In [26]:
print(f"Number of users: {len(user_reviews_dict)}")
print(f"Number of users with at least {cutoff} reviews: {len(users_standardized)}")
print(f"Number of users with less than {cutoff} reviews: {len(users_not_standardized)}")
print(f"Number of ratings that were user-standardized: {len(user_standardized_ratings)}")
print(f"Number of ratings that were pool-standardized: {len(pool_standardized_ratings)}")

Number of users: 61175
Number of users with at least 5 reviews: 358
Number of users with less than 5 reviews: 60817
Number of ratings that were user-standardized: 2413
Number of ratings that were pool-standardized: 67489


In [27]:
reviews = {review["review_id"]: review for review in reviews_original}
for rating in user_standardized_ratings + pool_standardized_ratings:
    reviews[rating[0]]["standardized_rating"] = rating[1]

In [28]:
with open(f"data/02_postprocessing/{'sample/' if SAMPLE else ''}reviews.json", "w") as f:
    f.write("")
with open(f"data/02_postprocessing/{'sample/' if SAMPLE else ''}reviews.json", "a") as f:
    for review in reviews.values():
        f.write(json.dumps(review) + "\n")

## Keyword Extraction

In [40]:
stop_words = set(stopwords.words("english"))

# read business JSON file
with open(f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.json") as f:
    businesses = [json.loads(line) for line in f]

# Extract keywords from each business entry for category and name attributes using dictionary
business_keywords = {}
for business in businesses:
    business_keywords[business["business_id"]] = {}
    business_keywords[business["business_id"]]["category"] = []
    business_keywords[business["business_id"]]["name"] = []
    for word in word_tokenize(business["categories"]):
        if word not in stop_words and word != "," and word != "&" and word != "-" and word != "(" and word != ")" and word != "." and word != "'" and word != "!" and word != "?" and word != ":" and word != ";" and word != "[" and word != "]" and word != "/":
            business_keywords[business["business_id"]]["category"].append(word)
    for word in word_tokenize(business["name"]):
        if word not in stop_words and word != "," and word != "&" and word != "-" and word != "(" and word != ")" and word != "." and word != "'" and word != "!" and word != "?" and word != ":" and word != ";" and word != "[" and word != "]" and word != "/":
            business_keywords[business["business_id"]]["name"].append(word)

# read review JSON file
with open(f"data/02_postprocessing/{'sample/' if SAMPLE else ''}reviews.json") as f:
    reviews = [json.loads(line) for line in f]

# Extract keywords from each review entry for text attribute using dictionary
review_keywords = {}
for review in reviews:
    review_keywords[review["review_id"]] = []
    for word in word_tokenize(review["text"]):
        if word not in stop_words and word != "," and word != "&" and word != "-" and word != "(" and word != ")" and word != "." and word != "'" and word != "!" and word != "?" and word != ":" and word != ";" and word != "[" and word != "]" and word != "/":
            review_keywords[review["review_id"]].append(word)

# Create a dictionary of business_id to a list of review_ids
business_review_dict = {}
for review in reviews:
    if review["business_id"] not in business_review_dict.keys():
        business_review_dict[review["business_id"]] = [review["review_id"]]
    else:
        business_review_dict[review["business_id"]].append(review["review_id"])

# combine keywords from all reviews, category, and name for each business
# some businesses do not have any reviews, so we only combine category and name keywords for those businesses using dictionary
business_keywords_combined = {}
for business in businesses:
    business_keywords_combined[business["business_id"]] = []
    if business["business_id"] in business_review_dict.keys():
        for review_id in business_review_dict[business["business_id"]]:
            business_keywords_combined[business["business_id"]] += review_keywords[review_id]
    business_keywords_combined[business["business_id"]] += business_keywords[business["business_id"]]["category"]
    business_keywords_combined[business["business_id"]] += business_keywords[business["business_id"]]["name"]



# print the keywords for each business with their name
for business in businesses:
    print(business["name"] + ": ")
    print(business_keywords_combined[business["business_id"]])
    print("\n")




Famous Footwear: 
['Sporting', 'Goods', 'Fashion', 'Shoe', 'Stores', 'Shopping', 'Sports', 'Wear', 'Accessories', 'Famous', 'Footwear']


Pathmark: 
['Food', 'Grocery', 'Pathmark']


Sunoco: 
['Restaurants', 'Delis', 'Sandwiches', 'Sunoco']


Indy Soft Water: 
['Local', 'Services', 'Water', 'Purification', 'Services', 'Plumbing', 'Water', 'Suppliers', 'Utilities', 'Home', 'Services', 'Indy', 'Soft', 'Water']


New Orleans Hamburger & Seafood Co.: 
['This', 'place', 'surprised', 'I', "n't", 'expecting', 'like', 'much', 'I', "'ve", 'never', 'thin', 'cat', 'fish', 'good', 'And', 'hush', 'puppies', 'delicious', 'The', 'people', 'friendly', 'get', 'free', 'ice', 'cream', 'cones', 'They', 'put', 'lot', 'cheese', 'cheese', 'burgers', 'amazing', 'I', 'really', 'feel', 'get', 'money', "'s", 'worth', 'We', 'came', 'two', 'times', 'vacation', 'Fish', 'Chips', 'Burgers', 'Seafood', 'Restaurants', 'New', 'Orleans', 'Hamburger', 'Seafood', 'Co']


Taqueria La Hacienda: 
['Latin', 'American', 'Restau