In [None]:
import re
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import ssl

# Replace the factory that `ssl` uses for default HTTPS contexts with the unverified one,
# when this Python build provides it (presumably so the NLTK downloads below do not fail
# on certificate errors -- TODO confirm).
# NOTE(review): this turns off certificate verification for every default HTTPS context
# created later in this process, not only for the NLTK downloads.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Quietly download the NLTK resources "stopwords" and "punkt"; the return value is
# assigned to `_` so the cell does not display it.
for _resource in ("stopwords", "punkt"):
    _ = nltk.download(_resource, quiet=True)

In [None]:
# Create the data/02_postprocessing directory and its sample/ subdirectory if they do
# not exist. exist_ok=True replaces the separate os.path.exists() checks: it keeps the
# cell safe to re-run and removes the window between checking and creating.
os.makedirs("data/02_postprocessing", exist_ok=True)
os.makedirs("data/02_postprocessing/sample", exist_ok=True)

In [None]:
def silent_remove(filename):
    """Delete ``filename`` on a best-effort basis.

    Any ``OSError`` raised by the deletion (for example because the file does not
    exist) is swallowed, so the call never fails for that reason. Always returns None.
    """
    try:
        os.unlink(filename)
    except OSError:
        return None

This notebook is designed so that it can be run on the sample data or the full data just by switching one variable.

In [None]:
# Switch between the sample data and the full data: every input and output path in this
# notebook gains a "sample/" sub-directory when SAMPLE is True.
# Set this to True to run the notebook on the sample data
# Set this to False to run the notebook on the full data (takes much longer)
SAMPLE = False

## Ratings

This section applies rating normalization (technically, standardization) to all ratings.

In [None]:
# Read the cleaned ratings and users tables (sample or full data, depending on SAMPLE);
# the first row of each file is the header
ratings = pd.read_csv(f"data/01_cleaned/{'sample/' if SAMPLE else ''}ratings.csv", header=0)
users = pd.read_csv(f"data/01_cleaned/{'sample/' if SAMPLE else ''}users.csv", header=0)

In [None]:
# Key every rating by (user_id, rating_id) and every user by user_id
ratings = ratings.set_index(keys=["user_id", "rating_id"])
users = users.set_index(keys="user_id")

In [None]:
# Summarise the "stars" column per user_id: number of ratings, their mean and their
# standard deviation (tuple form of named aggregation: output_column=(column, aggfunc))
stars_by_user = ratings.groupby("user_id")
user_ratings = stars_by_user.agg(
    num_ratings=("stars", "count"),
    mean_rating=("stars", "mean"),
    std_rating=("stars", "std"),
)
user_ratings.head()

In [None]:
# Attach the per-user statistics (num_ratings, mean_rating, std_rating) to the users
# data frame with a left join, so users without a row in user_ratings are kept
users = users.join(user_ratings, on="user_id", how="left")
# Those users get a null num_ratings from the join; set it to 0.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form works on an intermediate object, is deprecated as
# chained assignment in pandas 2.x and leaves `users` unchanged under Copy-on-Write.
users["num_ratings"] = users["num_ratings"].fillna(0)
users.head()

In [None]:
# Flag the users with enough ratings: "above_cutoff" is True when the user has strictly
# more than `cutoff` ratings in the ratings data frame
cutoff = 5 # TODO justify this
users["above_cutoff"] = users["num_ratings"].gt(cutoff)
users.head()

In [None]:
# Standardize the ratings of the users above the cutoff with each user's own mean and
# standard deviation. Start from an all-NaN column (np.nan: the np.NaN alias was removed
# in NumPy 2.0).
ratings["std_rating"] = np.nan
cutoff_users = users[users["above_cutoff"]]
# A user whose ratings are all identical has a standard deviation of 0. Mask those
# zeros so the division yields NaN instead of dividing by zero ...
user_std = cutoff_users["std_rating"].replace(0, np.nan)
standardized = (
    ratings.loc[cutoff_users.index, "stars"] - cutoff_users["mean_rating"]
) / user_std
# ... and score such ratings 0, i.e. exactly at that user's mean, rather than leaving
# NaN in the output. NOTE(review): this would also map a missing "stars" value to 0;
# assumes "stars" has no nulls -- confirm against the cleaning step.
ratings.loc[cutoff_users.index, "std_rating"] = standardized.fillna(0.0)

In [None]:
# Preview the first 20 ratings that belong to the users above the cutoff
ratings.loc[cutoff_users.index].head(20)

In [None]:
# Users at or below the cutoff are standardized together, with one mean and one
# standard deviation computed over all of their ratings.
other_users = users[~users["above_cutoff"]]
# Select their ratings with a boolean mask on the user_id index level instead of
# ratings.loc[other_users.index]: other_users also holds the users whose num_ratings was
# filled with 0, who have no rows in `ratings`, and a label-based lookup of labels that
# are missing from the index raises KeyError in current pandas.
other_mask = ratings.index.get_level_values("user_id").isin(other_users.index)
other_mean = ratings.loc[other_mask, "stars"].mean()
other_std = ratings.loc[other_mask, "stars"].std()
ratings.loc[other_mask, "std_rating"] = (
    ratings.loc[other_mask, "stars"] - other_mean
) / other_std

In [None]:
# Preview the first 20 ratings that belong to the users at or below the cutoff.
# A boolean mask is used because other_users can contain users without any rating, and
# looking those labels up with ratings.loc[other_users.index] raises KeyError in current pandas.
ratings[ratings.index.get_level_values("user_id").isin(other_users.index)].head(20)

In [None]:
# Min-max rescale the standardized ratings onto the interval [1, 5]:
# the smallest std_rating maps to 1 and the largest to 5
min_std = ratings["std_rating"].min()
max_std = ratings["std_rating"].max()
std_span = max_std - min_std
ratings["norm_std_rating"] = ratings["std_rating"].sub(min_std).div(std_span).mul(4).add(1)
ratings.head(20)

In [None]:
# Move the (user_id, rating_id) index levels back into ordinary columns
reindexed_ratings = ratings.reset_index(drop=False)
reindexed_ratings.head()

In [None]:
# Write the post-processed ratings. The path is handed to pandas directly instead of a
# handle from open(file_name, "w"): pandas then opens the file itself with its default
# UTF-8 encoding and newline handling, whereas a text handle opened without newline=""
# can produce blank lines between rows on Windows and uses the platform's default encoding.
file_name = f"data/02_postprocessing/{'sample/' if SAMPLE else ''}ratings.csv"
reindexed_ratings.to_csv(file_name, index=False, header=True)

## Businesses

### Categories >> Type, Keywords

In [None]:
# Load the cleaned businesses table. The path is passed to pandas directly (as is done
# for ratings.csv and users.csv) so the file is decoded with pandas' default UTF-8
# rather than with the platform-dependent default encoding of open().
businesses = pd.read_csv(f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.csv", header=0)

In [None]:
# Group the raw category labels into a handful of broad business types.
# Labels may appear in more than one group (and more than once within a group); the
# insertion order of the groups is Restaurants, Shopping, Home Services, Beauty & Spas,
# Nightlife, Other.
category_groups = {
    'Restaurants': ['Restaurants','Food', 'Breakfast & Brunch', 'Fast Food', 'Burgers', 'Pizza', 'Sandwiches', 'American (Traditional)', 'American (New)', 'Mexican', 'Chicken Wings', 'Salad', 'Chinese', 'Cafes', 'Sushi Bars', 'Barbeque', 'Southern', 'Japanese', 'Steakhouses', 'Juice Bars & Smoothies', 'Asian Fusion', 'Diners', 'Tex-Mex', 'Thai', 'Mediterranean', 'Indian', 'Vietnamese', 'Cajun/Creole', 'Latin American', 'Cuban', 'Puerto Rican', 'Caribbean', 'Japanese', 'Spanish', 'Korean', 'French', 'Halal', 'Mongolian', 'Canadian (New)', 'Filipino', 'Greek', 'Brazilian', 'Argentine', 'Hawaiian', 'Afghan', 'Indonesian', 'German', 'Ramen', 'Poke', 'Hot Pot', 'German', 'Pakistani', 'Uzbek', 'Persian/Iranian', 'Russian', 'Burmese', 'New Mexican Cuisine'],
    'Shopping': ['Shopping', 'Fashion', 'Specialty Food', 'Bakeries', 'Grocery', 'Flowers & Gifts', 'Furniture Stores', 'Jewelry', 'Shoe Stores', 'Accessories', 'Vintage & Consignment', 'Sporting Goods', 'Beer, Wine & Spirits', 'Department Stores', 'Bookstores', 'Electronics', 'Drugstores', 'Music & DVDs', 'Toy Stores', 'Tattoo', 'Eyewear & Opticians', 'Optometrists', 'Comic Books', 'Antiques', 'Gift Shops', 'Mobile Phones', 'Discount Store', 'Hardware Stores', 'Pet Stores', 'Appliances & Repair', 'Appliances', 'Vape Shops', 'Outlet Stores', 'Home & Garden', 'Building Supplies', 'Art Supplies', 'Thrift Stores', 'Hobby Shops', 'Musical Instruments & Teachers', 'Mattresses', 'Bike Repair/Maintenance', 'Books, Mags, Music & Video', 'Computers', 'Mobile Phone Accessories', 'Cosmetics & Beauty Supply', 'Eyelash Service', 'Hair Extensions', 'Hair Stylists', 'Waxing', 'Cosmetic Surgeons', 'Makeup Artists', 'Hair Loss Centers', 'Medical Spas', 'Nail Technicians', 'Hair Salons', 'Nail Salons', 'Massage', 'Day Spas', 'Barbers', 'Massage Therapy', 'Tattoo Removal', 'Piercing', 'Laser Hair Removal', 'Eyebrow Services', 'Permanent Makeup', 'Acne Treatment', 'Weight Loss Centers', 'Medical Supplies', 'Optometrists', 'Health Markets', 'Nutritionists', 'Physical Therapy', 'Dermatologists', 'Chiropractors', 'Hospitals', 'Acupuncture', 'Cryotherapy', 'Medical Centers', 'Sports Medicine', 'Alternative Medicine', 'Prenatal/Perinatal Care', 'Ophthalmologists', 'Internal Medicine', 'Allergists', 'Audiologist', 'Ear Nose & Throat', 'Allergists'],
    'Home Services': ['Home Services', 'Local Services', 'Real Estate', 'Contractors', 'Apartments', 'Home Decor', 'Movers', 'Mattresses', 'Appliances & Repair', 'Appliances', 'Home & Garden', 'Building Supplies', 'Security Systems', 'Waterproofing', 'Insulation Installation', 'Pest Control', 'Heating & Air Conditioning/HVAC', 'Plumbing', 'Carpet Cleaning', 'Air Duct Cleaning', 'Home Inspectors', 'Electricians', 'Solar Installation', 'Solar Panel Cleaning', 'Windows Installation', 'Glass & Mirrors', 'Window Washing', 'Pressure Washers', 'Roofing', 'Gutter Services', 'Siding', 'Carpenters', 'Masonry/Concrete', 'Fireplace Services', 'Carpeting', 'Home Automation', 'Handyman', 'Painters', 'Landscape Architects', 'Irrigation', 'Fences & Gates', 'Pool & Hot Tub Service', 'Pool Cleaners', 'Tree Services', 'Septic Services', 'Water Purification Services', 'Oil Change Stations', 'Excavation Services', 'Snow Removal'],
    'Beauty & Spas': [ 'Beauty & Spas', 'Hair Salons', 'Nail Salons', 'Hair Removal', 'Skin Care', 'Day Spas', 'Barbers', 'Massage', 'Waxing', 'Massage Therapy', 'Tattoo Removal', 'Piercing', 'Laser Hair Removal', 'Eyebrow Services', 'Permanent Makeup', 'Acne Treatment', 'Weight Loss Centers', 'Health Markets', 'Nutritionists', 'Physical Therapy', 'Dermatologists', 'Chiropractors', 'Medical Spas', 'Cosmetic Surgeons', 'Makeup Artists', 'Hair Loss Centers', 'Medical Supplies', 'Optometrists', 'Alternative Medicine', 'Prenatal/Perinatal Care', 'Ophthalmologists', 'Internal Medicine', 'Allergists', 'Audiologist', 'Ear Nose & Throat', 'Allergists'],
    'Nightlife': ['Nightlife', 'Bars', 'Sports Bars', 'Pubs', 'Cocktail Bars', 'Beer Gardens', 'Wine Bars', 'Karaoke', 'Breweries', 'Dive Bars', 'Wine Tasting Room', 'Jazz & Blues', 'Tiki Bars', 'Bartenders', 'Distilleries'],
    'Other': []
}
# Replace every list by a set of lowercased labels (the keys keep their original case);
# only the values of existing keys are reassigned, so iterating over items() is safe
for group_name, group_labels in category_groups.items():
    category_groups[group_name] = {label.lower() for label in group_labels}

In [None]:
# Order of preference used when a business matches several groups:
# Restaurants > Shopping > Home Services > Beauty & Spas > Nightlife > Other
def category_sort(category) -> int:
    """Return the preference rank of a group name (0 is the most preferred)."""
    return ["Restaurants", "Shopping", "Home Services", "Beauty & Spas", "Nightlife", "Other"].index(category)


def _business_type(raw_categories):
    """Collapse one raw ``categories`` cell into a single group name.

    A string cell is split on ", " and lowercased; any other value (e.g. a missing
    one) counts as having no categories. The result is the most preferred group of
    ``category_groups`` that shares at least one label with the cell, or "Other"
    when no group matches.
    """
    if type(raw_categories) == str:
        labels = {label.lower() for label in raw_categories.split(", ")}
    else:
        labels = set()
    matching_groups = [
        group_name
        for group_name, group_labels in category_groups.items()
        if group_labels & labels
    ]
    if not matching_groups:
        return "Other"
    return min(matching_groups, key=category_sort)


# Map every business to its group, then rename the column from "categories" to "type"
businesses["categories"] = businesses["categories"].apply(_business_type)
businesses = businesses.rename(columns={"categories": "type"})
businesses.head()

In [None]:
# Write the businesses table (with the new "type" column). The path is handed to pandas
# directly instead of a handle from open(file_name, "w"): pandas then opens the file
# itself with its default UTF-8 encoding and newline handling, whereas a text handle
# opened without newline="" can produce blank lines between rows on Windows and uses
# the platform's default encoding.
file_name = f"data/02_postprocessing/{'sample/' if SAMPLE else ''}businesses.csv"
businesses.to_csv(file_name, index=False, header=True)

### Keyword Extraction

In [None]:
# Read the cleaned businesses CSV file again, with the original name and categories
# columns. The path is passed to pandas directly so the file is decoded with pandas'
# default UTF-8 rather than with the platform-dependent default encoding of open().
business_df = pd.read_csv(f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.csv", header=0)

In [None]:
# NLTK's English stop words
stop_words = set(stopwords.words("english"))
# Additional tokens to ignore: punctuation, contraction fragments and a few filler words
custom_stop_words = [",", "&", "-", "(", ")", ".", "'", "!", "?", ":", ";", "[", "]", "/", "I", "'ve", "'s", "n't", "a", "``", "also", "'ll", "$", "'d", " "]
# also ignore the numbers 0 to 99, written as strings
custom_stop_words += [str(number) for number in range(100)]
# the full stop-word set is the union of the NLTK words and the custom ones
all_stop_words = stop_words.union(custom_stop_words)

In [None]:
# Extract keywords from the name and categories of every business and build a long
# table with one (business_id, keyword) row per keyword.


def _field_keywords(value):
    """Split one ``name`` / ``categories`` cell into lowercase keyword tokens.

    Returns an empty list for anything that is not a string (e.g. a missing value).
    Stop words are dropped, and so are the empty strings that ``re.split`` produces
    when the text starts or ends with a non-word character.
    """
    if not isinstance(value, str):
        return []
    return [word for word in re.split(r"\W+", value.lower()) if word and word not in all_stop_words]


# Tokenize row by row. The earlier all-or-nothing null check skipped tokenizing the
# categories of every business as soon as a single value was missing, and a missing
# name made re.split() fail.
business_df["categories"] = business_df["categories"].apply(_field_keywords)
business_df["name"] = business_df["name"].apply(_field_keywords)
business_keywords_df = business_df[["business_id", "name", "categories"]].reset_index(drop=True)
# Combine the name and categories token lists into one keywords list per business.
# (Concatenating the columns after astype(str) produced the text repr of the two lists,
# which explode() leaves as a single value instead of one row per keyword.)
business_keywords_df["keywords"] = [
    name_words + category_words
    for name_words, category_words in zip(business_keywords_df["name"], business_keywords_df["categories"])
]
# remove the name and categories columns
business_keywords_df = business_keywords_df.drop(columns=["name", "categories"])
# One row per keyword. A business with an empty keywords list explodes to a single NaN
# row, which is dropped; finally the column is renamed to "keyword".
business_keywords_addition = (
    business_keywords_df.explode("keywords")
    .dropna(subset=["keywords"])
    .rename(columns={"keywords": "keyword"})
    .reset_index(drop=True)
)


In [None]:
# Read the cleaned reviews CSV file. The path is passed to pandas directly so the review
# text is decoded with pandas' default UTF-8 rather than with the platform-dependent
# default encoding of open().
review_df = pd.read_csv(f"data/01_cleaned/{'sample/' if SAMPLE else ''}reviews.csv", header=0)

In [None]:
# Rebuild the stop-word set: NLTK's English stop words together with custom_stop_words
all_stop_words = set(stopwords.words("english")).union(custom_stop_words)

In [None]:
# Lowercase the review text so the stop-word comparison and the keyword counts are
# case-insensitive
review_df["text"] = review_df["text"].str.lower()
# Make the text column a list by tokenizing it
def tokenize(text):
    """Split ``text`` into keyword tokens.

    The text is split on runs of non-word characters; stop words (``all_stop_words``)
    are dropped, as are the empty strings that ``re.split`` yields when the text starts
    or ends with a non-word character.

    Returns an empty list when ``text`` is not a string. This covers ``None`` and also
    the float NaN that pandas uses for missing values, which an ``is None`` check
    does not catch and which would make ``re.split`` raise ``TypeError``.
    """
    if not isinstance(text, str):
        return []
    # raw string: "\W" in a normal string literal is an invalid escape sequence
    return [word for word in re.split(r"\W+", text) if word and word not in all_stop_words]
# Replace every review text by its list of tokens
review_df["text"] = review_df["text"].map(tokenize)

In [None]:
# Long-format keyword table: one (business_id, keyword) row per token of every review
review_keywords_df = (
    review_df[["business_id", "text"]]
    .rename(columns={"text": "keyword"})
    .explode("keyword")
)

In [None]:
# Count how many times each keyword occurs in the reviews of each business;
# the result has the columns business_id, keyword and count
keyword_counts = review_keywords_df.groupby(["business_id", "keyword"]).size()
temp = keyword_counts.rename("count").reset_index()

In [None]:
# For each business id, keep the keywords with the highest count.
# (The comment used to say "top 10" while the code has always kept 5; the number now
# lives in one named constant.)
TOP_KEYWORDS_PER_BUSINESS = 5
# Drop empty-string keywords (produced when a review starts or ends with punctuation)
# before ranking, so that they cannot occupy one of the top slots only to be removed
# later when the keyword tables are merged.
ranked_keywords = temp[temp["keyword"] != ""]
top_keywords_per_business = (ranked_keywords.groupby("business_id")
                             .apply(lambda x: x.nlargest(TOP_KEYWORDS_PER_BUSINESS, "count"))
                             .reset_index(drop=True))

In [None]:
# Preview the first 15 rows of the per-business top keywords
top_keywords_per_business.head(15)

In [None]:
# Stack the name/category keywords on top of the top review keywords, then drop exact
# duplicate (business_id, keyword) rows and renumber the index
business_keywords = pd.concat(
    [business_keywords_addition, top_keywords_per_business[["business_id", "keyword"]]],
    ignore_index=True,
)
business_keywords = business_keywords.drop_duplicates().reset_index(drop=True)

# remove the rows whose keyword is the empty string
has_keyword = business_keywords["keyword"].ne("")
business_keywords = business_keywords[has_keyword]

In [None]:
# Store the business keywords in a csv file. The path is handed to pandas directly
# instead of a handle from open(file_name, "w"): pandas then opens the file itself with
# its default UTF-8 encoding and newline handling, whereas a text handle opened without
# newline="" can produce blank lines between rows on Windows and uses the platform's
# default encoding.
file_name = f"data/02_postprocessing/{'sample/' if SAMPLE else ''}businesses_keywords.csv"
business_keywords.to_csv(file_name, index=False, header=True)