In [None]:
import json
import os
from collections import Counter

In [None]:
# Ensure the output directories for the cleaned data exist.
# os.makedirs creates intermediate directories, so creating the deepest path
# also creates "data/01_cleaned"; exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists test.
os.makedirs("data/01_cleaned/sample", exist_ok=True)

In [None]:
# When True, the input and output paths built in the cells below gain a
# "sample/" segment (data/00_original/sample/..., data/01_cleaned/sample/...);
# when False, the files directly under those directories are used.
SAMPLE = False

## Businesses

In [None]:
# Load the raw business records into the list `businesses`.
# The input is read line by line, one JSON object per line. The encoding is
# pinned to UTF-8 so decoding does not depend on the platform's locale default,
# and blank lines are skipped because json.loads would raise on them.
with open(
    f"data/00_original/{'sample/' if SAMPLE else ''}yelp_academic_dataset_business.json",
    "r",
    encoding="utf-8",
) as f:
    businesses = [json.loads(line) for line in f if line.strip()]

In [None]:
# Fields to retain on each business record; the filtering cell below removes
# every other key.
# NOTE: the name `keys_to_keep` is reassigned in the Reviews and Users
# sections, so re-run this cell before re-running the business filter.
keys_to_keep = [
    "longitude",
    "name",
    "categories",
    "review_count",
    "stars",
    "latitude",
    "business_id",
]

In [None]:
# Trim every business record, in place, down to the fields listed in
# keys_to_keep. The unwanted keys are collected first so the dict is never
# modified while it is being iterated.
for record in businesses:
    unwanted = [field for field in record if field not in keys_to_keep]
    for field in unwanted:
        del record[field]

In [None]:
# Persist the trimmed businesses as one JSON object per line.
# Mode "w" truncates an existing file on open, so a single handle both resets
# the file and fills it; no separate truncate-then-append pass is needed.
with open(
    f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.json",
    "w",
    encoding="utf-8",
) as f:
    f.writelines(json.dumps(business) + "\n" for business in businesses)

In [None]:
# Read the cleaned businesses back from disk, parsing one JSON object per line.
with open(f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.json") as cleaned_file:
    businesses_cleaned = list(map(json.loads, cleaned_file))

In [None]:
# Count how often each category occurs across the cleaned businesses.
# "categories" is a ", "-separated string; records where it is missing, null
# or empty contribute nothing.
category_tally = Counter()
for business in businesses_cleaned:
    categories_str = business.get("categories", '')
    if categories_str:
        category_tally.update(categories_str.split(', '))

# (category, count) pairs ordered by descending count; categories with equal
# counts keep the order in which they were first encountered.
category_counts = category_tally.most_common()

# Names of the five most frequent categories.
top_categories = [name for name, count in category_counts[:5]]

# Display the top 5 categories as the cell output.
top_categories

In [None]:
# Keep only the businesses that carry at least one of the top categories.
# `businesses` becomes a dict keyed by business_id, and each kept record gains
# a "type" field holding the first of its own categories (in the record's
# listed order) that appears in top_categories.
businesses = {}
for record in businesses_cleaned:
    raw_categories = record.get("categories", '')
    if not raw_categories:
        continue
    matched = next(
        (name for name in raw_categories.split(', ') if name in top_categories),
        None,
    )
    if matched is None:
        continue
    record["type"] = matched
    businesses[record["business_id"]] = record

# Ids of the businesses that survived the top-category filter.
business_ids = list(businesses)

In [None]:
# Overwrite businesses.json with only the businesses kept by the top-category
# filter, one JSON object per line.
# The path uses the same "sample/" prefix form as the other cells, so no
# doubled slash is produced when SAMPLE is False. Mode "w" truncates the old
# file on open, so one handle is enough.
with open(
    f"data/01_cleaned/{'sample/' if SAMPLE else ''}businesses.json",
    "w",
    encoding="utf-8",
) as f:
    f.writelines(json.dumps(business) + "\n" for business in businesses.values())

## Reviews

In [None]:
# Load the raw review records into the list `reviews`; the whole file is held
# in memory.
# The input is read line by line, one JSON object per line. The encoding is
# pinned to UTF-8 so decoding does not depend on the platform's locale default,
# and blank lines are skipped because json.loads would raise on them.
with open(
    f"data/00_original/{'sample/' if SAMPLE else ''}yelp_academic_dataset_review.json",
    "r",
    encoding="utf-8",
) as f:
    reviews = [json.loads(line) for line in f if line.strip()]

In [None]:
# Fields to retain on each review record; the filtering cell below removes
# every other key.
# NOTE: this assignment replaces the business field list bound to the same
# name earlier in the notebook.
keys_to_keep = [
    "business_id",
    "date",
    "review_id",
    "stars",
    "text",
    "user_id",
]

In [None]:
# Trim every review record, in place, down to the fields listed in
# keys_to_keep. The unwanted keys are collected first so the dict is never
# modified while it is being iterated.
for record in reviews:
    unwanted = [field for field in record if field not in keys_to_keep]
    for field in unwanted:
        del record[field]

In [None]:
# Persist the trimmed reviews as one JSON object per line.
# Mode "w" truncates an existing file on open, so a single handle both resets
# the file and fills it; no separate truncate-then-append pass is needed.
with open(
    f"data/01_cleaned/{'sample/' if SAMPLE else ''}reviews.json",
    "w",
    encoding="utf-8",
) as f:
    f.writelines(json.dumps(review) + "\n" for review in reviews)

## Users

In [None]:
# Load the raw user records into the list `users`.
# The input is read line by line, one JSON object per line. The encoding is
# pinned to UTF-8 so decoding does not depend on the platform's locale default,
# and blank lines are skipped because json.loads would raise on them.
with open(
    f"data/00_original/{'sample/' if SAMPLE else ''}yelp_academic_dataset_user.json",
    "r",
    encoding="utf-8",
) as f:
    users = [json.loads(line) for line in f if line.strip()]

In [None]:
# Fields to retain on each user record; the filtering cell below removes
# every other key.
# NOTE: this assignment replaces the review field list bound to the same
# name earlier in the notebook.
keys_to_keep = [
    "average_stars",
    "friends",
    "name",
    "review_count",
    "user_id",
]

In [None]:
# Trim every user record, in place, down to the fields listed in
# keys_to_keep. The unwanted keys are collected first so the dict is never
# modified while it is being iterated.
for record in users:
    unwanted = [field for field in record if field not in keys_to_keep]
    for field in unwanted:
        del record[field]

Now removing users that don't have friends

In [None]:
# Convert each user's "friends" field from a ", "-separated string to a list.
# The isinstance guard keeps the cell re-runnable: on a second run the field is
# already a list, and calling .split on it would raise AttributeError.
# Empty fragments are dropped, so an empty string becomes [] rather than [""].
for user in users:
    friends = user["friends"]
    if isinstance(friends, str):
        user["friends"] = [name for name in friends.split(", ") if name]

In [None]:
# Remove users that don't have friends.
# str.split with an explicit separator never returns an empty list
# ("" -> [""]), so a length check alone cannot detect "no friends". Instead,
# keep a user only if at least one entry is neither empty nor the literal
# string "None" (the value this notebook treats as "no friends").
users = [
    user
    for user in users
    if any(friend and friend != "None" for friend in user["friends"])
]

In [None]:
# Persist the remaining users as one JSON object per line.
# Mode "w" truncates an existing file on open, so a single handle both resets
# the file and fills it; no separate truncate-then-append pass is needed.
with open(
    f"data/01_cleaned/{'sample/' if SAMPLE else ''}users.json",
    "w",
    encoding="utf-8",
) as f:
    f.writelines(json.dumps(user) + "\n" for user in users)