# Preprocessing

In [47]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

# Function to load JSON file into Pandas DataFrame
def load_json(filename):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Path of a UTF-8 encoded file in which every non-blank line is a
        complete JSON document.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON. The message names the file
        and the 1-based line number of the offending record.
    """
    records = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; json.loads would raise on them, so skip them.
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, but say where in the
                # file the bad record is.
                raise json.JSONDecodeError(
                    f"{exc.msg} (record on line {line_number} of {filename})",
                    exc.doc,
                    exc.pos,
                ) from exc
    return pd.DataFrame(records)

# Load Yelp dataset.
# The dataset directory is named once so that relocating the data only
# requires editing this single line.
YELP_DATA_DIR = Path(r"C:\BIANCONERI\Master's AI SJSU\5- Advanced Data Mining\Final Project\Yelp-JSON\Yelp JSON\yelp_dataset")

business_df = load_json(YELP_DATA_DIR / "yelp_academic_dataset_business.json")
review_df = load_json(YELP_DATA_DIR / "yelp_academic_dataset_review.json")
user_df = load_json(YELP_DATA_DIR / "yelp_academic_dataset_user.json")

# Report (rows, columns) for each raw table as a quick sanity check.
print(f"business_df {business_df.shape}")
print(f"review_df {review_df.shape}")
print(f"user_df {user_df.shape}")

# Keep only businesses that are in California, are currently open, and have a
# non-null 'categories' value.
is_california = business_df['state'] == 'CA'
is_open = business_df['is_open'] == 1
has_categories = business_df['categories'].notna()

# .copy() makes the filtered frame independent of the full table, so the later
# assignment to business_df['categories'] does not trigger pandas'
# SettingWithCopyWarning.
business_df = business_df[is_california & is_open & has_categories].copy()
print(f"business_df {business_df.shape}")
print(business_df.head())

business_df (150346, 14)
review_df (6990280, 9)
user_df (1987897, 22)
business_df (4064, 14)
                business_id                             name  \
26   noByYNtDLQAra9ccqxdfDw                              H&M   
85   IDtLPgUrqorrpqSLdfMhZQ             Helena Avenue Bakery   
91   nUqrF-h9S7myCcvNDecOvw             Iron Horse Auto Body   
120  bYjnX_J1bHZob10DoSFkqQ      Tinkle Belle Diaper Service   
141  SZU9c8V2GuREDN5KgyHFJw  Santa Barbara Shellfish Company   

                   address           city state postal_code   latitude  \
26        827-833 State St  Santa Barbara    CA       93101  34.420209   
85   131 Anacapa St, Ste C  Santa Barbara    CA       93101  34.414445   
91          825 Cacique St  Santa Barbara    CA       93103  34.419620   
120                         Santa Barbara    CA       93101  34.420334   
141      230 Stearns Wharf  Santa Barbara    CA       93101  34.408715   

      longitude  stars  review_count  is_open  \
26  -119.700460    3.0      

In [48]:
# Gather every distinct category label found in business_df. Each
# 'categories' value is a single string of labels separated by ', '.
unique_categories = {
    label
    for category_string in business_df['categories']
    for label in category_string.split(', ')
}
print(f"Number of unique categories: {len(unique_categories)}")
print(unique_categories)

Number of unique categories: 946
{'Aquariums', 'Bocce Ball', 'Arts & Entertainment', 'Nurseries & Gardening', 'Indonesian', 'Orthodontists', 'Pet Adoption', 'Ayurveda', 'Guns & Ammo', 'Threading Services', 'Champagne Bars', 'Paint & Sip', 'Health Markets', 'Session Photography', 'Christmas Trees', 'Contractors', 'Music & Video', 'Eyelash Service', 'Marketing', 'Cardiologists', 'Lighting Stores', 'Pet Training', 'Martial Arts', 'Orthopedists', 'Naturopathic/Holistic', 'Donuts', 'Apartments', 'Framing', 'Italian', 'Adult Education', 'Museums', 'Vehicle Wraps', 'Propane', 'Real Estate', 'Nail Technicians', 'Shredding Services', 'Middle Eastern', 'Colleges & Universities', 'Public Art', 'Mortgage Brokers', 'IV Hydration', 'Amusement Parks', 'Himalayan/Nepalese', 'Barbeque', 'Spine Surgeons', 'Lakes', 'Leather Goods', 'Tanning Beds', 'Laser Eye Surgery/Lasik', 'Home Health Care', 'Costumes', 'Vocational & Technical School', 'Pilates', 'Live/Raw Food', 'Karaoke', 'Hospice', 'Sandwiches', 'Ch

In [49]:
# Split each 'categories' string into a list of labels and lowercase every
# label for consistency.
def _normalize_categories(raw):
    """Return the category labels of one business as a list of lowercase strings.

    Accepts either the original ', '-separated string or an already-split
    list, so re-running this cell does not fail on values it has already
    converted.
    """
    labels = raw.split(', ') if isinstance(raw, str) else raw
    return [label.lower() for label in labels]

# .assign builds a new frame rather than writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
business_df = business_df.assign(
    categories=business_df['categories'].apply(_normalize_categories)
)

In [50]:
# Category labels that mark a business as food/drink related
restaurant_keywords = [
    "bars", "donuts", "barbeque", "sandwiches", "wineries", "fish & chips",
    "vegetarian", "beer", "food", "dessert", "gelato", "restaurants", "wine",
    "tacos", "tea", "acai bowls", "whiskey", "juice bars & smoothies", "poke",
    "spirits", "cocktail", "salad", "coffee", "bakeries", "breweries", "pizza",
    "burgers", "soup", "bagels", "ice cream & frozen yogurt", "ramen",
    "chicken wings", "food trucks", "cafes", "seafood", "vegan", "diners",
    "noodles",
]


def _has_restaurant_keyword(categories):
    """Return True if any restaurant keyword is contained in `categories`.

    NOTE(review): when `categories` is a list of labels, `in` is an exact
    element match, so a keyword such as "coffee" does not match a label like
    "coffee & tea" - confirm this is the intended matching rule.
    """
    return any(keyword in categories for keyword in restaurant_keywords)


# Keep RESTAURANTS ONLY: rows whose categories contain at least one keyword
restaurant_mask = business_df['categories'].apply(_has_restaurant_keyword)
restaurants_df = business_df[restaurant_mask]

print(f"restaurants_df {restaurants_df.shape}")

# Columns retained for later steps // the address-related ones may be dropped
# if they turn out to be unused
relevant_columns = [
    'business_id', 'name', 'address', 'city', 'postal_code',
    'latitude', 'longitude', 'categories', 'stars', 'review_count',
]
restaurants_df = restaurants_df[relevant_columns]

print(f"restaurants_df {restaurants_df.shape}")
print(restaurants_df.head())

# Drop rows with missing values // there is none
#df_clean = restaurants_df.dropna()
#print(f"df_clean {df_clean.shape}")

restaurants_df (1015, 14)
restaurants_df (1015, 10)
                business_id                             name  \
85   IDtLPgUrqorrpqSLdfMhZQ             Helena Avenue Bakery   
141  SZU9c8V2GuREDN5KgyHFJw  Santa Barbara Shellfish Company   
431  ifjluUv4VASwmFqEp8cWlQ                    Marty's Pizza   
470  VeFfrEZ4iWaecrQg6Eq4cg                         Cal Taco   
555  bdfZdB2MTXlT6-RBjSIpQg                       Pho Bistro   

                       address           city postal_code   latitude  \
85       131 Anacapa St, Ste C  Santa Barbara       93101  34.414445   
141          230 Stearns Wharf  Santa Barbara       93101  34.408715   
431         2733 De La Vina St  Santa Barbara       93105  34.436236   
470  7320 Hollister Ave, Ste 1         Goleta       93117  34.430542   
555  903 Embarcadero Del Norte     Isla Vista       93117  34.412934   

      longitude                                         categories  stars  \
85  -119.690672  [food, restaurants, salad, coffee & 