# Preprocessing

In [2]:
import pandas as pd
import json
import numpy as np

# Function to load JSON file into Pandas DataFrame
def load_json(filename):
    """Loads a JSON file into a pandas DataFrame"""
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

# Load Yelp dataset
business_df = load_json(r"C:\BIANCONERI\Master's AI SJSU\5- Advanced Data Mining\Final Project\Yelp-JSON\Yelp JSON\yelp_dataset\yelp_academic_dataset_business.json")
review_df = load_json(r"C:\BIANCONERI\Master's AI SJSU\5- Advanced Data Mining\Final Project\Yelp-JSON\Yelp JSON\yelp_dataset\yelp_academic_dataset_review.json")
user_df = load_json(r"C:\BIANCONERI\Master's AI SJSU\5- Advanced Data Mining\Final Project\Yelp-JSON\Yelp JSON\yelp_dataset\yelp_academic_dataset_user.json")
print(f"business_df {business_df.shape}")
print(f"review_df {review_df.shape}")
print(f"user_df {user_df.shape}")

# Filter only OPEN businesses in CALIFORNIA W/ 'categories' column exists and is not null
business_df = business_df[(business_df['state'] == 'CA') & (business_df['is_open'] == 1) & (business_df['categories'].notna())]
print(f"business_df {business_df.shape}")
print(business_df.head())

business_df (150346, 14)
review_df (6990280, 9)
user_df (1987897, 22)
business_df (4064, 14)
                business_id                             name  \
26   noByYNtDLQAra9ccqxdfDw                              H&M   
85   IDtLPgUrqorrpqSLdfMhZQ             Helena Avenue Bakery   
91   nUqrF-h9S7myCcvNDecOvw             Iron Horse Auto Body   
120  bYjnX_J1bHZob10DoSFkqQ      Tinkle Belle Diaper Service   
141  SZU9c8V2GuREDN5KgyHFJw  Santa Barbara Shellfish Company   

                   address           city state postal_code   latitude  \
26        827-833 State St  Santa Barbara    CA       93101  34.420209   
85   131 Anacapa St, Ste C  Santa Barbara    CA       93101  34.414445   
91          825 Cacique St  Santa Barbara    CA       93103  34.419620   
120                         Santa Barbara    CA       93101  34.420334   
141      230 Stearns Wharf  Santa Barbara    CA       93101  34.408715   

      longitude  stars  review_count  is_open  \
26  -119.700460    3.0      

In [3]:
# Extract and print all unique categories from business_df
unique_categories = set()
business_df['categories'].apply(lambda x: unique_categories.update(x.split(', ')))
print(f"Number of unique categories: {len(unique_categories)}")
print(unique_categories)

Number of unique categories: 946
{'Public Transportation', 'Batting Cages', 'Crane Services', 'Massage Schools', 'Cuban', 'Solar Installation', 'Tui Na', 'Greek', 'Tree Services', 'Private Investigation', 'Party Bus Rentals', 'Obstetricians & Gynecologists', 'Orthopedists', 'Patisserie/Cake Shop', 'Beer Gardens', 'Acne Treatment', 'Naturopathic/Holistic', 'General Litigation', 'Banks & Credit Unions', 'Preventive Medicine', 'Saunas', 'Dermatologists', 'International Grocery', 'Firewood', 'Arts & Crafts', 'Montessori Schools', 'Oncologist', 'Ear Nose & Throat', 'Building Supplies', 'Spiritual Shop', 'Auction Houses', 'Mortuary Services', 'Child Care & Day Care', 'Home Network Installation', 'Couriers & Delivery Services', 'Audio/Visual Equipment Rental', 'Party Equipment Rentals', 'Counseling & Mental Health', 'Food Delivery Services', 'Software Development', 'Rafting/Kayaking', 'Wheel & Rim Repair', 'Pool & Hot Tub Service', 'Beer', 'Investing', 'Olive Oil', 'Pet Groomers', 'Utilities'

In [4]:
# Convert categories to lowercase for consistency
business_df['categories'] = business_df['categories'].apply(lambda x: [category.lower() for category in x.split(', ')])

In [5]:
# Define restaurant-related keywords
restaurant_keywords = ["bars","donuts","barbeque", "sandwiches","wineries","fish & chips","vegetarian", "beer", "food", "dessert", "gelato", "restaurants", "wine", "tacos", "tea", "acai bowls", "whiskey", "juice bars & smoothies", "poke", "spirits", "cocktail", "salad", "coffee", "bakeries", "breweries", "pizza", "burgers", "soup", "bagels", "ice cream & frozen yogurt", "ramen", "chicken wings", "food trucks", "cafes", "seafood", "vegan", "diners", "noodles"]

# Filter RESTAURANTS ONLY (businesses with restaurant-related keywords)
restaurants_df = business_df[business_df['categories'].apply(lambda x: any(keyword in x for keyword in restaurant_keywords))]

print(f"restaurants_df {restaurants_df.shape}")

# Select relevant columns // we may get rid of address related columns if we won't use them
restaurants_df = restaurants_df[['business_id', 'name', 'address', 'city', 'postal_code', 'latitude', 'longitude', 'categories', 'stars', 'review_count']]

print(f"restaurants_df {restaurants_df.shape}")
print(restaurants_df.head())

# Drop rows with missing values // there is none
#df_clean = restaurants_df.dropna()
#print(f"df_clean {df_clean.shape}")

restaurants_df (1015, 14)
restaurants_df (1015, 10)
                business_id                             name  \
85   IDtLPgUrqorrpqSLdfMhZQ             Helena Avenue Bakery   
141  SZU9c8V2GuREDN5KgyHFJw  Santa Barbara Shellfish Company   
431  ifjluUv4VASwmFqEp8cWlQ                    Marty's Pizza   
470  VeFfrEZ4iWaecrQg6Eq4cg                         Cal Taco   
555  bdfZdB2MTXlT6-RBjSIpQg                       Pho Bistro   

                       address           city postal_code   latitude  \
85       131 Anacapa St, Ste C  Santa Barbara       93101  34.414445   
141          230 Stearns Wharf  Santa Barbara       93101  34.408715   
431         2733 De La Vina St  Santa Barbara       93105  34.436236   
470  7320 Hollister Ave, Ste 1         Goleta       93117  34.430542   
555  903 Embarcadero Del Norte     Isla Vista       93117  34.412934   

      longitude                                         categories  stars  \
85  -119.690672  [food, restaurants, salad, coffee & 

In [6]:
# Select relevant columns from review_df // I dropped review_id, date, useful, funny, cool
review_df = review_df[['user_id','business_id','stars','text']]
print(review_df.head())

# Merge reviews with restaurant using business_id as a key
merged_df = review_df.merge(restaurants_df[['business_id', 'name', 'categories']], on='business_id')
print(merged_df.head())

                  user_id             business_id  stars  \
0  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw    3.0   
1  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ    5.0   
2  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A    3.0   
3  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA    5.0   
4  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ    4.0   

                                                text  
0  If you decide to eat here, just be aware it is...  
1  I've taken a lot of spin classes over the year...  
2  Family diner. Had the buffet. Eclectic assortm...  
3  Wow!  Yummy, different,  delicious.   Our favo...  
4  Cute interior and owner (?) gave us tour of up...  
                  user_id             business_id  stars  \
0  59MxRhNVhU9MYndMkz0wtw  gebiRewfieSdtt17PTW6Zg    3.0   
1  OhECKhQEexFypOMY6kypRw  vC2qm1y3Au5czBtbhc-DNw    4.0   
2  4hBhtCSgoxkrFgHa4YAD-w  bbEXAEFr4RYHLlZ-HFssTA    5.0   
3  bFPdtzu11Oi0f92EAcjqmg  IDtLPgUrqorrpqSLdfMhZQ    5.0   
4  JYYYKt

In [7]:
# Fix category column if it's a list -> string
merged_df['categories'] = merged_df['categories'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
print(merged_df.head())

                  user_id             business_id  stars  \
0  59MxRhNVhU9MYndMkz0wtw  gebiRewfieSdtt17PTW6Zg    3.0   
1  OhECKhQEexFypOMY6kypRw  vC2qm1y3Au5czBtbhc-DNw    4.0   
2  4hBhtCSgoxkrFgHa4YAD-w  bbEXAEFr4RYHLlZ-HFssTA    5.0   
3  bFPdtzu11Oi0f92EAcjqmg  IDtLPgUrqorrpqSLdfMhZQ    5.0   
4  JYYYKt6TdVA4ng9lLcXt_g  SZU9c8V2GuREDN5KgyHFJw    5.0   

                                                text  \
0  Had a party of 6 here for hibachi. Our waitres...   
1  Yes, this is the only sushi place in town. How...   
2  Great burgers,fries and salad!  Burgers have a...   
3  What a great addition to the Funk Zone!  Grab ...   
4  We were a bit weary about trying the Shellfish...   

                              name  \
0  Hibachi Steak House & Sushi Bar   
1                       Sushi Teri   
2  The Original Habit Burger Grill   
3             Helena Avenue Bakery   
4  Santa Barbara Shellfish Company   

                                          categories  
0     steakhouses,

# Feature Extraction & Similarity Computation

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature Extraction for Content-Based Filtering: (TF-IDF) on categories and reviews
tfidf_category = TfidfVectorizer(stop_words='english')
tfidf_review = TfidfVectorizer(stop_words='english')
category_matrix = tfidf_category.fit_transform(merged_df['categories'])
review_matrix = tfidf_review.fit_transform(merged_df['text'])

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrices
category_similarity = cosine_similarity(category_matrix)
review_similarity = cosine_similarity(review_matrix)

# Combine the two similarity matrices into a total similarity matrix by taking their average: 
# Equal weight to both categories and reviews when computing the overall similarity between restaurants.
total_similarity = (category_similarity + review_similarity) / 2

MemoryError: Unable to allocate 242. GiB for an array with shape (32478999457,) and data type int64

In [10]:
from sklearn.neighbors import NearestNeighbors

# Switched to use NearestNeighbors for memory-efficient similarity computation
category_nn = NearestNeighbors(metric='cosine').fit(category_matrix)
review_nn = NearestNeighbors(metric='cosine').fit(review_matrix)

# Collaborative Filtering

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Split the data into train nd test sets (80,20 split)
train_data, test_data = train_test_split(merged_df, test_size=0.2, random_state=42)

# User-Restaurant interaction matrix for Collaborative Filtering using 'stars'
user_restaurant_matrix = train_data.pivot_table(index='user_id', columns='business_id', values='stars', fill_value=0)
print(user_restaurant_matrix.head())

# Calculate the total number of elements in the matrix
total_elements = user_restaurant_matrix.size
# Calculate the number of non-zero elements
non_zero_elements = (user_restaurant_matrix != 0).sum().sum()
# Calculate density
density = non_zero_elements / total_elements
# Calculate sparsity
sparsity = 1 - density
print(f"Total Elements: {total_elements}")
print(f"Non-Zero Elements: {non_zero_elements}")
print(f"Density: {density:.4f}")
print(f"Sparsity: {sparsity:.4f}")

# Compute user similarity matrix
user_similarity = cosine_similarity(user_restaurant_matrix)
print(user_similarity)

business_id             -3AooxIkg38UyUdlz5oXdw  -6jvfSJGprbfBD2QrS9zQw  \
user_id                                                                  
---zemaUC8WeJeWKqS6p9Q                     0.0                     0.0   
--17Db1K-KujRuN7hY9Z0Q                     0.0                     0.0   
--2F5G5LKt3h2cAXJbZptg                     0.0                     0.0   
--2vR0DIsmQ6WfcSzKWigw                     0.0                     0.0   
--50YzjtBsdxOGVqTkvaKA                     0.0                     0.0   

business_id             -9r8nAzWyRSLxBWt8uQOdA  -ALqLSTzkGDMscHdxA1NgA  \
user_id                                                                  
---zemaUC8WeJeWKqS6p9Q                     0.0                     0.0   
--17Db1K-KujRuN7hY9Z0Q                     0.0                     0.0   
--2F5G5LKt3h2cAXJbZptg                     0.0                     0.0   
--2vR0DIsmQ6WfcSzKWigw                     0.0                     0.0   
--50YzjtBsdxOGVqTkvaKA               

MemoryError: Unable to allocate 54.4 GiB for an array with shape (85483, 85483) and data type float64

In [12]:
# Filter users with at least 5 ratings to save on memory
user_counts = train_data['user_id'].value_counts()
filtered_df = train_data[train_data['user_id'].isin(user_counts[user_counts >= 5].index)]

# Filter restaurants with at least 5 ratings
item_counts = filtered_df['business_id'].value_counts()
filtered_df = filtered_df[filtered_df['business_id'].isin(item_counts[item_counts >= 5].index)]

# Recompute User-Restaurant interaction matrix using filtered_df
user_restaurant_matrix = filtered_df.pivot_table(index='user_id', columns='business_id', values='stars', fill_value=0)
print(user_restaurant_matrix.head())

# Calculate the total number of elements in the matrix
total_elements = user_restaurant_matrix.size
# Calculate the number of non-zero elements
non_zero_elements = (user_restaurant_matrix != 0).sum().sum()
# Calculate density
density = non_zero_elements / total_elements
# Calculate sparsity
sparsity = 1 - density
print(f"Total Elements: {total_elements}")
print(f"Non-Zero Elements: {non_zero_elements}")
print(f"Density: {density:.4f}")
print(f"Sparsity: {sparsity:.4f}")

# Recompute user similarity matrix
user_similarity = cosine_similarity(user_restaurant_matrix)
print(user_similarity)

business_id             -3AooxIkg38UyUdlz5oXdw  -6jvfSJGprbfBD2QrS9zQw  \
user_id                                                                  
-0-TtVhV4PIUoDpUCOC0uQ                     0.0                     0.0   
-0EcgtUXe1rzrkmdih_tYg                     0.0                     0.0   
-1-ECBsGpG4Iw5s-ecnfqw                     0.0                     0.0   
-14MA777BbjUQLw0zndvfA                     0.0                     0.0   
-1WbN1Qd-opw8u3uEqs2Kg                     0.0                     0.0   

business_id             -9r8nAzWyRSLxBWt8uQOdA  -ALqLSTzkGDMscHdxA1NgA  \
user_id                                                                  
-0-TtVhV4PIUoDpUCOC0uQ                     0.0                     0.0   
-0EcgtUXe1rzrkmdih_tYg                     0.0                     0.0   
-1-ECBsGpG4Iw5s-ecnfqw                     0.0                     0.0   
-14MA777BbjUQLw0zndvfA                     0.0                     0.0   
-1WbN1Qd-opw8u3uEqs2Kg               

In [18]:
def get_cf_recommendations(user_id, user_similarity, user_restaurant_matrix, merged_df, top_n=5):
    """
    Generate restaurant recommendations for a user using collaborative filtering.
    
    Args:
        user_id (str): The ID of the user for whom recommendations are generated.
        user_similarity (numpy array): User similarity matrix.
        user_restaurant_matrix (DataFrame): User-restaurant interaction matrix.
        merged_df (DataFrame): The original DataFrame containing restaurant names.
        top_n (int): Number of recommendations to return.
    
    Returns:
        list: List of recommended restaurant IDs.
    """
    # Get the index of the user in the user-item matrix
    user_index = user_restaurant_matrix.index.get_loc(user_id)
    
    # Get the similarity scores for the user
    user_sim_scores = user_similarity[user_index]
    
    # Get the indices of the most similar users (excluding the user themselves)
    similar_users_indices = user_sim_scores.argsort()[::-1][1:top_n + 1]
    
    # Get the restaurants rated by similar users
    recommended_restaurants = set()
    for sim_user_index in similar_users_indices:
        sim_user_id = user_restaurant_matrix.index[sim_user_index]
        # Get restaurants rated highly by the similar user
        sim_user_ratings = user_restaurant_matrix.loc[sim_user_id]
        highly_rated_restaurants = sim_user_ratings[sim_user_ratings >= 4].index  # Restaurants rated 4 or 5 stars ONLY
        recommended_restaurants.update(highly_rated_restaurants)
    
    # Exclude restaurants already rated by the user
    user_rated_restaurants = user_restaurant_matrix.loc[user_id][user_restaurant_matrix.loc[user_id] > 0].index
    recommended_restaurants = recommended_restaurants - set(user_rated_restaurants)
    
    # Get the names & categories of the recommended restaurants
    recommended_restaurant_names = merged_df[merged_df['business_id'].isin(recommended_restaurants)]['name'].unique()
    print(list(recommended_restaurant_names)[:top_n])
    recommended_restaurant_categories = merged_df[merged_df['business_id'].isin(recommended_restaurants)]['categories'].unique()
    print(list(recommended_restaurant_categories)[:top_n])

    return list(recommended_restaurants)[:top_n]

In [19]:
# Example: Get CF recommendations for a specific user
user_id = '-0-TtVhV4PIUoDpUCOC0uQ'
cf_recommendations = get_cf_recommendations(user_id, user_similarity, user_restaurant_matrix, merged_df, top_n=5)
print("Recommended Restaurants:", cf_recommendations)

['The Original Habit Burger Grill', 'Phamous Cafe', 'Poke Theory SB', 'Eureka!', 'Cajé Coffee Roasters - Isla Vista']
['fast food, burgers, restaurants', 'food, vietnamese, restaurants, bubble tea', 'coffee & tea, noodles, restaurants, seafood, japanese, ramen, food, asian fusion, poke', 'restaurants, american (traditional), lounges, nightlife, bars, burgers, american (new)', 'food, internet cafes, coffee & tea, acai bowls, juice bars & smoothies, bagels']
Recommended Restaurants: ['DqVpNtgFCP47n6frh-LREA', 'OzGk7arJv6FPrWqaYSvyKQ', 'NIQWO-Q_F598XlAvZnFVCA', 'LlGIlNJE2Nv_PXkH7l4Wmg', '1pd4VYTww1UWDVai2r2RKg']
