# Preprocessing

In [1]:
import pandas as pd
import json
import numpy as np

# Function to load JSON file into Pandas DataFrame
def load_json(filename):
    """Loads a JSON file into a pandas DataFrame"""
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

# Load Yelp dataset
# business_df = load_json(r"C:\BIANCONERI\Master's AI SJSU\5- Advanced Data Mining\Final Project\Yelp-JSON\Yelp JSON\yelp_dataset\yelp_academic_dataset_business.json")
# review_df = load_json(r"C:\BIANCONERI\Master's AI SJSU\5- Advanced Data Mining\Final Project\Yelp-JSON\Yelp JSON\yelp_dataset\yelp_academic_dataset_review.json")
# user_df = load_json(r"C:\BIANCONERI\Master's AI SJSU\5- Advanced Data Mining\Final Project\Yelp-JSON\Yelp JSON\yelp_dataset\yelp_academic_dataset_user.json")
business_df = load_json(r"/home/craig/classwork_2025/CMPE256/FINAL_PROJECT/Yelp-JSON/YelpJSON/yelp_dataset/yelp_academic_dataset_business.json")
review_df = load_json(r"/home/craig/classwork_2025/CMPE256/FINAL_PROJECT/Yelp-JSON/YelpJSON/yelp_dataset/yelp_academic_dataset_review.json")
user_df = load_json(r"/home/craig/classwork_2025/CMPE256/FINAL_PROJECT/Yelp-JSON/YelpJSON/yelp_dataset/yelp_academic_dataset_user.json")
print(f"business_df {business_df.shape}")
print(f"review_df {review_df.shape}")
print(f"user_df {user_df.shape}")

# Filter only OPEN businesses in CALIFORNIA W/ 'categories' column exists and is not null
business_df = business_df[(business_df['state'] == 'CA') & (business_df['is_open'] == 1) & (business_df['categories'].notna())]
print(f"business_df {business_df.shape}")
print(business_df.head())

business_df (150346, 14)
review_df (6990280, 9)
user_df (1987897, 22)
business_df (4064, 14)
                business_id                             name  \
26   noByYNtDLQAra9ccqxdfDw                              H&M   
85   IDtLPgUrqorrpqSLdfMhZQ             Helena Avenue Bakery   
91   nUqrF-h9S7myCcvNDecOvw             Iron Horse Auto Body   
120  bYjnX_J1bHZob10DoSFkqQ      Tinkle Belle Diaper Service   
141  SZU9c8V2GuREDN5KgyHFJw  Santa Barbara Shellfish Company   

                   address           city state postal_code   latitude  \
26        827-833 State St  Santa Barbara    CA       93101  34.420209   
85   131 Anacapa St, Ste C  Santa Barbara    CA       93101  34.414445   
91          825 Cacique St  Santa Barbara    CA       93103  34.419620   
120                         Santa Barbara    CA       93101  34.420334   
141      230 Stearns Wharf  Santa Barbara    CA       93101  34.408715   

      longitude  stars  review_count  is_open  \
26  -119.700460    3.0      

In [2]:
# Extract and print all unique categories from business_df
unique_categories = set()
business_df['categories'].apply(lambda x: unique_categories.update(x.split(', ')))
print(f"Number of unique categories: {len(unique_categories)}")
print(unique_categories)

Number of unique categories: 946
{'Shopping', 'Tax Services', 'Zoos', 'Surf Schools', 'Diners', 'Officiants', 'Drama Schools', 'Employment Law', 'Rheumatologists', 'Refinishing Services', 'Pets', 'Day Camps', 'Professional Services', 'Skilled Nursing', 'Trusts', 'Gift Shops', 'Allergists', 'Balloon Services', 'Adult Education', 'Homeowner Association', 'Health Coach', 'American (Traditional)', 'Pain Management', 'Tennis', 'Furniture Reupholstery', 'Travel Services', 'Utilities', 'Bubble Tea', 'Gymnastics', 'Emergency Pet Hospital', 'Bed & Breakfast', 'Parenting Classes', 'Art Galleries', 'Car Dealers', 'Trains', 'Mailbox Centers', 'Cinema', 'Brewpubs', 'Gastropubs', 'Cooking Classes', 'Auto Parts & Supplies', 'Art Museums', 'Mobile Dent Repair', 'Pet Sitting', 'Photo Booth Rentals', 'Knife Sharpening', 'Septic Services', 'Screen Printing', 'Tabletop Games', 'Mags', 'Laser Eye Surgery/Lasik', 'Community Service/Non-Profit', 'Web Design', 'Vacation Rental Agents', 'Mobile Phone Accessori

In [3]:
# Convert categories to lowercase for consistency
business_df['categories'] = business_df['categories'].apply(lambda x: [category.lower() for category in x.split(', ')])

In [4]:
# Define restaurant-related keywords
restaurant_keywords = ["bars","donuts","barbeque", "sandwiches","wineries","fish & chips","vegetarian", "beer", "food", "dessert", "gelato", "restaurants", "wine", "tacos", "tea", "acai bowls", "whiskey", "juice bars & smoothies", "poke", "spirits", "cocktail", "salad", "coffee", "bakeries", "breweries", "pizza", "burgers", "soup", "bagels", "ice cream & frozen yogurt", "ramen", "chicken wings", "food trucks", "cafes", "seafood", "vegan", "diners", "noodles"]

# Filter RESTAURANTS ONLY (businesses with restaurant-related keywords)
restaurants_df = business_df[business_df['categories'].apply(lambda x: any(keyword in x for keyword in restaurant_keywords))]

print(f"restaurants_df {restaurants_df.shape}")

# Select relevant columns // we may get rid of address related columns if we won't use them
restaurants_df = restaurants_df[['business_id', 'name', 'address', 'city', 'postal_code', 'latitude', 'longitude', 'categories', 'stars', 'review_count']]

print(f"restaurants_df {restaurants_df.shape}")
# print(restaurants_df.head())
restaurants_df


# Drop rows with missing values // there is none
#df_clean = restaurants_df.dropna()
#print(f"df_clean {df_clean.shape}")

restaurants_df (1015, 14)
restaurants_df (1015, 10)


Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,categories,stars,review_count
85,IDtLPgUrqorrpqSLdfMhZQ,Helena Avenue Bakery,"131 Anacapa St, Ste C",Santa Barbara,93101,34.414445,-119.690672,"[food, restaurants, salad, coffee & tea, break...",4.0,389
141,SZU9c8V2GuREDN5KgyHFJw,Santa Barbara Shellfish Company,230 Stearns Wharf,Santa Barbara,93101,34.408715,-119.685019,"[live/raw food, restaurants, seafood, beer bar...",4.0,2404
431,ifjluUv4VASwmFqEp8cWlQ,Marty's Pizza,2733 De La Vina St,Santa Barbara,93105,34.436236,-119.726147,"[pizza, restaurants]",4.0,64
470,VeFfrEZ4iWaecrQg6Eq4cg,Cal Taco,"7320 Hollister Ave, Ste 1",Goleta,93117,34.430542,-119.882367,"[burgers, cafes, restaurants, mexican, america...",4.0,189
555,bdfZdB2MTXlT6-RBjSIpQg,Pho Bistro,903 Embarcadero Del Norte,Isla Vista,93117,34.412934,-119.855531,"[food, restaurants, chinese, bubble tea, vietn...",3.0,184
...,...,...,...,...,...,...,...,...,...,...
150214,2xxkaRy7rP5EUyjFt2J5kA,Nikka Ramen,5701 Calle Real,Goleta,93117,34.440885,-119.823523,"[restaurants, japanese, ramen]",4.5,562
150215,8IV78gQwTH-eZgbfFS_plg,Lost & Found Cafe,919 Linden Ave,Carpinteria,93013,34.398568,-119.519034,"[home decor, home & garden, coffee & tea, food...",5.0,5
150230,IRBhPAC4ZoDpXazpoB3epQ,Good Stuff Baked Treats,,Santa Barbara,93101,34.420334,-119.710749,"[food, food delivery services, bakeries, desse...",5.0,9
150264,Bo-GGwQNcLVwyKeYkbfAYg,Vino Divino,2012 De La Vina St,Santa Barbara,93105,34.429154,-119.717222,"[event planning & services, wine tasting room,...",5.0,19


In [5]:
review_df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5.0,1,2,1,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20
6990276,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5.0,2,1,2,"This spot offers a great, affordable east week...",2021-03-31 16:55:10
6990277,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4.0,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
6990278,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5.0,1,0,0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27


In [6]:
# Select relevant columns from review_df // I dropped review_id, date, useful, funny, cool
review_df = review_df[['user_id','business_id','stars','text', 'date']]
print(review_df.head())

# Merge reviews with restaurant using business_id as a key
merged_df = review_df.merge(restaurants_df[['business_id', 'name', 'categories']], on='business_id')
print(merged_df.head())

                  user_id             business_id  stars  \
0  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw    3.0   
1  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ    5.0   
2  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A    3.0   
3  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA    5.0   
4  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ    4.0   

                                                text                 date  
0  If you decide to eat here, just be aware it is...  2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year...  2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm...  2014-02-05 20:30:30  
3  Wow!  Yummy, different,  delicious.   Our favo...  2015-01-04 00:01:03  
4  Cute interior and owner (?) gave us tour of up...  2017-01-14 20:54:15  
                  user_id             business_id  stars  \
0  59MxRhNVhU9MYndMkz0wtw  gebiRewfieSdtt17PTW6Zg    3.0   
1  OhECKhQEexFypOMY6kypRw  vC2qm1y3Au5czBtbhc-DNw    4.0   
2  

In [7]:
# Fix category column if it's a list -> string
merged_df['categories'] = merged_df['categories'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
print(merged_df.head())

                  user_id             business_id  stars  \
0  59MxRhNVhU9MYndMkz0wtw  gebiRewfieSdtt17PTW6Zg    3.0   
1  OhECKhQEexFypOMY6kypRw  vC2qm1y3Au5czBtbhc-DNw    4.0   
2  4hBhtCSgoxkrFgHa4YAD-w  bbEXAEFr4RYHLlZ-HFssTA    5.0   
3  bFPdtzu11Oi0f92EAcjqmg  IDtLPgUrqorrpqSLdfMhZQ    5.0   
4  JYYYKt6TdVA4ng9lLcXt_g  SZU9c8V2GuREDN5KgyHFJw    5.0   

                                                text                 date  \
0  Had a party of 6 here for hibachi. Our waitres...  2016-07-25 07:31:06   
1  Yes, this is the only sushi place in town. How...  2013-09-04 03:48:20   
2  Great burgers,fries and salad!  Burgers have a...  2017-01-02 03:17:34   
3  What a great addition to the Funk Zone!  Grab ...  2016-10-13 22:50:47   
4  We were a bit weary about trying the Shellfish...  2016-05-31 02:14:54   

                              name  \
0  Hibachi Steak House & Sushi Bar   
1                       Sushi Teri   
2  The Original Habit Burger Grill   
3             Helena Ave

# Feature Extraction & Similarity Computation

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature Extraction for Content-Based Filtering: (TF-IDF) on categories and reviews
tfidf_category = TfidfVectorizer(stop_words='english')
tfidf_review = TfidfVectorizer(stop_words='english')
category_matrix = tfidf_category.fit_transform(merged_df['categories'])
review_matrix = tfidf_review.fit_transform(merged_df['text'])

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrices
category_similarity = cosine_similarity(category_matrix)
review_similarity = cosine_similarity(review_matrix)

# Combine the two similarity matrices into a total similarity matrix by taking their average: 
# Equal weight to both categories and reviews when computing the overall similarity between restaurants.
total_similarity = (category_similarity + review_similarity) / 2

MemoryError: Unable to allocate 242. GiB for an array with shape (32478999457,) and data type int64

In [None]:
from sklearn.neighbors import NearestNeighbors

# Switched to use NearestNeighbors for memory-efficient similarity computation
category_nn = NearestNeighbors(metric='cosine').fit(category_matrix)
review_nn = NearestNeighbors(metric='cosine').fit(review_matrix)

# Collaborative Filtering

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Split the data into train nd test sets (80,20 split)
train_data, test_data = train_test_split(merged_df, test_size=0.2, random_state=42)

# User-Restaurant interaction matrix for Collaborative Filtering using 'stars'
user_restaurant_matrix = train_data.pivot_table(index='user_id', columns='business_id', values='stars', fill_value=0)
print(user_restaurant_matrix.head())

# Calculate the total number of elements in the matrix
total_elements = user_restaurant_matrix.size
# Calculate the number of non-zero elements
non_zero_elements = (user_restaurant_matrix != 0).sum().sum()
# Calculate density
density = non_zero_elements / total_elements
# Calculate sparsity
sparsity = 1 - density
print(f"Total Elements: {total_elements}")
print(f"Non-Zero Elements: {non_zero_elements}")
print(f"Density: {density:.4f}")
print(f"Sparsity: {sparsity:.4f}")

# Compute user similarity matrix
user_similarity = cosine_similarity(user_restaurant_matrix)
print(user_similarity)

In [None]:
# Filter users with at least 5 ratings to save on memory
user_counts = train_data['user_id'].value_counts()
filtered_df = train_data[train_data['user_id'].isin(user_counts[user_counts >= 5].index)]

# Filter restaurants with at least 5 ratings
item_counts = filtered_df['business_id'].value_counts()
filtered_df = filtered_df[filtered_df['business_id'].isin(item_counts[item_counts >= 5].index)]

# Recompute User-Restaurant interaction matrix using filtered_df
user_restaurant_matrix = filtered_df.pivot_table(index='user_id', columns='business_id', values='stars', fill_value=0)
print(user_restaurant_matrix.head())

# Calculate the total number of elements in the matrix
total_elements = user_restaurant_matrix.size
# Calculate the number of non-zero elements
non_zero_elements = (user_restaurant_matrix != 0).sum().sum()
# Calculate density
density = non_zero_elements / total_elements
# Calculate sparsity
sparsity = 1 - density
print(f"Total Elements: {total_elements}")
print(f"Non-Zero Elements: {non_zero_elements}")
print(f"Density: {density:.4f}")
print(f"Sparsity: {sparsity:.4f}")

# Recompute user similarity matrix
user_similarity = cosine_similarity(user_restaurant_matrix)
print(user_similarity)

In [None]:
def get_cf_recommendations(user_id, user_similarity, user_restaurant_matrix, merged_df, top_n=5):
    """
    Generate restaurant recommendations for a user using collaborative filtering.
    
    Args:
        user_id (str): The ID of the user for whom recommendations are generated.
        user_similarity (numpy array): User similarity matrix.
        user_restaurant_matrix (DataFrame): User-restaurant interaction matrix.
        merged_df (DataFrame): The original DataFrame containing restaurant names.
        top_n (int): Number of recommendations to return.
    
    Returns:
        list: List of recommended restaurant IDs.
    """
    # Get the index of the user in the user-item matrix
    user_index = user_restaurant_matrix.index.get_loc(user_id)
    
    # Get the similarity scores for the user
    user_sim_scores = user_similarity[user_index]
    
    # Get the indices of the most similar users (excluding the user themselves)
    similar_users_indices = user_sim_scores.argsort()[::-1][1:top_n + 1]
    
    # Get the restaurants rated by similar users
    recommended_restaurants = set()
    for sim_user_index in similar_users_indices:
        sim_user_id = user_restaurant_matrix.index[sim_user_index]
        # Get restaurants rated highly by the similar user
        sim_user_ratings = user_restaurant_matrix.loc[sim_user_id]
        highly_rated_restaurants = sim_user_ratings[sim_user_ratings >= 4].index  # Restaurants rated 4 or 5 stars ONLY
        recommended_restaurants.update(highly_rated_restaurants)
    
    # Exclude restaurants already rated by the user
    user_rated_restaurants = user_restaurant_matrix.loc[user_id][user_restaurant_matrix.loc[user_id] > 0].index
    recommended_restaurants = recommended_restaurants - set(user_rated_restaurants)
    
    # Get the names & categories of the recommended restaurants
    recommended_restaurant_names = merged_df[merged_df['business_id'].isin(recommended_restaurants)]['name'].unique()
    print(list(recommended_restaurant_names)[:top_n])
    recommended_restaurant_categories = merged_df[merged_df['business_id'].isin(recommended_restaurants)]['categories'].unique()
    print(list(recommended_restaurant_categories)[:top_n])

    return list(recommended_restaurants)[:top_n]

In [None]:
# Example: Get CF recommendations for a specific user
user_id = '-0-TtVhV4PIUoDpUCOC0uQ'
recommendations = get_cf_recommendations(user_id, user_similarity, user_restaurant_matrix, merged_df, top_n=5)
print("Recommended Restaurants:", recommendations)

## experiments other similarities

In [27]:
# CF filtering (cosine)

# Filter users with at least 5 ratings to save on memory
user_counts = train_data['user_id'].value_counts()
filtered_df = train_data[train_data['user_id'].isin(user_counts[user_counts >= 5].index)]

# Filter restaurants with at least 5 ratings
item_counts = filtered_df['business_id'].value_counts()
filtered_df = filtered_df[filtered_df['business_id'].isin(item_counts[item_counts >= 5].index)]

# Recompute User-Restaurant interaction matrix using filtered_df
train_user_restaurant_matrix = filtered_df.pivot_table(index='user_id', columns='business_id', values='stars', fill_value=0)

# Fill missing values with 0 (indicating no rating for that business)
train_user_restaurant_matrix = train_user_restaurant_matrix.fillna(0)


In [28]:
from sklearn.metrics.pairwise import cosine_similarity

train_user_similarity = cosine_similarity(train_user_restaurant_matrix)

train_user_similarity_df = pd.DataFrame(train_user_similarity, index=train_user_restaurant_matrix.index, columns=train_user_restaurant_matrix.index)


In [31]:
def recommend_businesses(user_id, user_similarity_df, user_item_matrix, num_recommendations=5):
    if user_id not in user_similarity_df.index:
        print("not enough info for user")
        return user_item_matrix.mean(axis=0).sort_values(ascending=False).head(num_recommendations).index
    
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:num_recommendations+1]
    
    similar_users_ratings = user_item_matrix.loc[similar_users]
    
    #should normalize the ratings first?
    recommended_businesses = similar_users_ratings.sum(axis=0).sort_values(ascending=False)
    
    recommended_businesses = recommended_businesses[user_item_matrix.loc[user_id] == 0].head(num_recommendations)
    
    return recommended_businesses.index

# Test on specific user
recommended_restaurants = recommend_businesses(user_id='-0-TtVhV4PIUoDpUCOC0uQ', user_similarity_df=train_user_similarity_df, user_item_matrix=train_user_restaurant_matrix)

names = merged_df[merged_df['business_id'].isin(recommended_restaurants)]['name'].unique()
print(names)

['Poke Theory SB' "Woodstock's Pizza Isla Vista" 'Los Agaves'
 'TAP Thai Cuisine' "Andersen's Danish Bakery & Restaurant"]


In [13]:
## content based filtering (cosine)

# Preprocess categories column (split categories into separate columns)

business_df['categories'] = business_df['categories'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')

from sklearn.feature_extraction.text import TfidfVectorizer

business_df.reset_index(inplace=True)

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(business_df['categories'])

In [14]:
# Calculate business similarity based on their categories
business_similarity = cosine_similarity(tfidf_matrix)

# Convert to DataFrame for easier interpretation
# business_similarity_df = pd.DataFrame(business_similarity, index=business_df['business_id'], columns=business_df['business_id'])

In [34]:
# I don't think this is working

def recommend_businesses_content(user_id, business_similarity_df, user_item_matrix, num_recommendations=5):
    rated_businesses = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] >= 4].index

    similar_scores = business_similarity_df.loc[rated_businesses].mean(axis=0)  # Mean similarity score across rated businesses

    similar_scores = similar_scores.sort_values(ascending=False)

    recommended_businesses = similar_scores[~similar_scores.index.isin(rated_businesses)]

    recommended_businesses = recommended_businesses.head(num_recommendations)
    
    return recommended_businesses.index

recommended_restaurants = recommend_businesses_content(user_id='-0-TtVhV4PIUoDpUCOC0uQ', business_similarity_df=business_similarity_df, user_item_matrix=user_restaurant_matrix)
names = merged_df[merged_df['business_id'].isin(recommended_restaurants)]['name'].unique()
print(names)

['KFC' 'Your Choice' "Kyle's Kitchen" 'Hochaya']


In [15]:
def recommend_businesses(business_id, cosine_sim, num_recommendations=5):
    idx = business_df[business_df['business_id'] == business_id].index[0]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = sim_scores[1:num_recommendations+1]
    
    business_indices = [i[0] for i in sim_scores]
    
    return business_df.iloc[business_indices][['business_id', 'name', 'categories']]

recommend_businesses(business_id='P_QAD4_iMgqTaWd5On_n3g', cosine_sim=business_similarity)
# names = merged_df[merged_df['business_id'].isin(recommended_restaurants)]['name'].unique()
# print(names)

Unnamed: 0,business_id,name,categories
3125,bPVPZeqMuYtU_yGCb4P-jg,Empty Bowl Gourmet Noodle Bar,"comfort food, asian fusion, chinese, tapas/sma..."
359,I0jFTYVDs-WHwKR2XHQlRA,Zen Yai Thai Cuisine,"thai, restaurants, american (new), pop-up rest..."
3201,Xw_5QkLTyIEg6Hs-kXGNbg,Galanga Thai Restaurant,"soup, thai, restaurants, noodles"
684,cZ6sHHLv7VSmuFSxP6sQLg,Lao Wang,"noodles, restaurants, gluten-free, creperies, ..."
16,18eWJFJbXyR9j_5xfcRLYA,Siam Elephant,"restaurants, thai"


# Experiments metrics for CF

In [23]:
# process train df
# Filter users with at least 5 ratings to save on memory
user_counts = test_data['user_id'].value_counts()
test_filtered_df = test_data[test_data['user_id'].isin(user_counts[user_counts >= 5].index)]

# Filter restaurants with at least 5 ratings
item_counts = test_filtered_df['business_id'].value_counts()
test_filtered_df = test_filtered_df[test_filtered_df['business_id'].isin(item_counts[item_counts >= 5].index)]

# Recompute User-Restaurant interaction matrix using filtered_df
test_user_restaurant_matrix = test_filtered_df.pivot_table(index='user_id', columns='business_id', values='stars', fill_value=0)

# Fill missing values with 0 (indicating no rating for that business)
test_user_restaurant_matrix = test_user_restaurant_matrix.fillna(0)

In [25]:
test_user_similarity = cosine_similarity(test_user_restaurant_matrix)

test_user_similarity_df = pd.DataFrame(test_user_similarity, index=test_user_restaurant_matrix.index, columns=test_user_restaurant_matrix.index)


In [40]:
recommendations = {}
for user_id in test_user_restaurant_matrix.index:
    recs = recommend_businesses(user_id=user_id, user_similarity_df=train_user_similarity_df, user_item_matrix=train_user_restaurant_matrix)
    recommendations[user_id] = recs

not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user


In [42]:
recommendations

{'-1WbN1Qd-opw8u3uEqs2Kg': Index(['cnG65HES0L_jCgbrWKtnVg', 'EPkF-beh6gg_s3HtlAdfiw',
        '5Z7HXqp22lSiN0Q8iF94hQ', '7iQ93RWLLYOa7QRMcHgEtA',
        'WhTRPUduXy3mRBYlmSmsKw'],
       dtype='object', name='business_id'),
 '-7Eh_8y1ihj3nNtdIetiRA': Index(['qUH2FppY7RL96F5tXzA0hQ', 'bA0vBB4ztXjzM4SVA8balQ',
        'O3jPfTDRn5_1O3ZM_LuxQw', '1oGCzoCC6HnQWpdK9cCP0g',
        'AGtZA_0tR0erViKGTCGtQg'],
       dtype='object', name='business_id'),
 '-B-QEUESGWHPE_889WJaeg': Index(['bXrQejeR66IwebuhxPS7GA', 'qM2gfO-cqpDzxmIX-XzuWA',
        'edJoBsse6nsF0BYh6pATAg', 'uZYBF0YTU_iKq4_L_PDtMQ',
        'iBTc6XKgW5HESEZIii-G4g'],
       dtype='object', name='business_id'),
 '-BB9oz-WY-H2oBazb4Ltzw': Index(['Rl42JbSMsmNW3LRjsTMYAg', 'LlGIlNJE2Nv_PXkH7l4Wmg',
        'CYwC5OjwCN6ib_AQDxi3Ow', '5Z7HXqp22lSiN0Q8iF94hQ',
        'edJoBsse6nsF0BYh6pATAg'],
       dtype='object', name='business_id'),
 '-Oqfoc4KyhzYY0qvSkxkrg': Index(['BttSAciQNIQhgJCv3-FYpw', '5vObJIHLO-QOMrw_MvC4rA',
        'wJiaL

In [43]:
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_metrics(recommended, actual):
    # Convert to sets for comparison
    recommended_set = set(recommended)
    actual_set = set(actual)

    # Precision: How many of the recommended businesses are relevant?
    true_positives = len(recommended_set & actual_set)
    precision = true_positives / len(recommended_set) if len(recommended_set) > 0 else 0

    # Recall: How many relevant businesses were recommended?
    recall = true_positives / len(actual_set) if len(actual_set) > 0 else 0

    # F1 Score: Harmonic mean of precision and recall
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

# Function to get actual relevant businesses for a user in the test set (rating >= 4)
def get_relevant_businesses(user_id, test_user_item_matrix, threshold=4):
    return test_user_item_matrix.loc[user_id][test_user_item_matrix.loc[user_id] >= threshold].index

# Evaluate precision, recall, and F1 score for each user
precision_list = []
recall_list = []
f1_list = []

for user_id, recs in recommendations.items():
    relevant_businesses = get_relevant_businesses(user_id, test_user_restaurant_matrix)
    
    precision, recall, f1 = calculate_metrics(recs, relevant_businesses)
    
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Calculate average precision, recall, and F1 score
average_precision = np.mean(precision_list)
average_recall = np.mean(recall_list)
average_f1 = np.mean(f1_list)

print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")

Average Precision: 0.0375
Average Recall: 0.0338
Average F1 Score: 0.0323


# metrics for CF using spearman instead

In [50]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

# train_user_similarity = cosine_similarity(train_user_restaurant_matrix)

# train_user_similarity_df = pd.DataFrame(train_user_similarity, index=train_user_restaurant_matrix.index, columns=train_user_restaurant_matrix.index)

#  between users
train_spearman_corr_matrix = train_user_restaurant_matrix.corr(method='spearman')
train_user_similarity_df = pd.DataFrame(train_spearman_corr_matrix, index=train_user_restaurant_matrix.index, columns=train_user_restaurant_matrix.index)

# between restaurants
train_spearman_corr_restaurants = train_user_restaurant_matrix.T.corr(method='spearman')

In [51]:
#  between users
test_spearman_corr_matrix = train_user_restaurant_matrix.corr(method='spearman')
test_user_similarity_df = pd.DataFrame(test_spearman_corr_matrix, index=train_user_restaurant_matrix.index, columns=train_user_restaurant_matrix.index)

# between restaurants
test_spearman_corr_restaurants = train_user_restaurant_matrix.T.corr(method='spearman')

In [52]:
recommendations = {}
for user_id in test_user_restaurant_matrix.index:
    recs = recommend_businesses(user_id=user_id, user_similarity_df=train_user_similarity_df, user_item_matrix=train_user_restaurant_matrix)
    recommendations[user_id] = recs

not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user


In [53]:
# Evaluate precision, recall, and F1 score for each user
precision_list = []
recall_list = []
f1_list = []

for user_id, recs in recommendations.items():
    relevant_businesses = get_relevant_businesses(user_id, test_user_restaurant_matrix)
    
    precision, recall, f1 = calculate_metrics(recs, relevant_businesses)
    
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Calculate average precision, recall, and F1 score
average_precision = np.mean(precision_list)
average_recall = np.mean(recall_list)
average_f1 = np.mean(f1_list)

print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")

Average Precision: 0.0116
Average Recall: 0.0146
Average F1 Score: 0.0119


## Using pearson instead

In [62]:
train_pearson_corr_matrix = train_user_restaurant_matrix.corr(method='pearson')
train_user_similarity_df = pd.DataFrame(train_pearson_corr_matrix, index=train_user_restaurant_matrix.index, columns=train_user_restaurant_matrix.index)

# between restaurants
# train_spearman_corr_restaurants = train_user_restaurant_matrix.T.corr(method='pearson')

In [63]:
#  between users
test_pearson_corr_matrix = train_user_restaurant_matrix.corr(method='pearson')
test_user_similarity_df = pd.DataFrame(test_pearson_corr_matrix, index=train_user_restaurant_matrix.index, columns=train_user_restaurant_matrix.index)

In [64]:
recommendations = {}
for user_id in test_user_restaurant_matrix.index:
    recs = recommend_businesses(user_id=user_id, user_similarity_df=train_user_similarity_df, user_item_matrix=train_user_restaurant_matrix)
    recommendations[user_id] = recs

not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user
not enough info for user


In [65]:
# Evaluate precision, recall, and F1 score for each user
precision_list = []
recall_list = []
f1_list = []

for user_id, recs in recommendations.items():
    relevant_businesses = get_relevant_businesses(user_id, test_user_restaurant_matrix)
    
    precision, recall, f1 = calculate_metrics(recs, relevant_businesses)
    
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Calculate average precision, recall, and F1 score
average_precision = np.mean(precision_list)
average_recall = np.mean(recall_list)
average_f1 = np.mean(f1_list)

print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")

Average Precision: 0.0116
Average Recall: 0.0146
Average F1 Score: 0.0119


In [None]:
# results are exactly the same as spearman... is the correlation matrix wrong? or is it something else.