In [13]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix, coo_matrix
import tarfile
import pandas as pd
import json
import matplotlib.pyplot as plt


# Mapping user IDs and restaurant IDs to matrix indices (and back) with dictionaries

In [14]:

# Load the user-item rating matrix and key its rows by the 'user_id' column.
all_data = pd.read_csv('../data/UI_matrix_n_5_r.csv').set_index('user_id')

# Row position <-> user id lookups.
#   user_index_to_id[0] returns the first user id
#   user_id_to_index["-1-ECBsGpG4Iw5s-ecnfqw"] would return 0
user_index_to_id = dict(enumerate(all_data.index))
user_id_to_index = {uid: position for position, uid in user_index_to_id.items()}

# Column position <-> business id lookups, built from the column labels.
item_index_to_id = dict(enumerate(all_data.columns))
item_id_to_index = {bid: position for position, bid in item_index_to_id.items()}

# Replace the string ids on both axes with their integer positions.
all_data = all_data.rename(columns=item_id_to_index)
new_index = all_data.index.map(user_id_to_index)
all_data = all_data.set_index(new_index)
all_data.index.name = None

all_data



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1143,1144,1145,1146,1147,1148,1149,1150,1151,1152
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,5.0,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,4.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6504,,,,,,,,,,,...,,,,,,,,,,
6505,,,,,,,,,,,...,,,,,,,,,,
6506,,,,,,,,,,,...,,,,,,,,,,
6507,,,,,,,,,,,...,,,,,,,,,,


# Mapping restaurant names to business IDs and vice versa

In [15]:
# Load the Yelp business metadata: the file holds one JSON object per line.
# Blank lines are skipped so a trailing newline cannot break json.loads.
with open("../data/yelp_dataset/yelp_academic_dataset_business.json", "r", encoding="utf-8") as file:
    data_business = [json.loads(line) for line in file if line.strip()]

df_business = pd.DataFrame(data_business)

user_item_matrix = pd.read_csv('../data/UI_matrix_n_5_r.csv')

# Assumes the first column of user_item_matrix is not a business ID
# (presumably the 'user_id' column), so it is excluded.
business_ids = user_item_matrix.columns[1:]

# Keep only the businesses that appear as columns of the user-item matrix.
filtered_df_business = df_business[df_business['business_id'].isin(business_ids)]

id_to_restaurant = filtered_df_business.set_index('business_id')['name'].to_dict()

# Inverting the mapping keeps a single business id per name: if several
# businesses share a name, only the last one seen survives in restaurant_to_id.
restaurant_to_id = {value: key for key, value in id_to_restaurant.items()}

# Display BOTH sizes (a notebook cell only shows its last expression).
# If the first number is smaller than the second, some names collided above.
len(restaurant_to_id), len(id_to_restaurant)

1153

# Retrain the model on all data using the best hyperparameters


In [16]:
# Prepare the dataset for matrix factorization: unrated (NaN) cells become 0,
# and the dense table is converted to a sparse COO matrix.
all_data = all_data.fillna(0)

all_data_sparse = coo_matrix(all_data.to_numpy())


In [17]:

def matrix_factorization_SGD(R, K, alpha, beta, iterations):
    """Factorize a rating matrix into user and item factors with SGD.

    Parameters:
    R: 2-D rating matrix (anything accepted by scipy's coo_matrix). Only the
       entries stored after conversion to COO format are used for training.
    K (int): Number of latent features per user / item.
    alpha (float): Step size applied to every update.
    beta (float): Weight of the penalty term subtracted in every update.
    iterations (int): Number of shuffled passes over the stored ratings.

    Returns:
    tuple: (P, Q_T) where P has shape (num_users, K) and Q_T is the item
    factor matrix transposed, i.e. shape (K, num_items).

    Note: the function reseeds NumPy's global random generator with 42 on
    every call, so results are repeatable but the global RNG state is reset.
    """
    np.random.seed(42)

    n_users, n_items = R.shape
    user_factors = np.random.rand(n_users, K)
    item_factors = np.random.rand(n_items, K)

    # Materialize the stored ratings once as (row, column, value) triples.
    observed = coo_matrix(R)
    samples = list(zip(observed.row, observed.col, observed.data))

    for _ in range(iterations):
        # Visit the ratings in a new random order on each pass.
        np.random.shuffle(samples)
        for u, i, rating in samples:
            err = rating - np.dot(user_factors[u], item_factors[i])
            user_factors[u] += alpha * (err * item_factors[i] - beta * user_factors[u])
            # The item step deliberately reads the user row updated just above.
            item_factors[i] += alpha * (err * user_factors[u] - beta * item_factors[i])

    return user_factors, item_factors.T


# These hyperparameters were identified as ideal via Bayesian optimization in script 3.
K = 18                        # number of latent features
alpha = 0.005273041694145949  # SGD step size
beta = 0.18189763744227455    # penalty weight used in each update
iterations = 4                # shuffled passes over the stored ratings


# P holds the user factors; Q is returned already transposed (K x num_items).
P, Q = matrix_factorization_SGD(
    all_data_sparse, K=K, alpha=alpha, beta=beta, iterations=iterations
)

# Defining a function to convert restaurant names to business IDs

In [18]:

def get_liked_business_ids(liked_restaurants):
    """Translate restaurant names into business ids.

    Parameters:
    liked_restaurants (iterable of str): Restaurant names to look up in the
        module-level ``restaurant_to_id`` mapping.

    Returns:
    list: Business ids of the names that were found, in input order. Names
    missing from the mapping are reported with a printed message and skipped.
    """
    resolved_ids = []
    for name in liked_restaurants:
        try:
            resolved_ids.append(restaurant_to_id[name])
        except KeyError:
            print(f"Restaurant '{name}' not found in the dataset.")

    return resolved_ids


# Generating a user_ratings vector based on the user's liked restaurants

In [19]:
def get_user_ratings_vector(liked_restaurants):
    """Build a one-row ratings table for a new user.

    Parameters:
    liked_restaurants (iterable of str): Names of restaurants the user likes.

    Returns:
    DataFrame: A single row labelled 'rating' with one column per entry of
    ``user_item_matrix.columns[1:]``. Liked restaurants that resolve to one of
    those columns hold 5; every other column is NaN.
    """
    # Start with every item unrated.
    ratings = pd.Series(np.nan, index=user_item_matrix.columns[1:])

    # Keep only the resolved ids that are actual columns of the matrix.
    known_ids = [
        business_id
        for business_id in get_liked_business_ids(liked_restaurants)
        if business_id in ratings.index
    ]
    if known_ids:
        ratings.loc[known_ids] = 5  # a liked restaurant counts as a 5-star rating

    return ratings.to_frame(name='rating').T



# Generating a latent feature vector for a new user


    Infers the latent features for a new user based on their ratings.

    Parameters:
    liked_restaurants (list of str): Names of restaurants the new user likes. Each one found in the dataset is treated as a rating of 5; all other items stay unrated (NaN).
    item_features (array): The item feature matrix (Q) from matrix factorization.
    learning_rate (float): The learning rate for SGD.
    iterations (int): Number of iterations for the optimization process.

    Returns:
    array: The inferred latent features of the new user.


In [20]:


def infer_new_user_features(liked_restaurants, item_features=None, learning_rate=0.01, iterations=100, regularization=0.02):
    """Infer the latent features of a new user from the restaurants they like.

    Parameters:
    liked_restaurants (iterable of str): Names of restaurants the user likes;
        converted to a ratings row by ``get_user_ratings_vector``.
    item_features (array, optional): Item feature matrix of shape
        (num_features, num_items). When omitted, the module-level ``Q`` is
        looked up at call time, so a retrained model is picked up
        automatically instead of a copy frozen when the function was defined.
    learning_rate (float): Step size for the SGD updates.
    iterations (int): Number of passes over the rated items.
    regularization (float): Weight of the penalty on the user features.

    Returns:
    array: The inferred latent feature vector (length num_features). If none
    of the liked restaurants could be resolved, no update runs and the
    returned vector is just the random initialization.
    """
    if item_features is None:
        item_features = Q

    user_ratings = get_user_ratings_vector(liked_restaurants)

    num_features, num_items = item_features.shape
    user_features = np.random.rand(num_features)  # random starting point

    # The set of rated items never changes between passes, so find it once
    # instead of re-checking every item with .iloc on every iteration.
    ratings_row = user_ratings.iloc[0].to_numpy(dtype=float)
    rated_items = [
        (i, ratings_row[i]) for i in range(num_items) if not np.isnan(ratings_row[i])
    ]

    for _ in range(iterations):
        for i, rating in rated_items:
            item_vector = item_features[:, i]
            error = rating - np.dot(user_features, item_vector)
            user_features += learning_rate * (error * item_vector - regularization * user_features)

    return user_features


# Implementing hybrid model


In [21]:

def hybrid_reccomendations(keyword, liked_restaurants, n):
    """Recommend restaurants matching a keyword, ranked by predicted rating.

    Parameters:
    keyword (str): Desired atmosphere; must be a key of
        '../data/keywords_restaurants.json'. An exact match is tried first,
        then a match ignoring case and surrounding whitespace.
    liked_restaurants (iterable of str): Names of restaurants the user likes.
    n (int): Maximum number of recommendations to return.

    Returns:
    list: Names of the top ``n`` restaurants tagged with the keyword, ordered
    from highest to lowest predicted rating.

    Raises:
    KeyError: If the keyword matches no entry of the keyword file.
    """
    # Infer the user's features against the same Q that is used for scoring.
    new_user_features = infer_new_user_features(liked_restaurants, item_features=Q)

    predicted_ratings = np.dot(new_user_features, Q)

    # Item indices sorted from highest to lowest predicted rating.
    restaurant_indices_sorted = np.argsort(predicted_ratings)[::-1]

    # Converting recommended restaurant indices to business ids.
    recommended_business_ids = [item_index_to_id[idx] for idx in restaurant_indices_sorted]

    # Load the keyword -> business ids mapping (read as UTF-8 explicitly so
    # the result does not depend on the platform's default encoding).
    with open('../data/keywords_restaurants.json', 'r', encoding='utf-8') as file:
        keyword_to_restaurant_ids = json.load(file)

    # Resolve the keyword; typed input often differs only by case/whitespace.
    if keyword in keyword_to_restaurant_ids:
        matched_keyword = keyword
    else:
        wanted = str(keyword).strip().lower()
        matched_keyword = next(
            (key for key in keyword_to_restaurant_ids if key.strip().lower() == wanted),
            None,
        )
        if matched_keyword is None:
            raise KeyError(
                f"Unknown atmosphere keyword {keyword!r}; "
                f"available keywords: {sorted(keyword_to_restaurant_ids)}"
            )

    # A set makes each membership test O(1) instead of scanning a list.
    suitable_restaurants = set(keyword_to_restaurant_ids[matched_keyword])

    # Keep the ranking order, dropping restaurants without the keyword.
    filtered_recommendations = [
        business_id for business_id in recommended_business_ids
        if business_id in suitable_restaurants
    ]

    # Converting recommended business ids to names.
    recommended_business_names = [
        id_to_restaurant[business_id] for business_id in filtered_recommendations
    ]

    # Getting the top n recommendations only.
    return recommended_business_names[:n]


# Recommendations! Enter restaurants you already like in the Santa Barbara area, run the cell, and type your desired atmosphere when prompted.

In [32]:
# Example input: restaurants the user already likes.
liked_restaurants = [
    "Lily's Taco",
    "Himalayan Kitchen",
    "Freebirds",
    "Barbareño",
    "Habit Burger",
    "Padaro Beach Grill",
    "Thario’s",
    "Tre Lune",
]

# Example input: the desired atmosphere, typed by the user when prompted.
atmosphere_prompt = (
    "What is your desired atmosphere? Type in one of the following: "
    "romantic, intimate,  anniversary, birthday, friends, unique, celebration, "
    "live music, outdoor seating, upscale, rooftop, waterfront, scenic, "
    "hidden gem, cozy, affordable, fine-dining, dimly lit "
)
desired_atmosphere = input(atmosphere_prompt)

# Ask the hybrid model for the ten best matches.
reccomendations = hybrid_reccomendations(desired_atmosphere, liked_restaurants, 10)

print("----------------------------")
print(f"Here are some {desired_atmosphere} restaurants that you will enjoy: {reccomendations}")



Restaurant 'Lily's Taco' not found in the dataset.
Restaurant 'Habit Burger' not found in the dataset.
Restaurant 'Thario’s' not found in the dataset.
----------------------------
Here are some cozy restaurants that you will enjoy: ["Rascal's Vegan Food", 'Ty Lounge', 'Apero', 'Events By Rincon', 'The Set', 'Lucca Truck', 'Restaurant Mimosa', 'Alessia Patisserie & Cafe ', 'Le Café', 'Aperitivo']
