In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix, coo_matrix
import tarfile
import pandas as pd
import json
import matplotlib.pyplot as plt


# Mapping user IDs and restaurant IDs to integer indices (stored in dictionaries)

In [2]:

# Load the user-item rating matrix; rows are keyed by user_id, columns by business id.
all_data = pd.read_csv('../data/UI_matrix_n_5_r.csv').set_index('user_id')

# Positional index <-> user id lookups, e.g.
#   user_index_to_id[0]                         -> first user id in the file
#   user_id_to_index["-1-ECBsGpG4Iw5s-ecnfqw"]  -> that user's row position
user_index_to_id = dict(enumerate(all_data.index))
user_id_to_index = {user_id: position for position, user_id in user_index_to_id.items()}

# Same pair of lookups for the business (column) axis.
item_index_to_id = dict(enumerate(all_data.columns))
item_id_to_index = {business_id: position for position, business_id in item_index_to_id.items()}

# Swap the string labels on both axes for their integer positions.
all_data = all_data.rename(columns=item_id_to_index)
all_data.index = all_data.index.map(user_id_to_index)
all_data.index.name = None

all_data



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1143,1144,1145,1146,1147,1148,1149,1150,1151,1152
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,5.0,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,4.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6504,,,,,,,,,,,...,,,,,,,,,,
6505,,,,,,,,,,,...,,,,,,,,,,
6506,,,,,,,,,,,...,,,,,,,,,,
6507,,,,,,,,,,,...,,,,,,,,,,


# Mapping restaurant names to business IDs and vice versa

In [4]:
# The Yelp business dump is JSON-lines: one JSON object per line.
data_business = []
with open("../data/yelp_dataset/yelp_academic_dataset_business.json", "r", encoding="utf-8") as file:
    for line in file:
        data_business.append(json.loads(line))

df_business = pd.DataFrame(data_business)

user_item_matrix = pd.read_csv('../data/UI_matrix_n_5_r.csv')

# Every column after the first is treated as a business id
# (the first column is assumed not to be one).
business_ids = user_item_matrix.columns[1:]

# Keep only the businesses that appear as columns of the rating matrix.
filtered_df_business = df_business[df_business['business_id'].isin(business_ids)]

id_to_restaurant = dict(zip(filtered_df_business['business_id'], filtered_df_business['name']))

# Inverse lookup. If two businesses share a name, the one seen last wins.
restaurant_to_id = {}
for business_id, restaurant_name in id_to_restaurant.items():
    restaurant_to_id[restaurant_name] = business_id

# Only the last expression of a cell is displayed, so just the second length is shown.
len(restaurant_to_id)
len(id_to_restaurant)

1153

# Retrain model on all data using the best hyperparameters


In [5]:
# The train/test variant of this cell (train_data_n5_r.csv / test_data_n5_r.csv,
# each zero-filled and converted to COO) is disabled; only the full matrix is used here.

# Missing ratings become 0 so the dense -> COO conversion below stores only the
# non-zero entries.
all_data = all_data.fillna(0)

# Sparse (COO) representation of the complete rating matrix.
all_data_sparse = coo_matrix(all_data.to_numpy())



In [6]:

def matrix_factorization_SGD(R, K, alpha, beta, iterations, seed=42):
    """Factorise a rating matrix into user and item latent factors with SGD.

    Parameters:
    R: 2-D rating matrix (dense array or scipy sparse matrix). Only the entries
       stored after conversion to COO format are used as training examples.
    K (int): Number of latent factors.
    alpha (float): Learning rate.
    beta (float): L2 regularisation strength.
    iterations (int): Number of passes over the stored ratings.
    seed (int or None): Value passed to ``np.random.seed`` before initialisation.
        Defaults to 42, which reproduces the previously hard-coded behaviour.
        Pass None to leave the global NumPy random state untouched.

    Returns:
    tuple: ``(P, Q_T)`` where ``P`` has shape (num_users, K) and ``Q_T`` has
    shape (K, num_items), i.e. the item factors are returned transposed.
    """
    # Seeds the *global* NumPy RNG, so later np.random calls are affected too.
    if seed is not None:
        np.random.seed(seed)

    num_users, num_items = R.shape
    P = np.random.rand(num_users, K)
    Q = np.random.rand(num_items, K)

    R_coo = coo_matrix(R)
    non_zero_ratings = list(zip(R_coo.row, R_coo.col, R_coo.data))

    for _ in range(iterations):
        # Visit the ratings in a fresh random order on every pass.
        np.random.shuffle(non_zero_ratings)
        for user_idx, item_idx, r in non_zero_ratings:
            # Row views: the in-place updates below write straight into P and Q.
            p_u = P[user_idx, :]
            q_i = Q[item_idx, :]
            e = r - np.dot(p_u, q_i)
            p_u += alpha * (e * q_i - beta * p_u)
            # NOTE(review): this step uses the already-updated user row
            # (sequential update). Kept as-is so existing results are reproduced.
            q_i += alpha * (e * p_u - beta * q_i)

    return P, Q.T

# Earlier configuration, kept for reference only (disabled):
#   K = 12, alpha = 0.001403450260491775, beta = 0.05933345569703825, iterations = 43

# Configuration used for the fit on the full rating matrix.
K, iterations = 18, 4           # latent factors, passes over the ratings
alpha = 0.005273041694145949    # learning rate
beta = 0.18189763744227455      # regularisation strength

# P: user factors; Q: item factors as returned by the function (already transposed).
P, Q = matrix_factorization_SGD(all_data_sparse, K, alpha, beta, iterations)

# User inputs known liked restaurants below:

In [7]:
# Example input: names of restaurants the user already likes.
liked_restaurants = [
    "Lily's Taco",
    "Himalayan Kitchen",
    "Freebirds",
    "Barbareño",
    "Habit Burger",
    "Padaro Beach Grill",
    "Thario’s",
    "Tre Lune",
]

# Desired atmosphere. Options:
#   ["romantic", "intimate", "date", "birthday", "friends", "date-night",
#    "anniversary", "unique", "celebration", "music"]
# Example input:
desired_atmosphere = ["romantic"]

In [8]:



# Translate each liked restaurant name into its business id. The lookup is an
# exact dictionary match, so names that are not keys are reported and skipped.
liked_business_ids = []
for restaurant_name in liked_restaurants:
    if restaurant_name not in restaurant_to_id:
        print(f"Restaurant '{restaurant_name}' not found in the dataset.")
        continue
    liked_business_ids.append(restaurant_to_id[restaurant_name])

liked_business_ids

Restaurant 'Lily's Taco' not found in the dataset.
Restaurant 'Habit Burger' not found in the dataset.
Restaurant 'Thario’s' not found in the dataset.


['BwUWmUFKCuJ-nN9vFwMt2Q',
 'Rl42JbSMsmNW3LRjsTMYAg',
 '6HTGlttrzCMsuGBHO1ZGiw',
 '1FURjeGJi_LBXcJQg8eskw',
 'S3QHy1sshUeZwXOYviVsXQ']

# Generating a user_ratings vector based on the user's liked restaurants

In [9]:
# One slot per business column of the rating matrix, all unrated (NaN) to start.
user_ratings = pd.Series(np.nan, index=user_item_matrix.columns[1:])

# Every liked business that is a column of the matrix gets a rating of 5.
liked_mask = user_ratings.index.isin(liked_business_ids)
user_ratings.loc[liked_mask] = 5

# Column vector named 'rating', then transposed into a single-row frame
# (one column per business).
user_ratings_df = user_ratings.to_frame(name='rating')
user_ratings_df_T = user_ratings_df.T

# generating latent feature vector for new user


    Infers the latent features for a new user based on their ratings.

    Parameters:
    user_ratings (DataFrame): The ratings given by the new user, NaN for unrated items.
    item_features (array): The item feature matrix (Q) from matrix factorization.
    learning_rate (float): The learning rate for SGD.
    iterations (int): Number of iterations for the optimization process.

    Returns:
    array: The inferred latent features of the new user.


In [10]:


def infer_new_user_features(user_ratings, item_features, learning_rate=0.01, iterations=100, regularization=0.02):
    """Infer latent features for a new user from their known ratings.

    Parameters:
    user_ratings (DataFrame): Ratings of the new user in the first row, one
        column per item in the same order as ``item_features``; NaN marks an
        unrated item.
    item_features (array): Item feature matrix of shape
        (num_features, num_items).
    learning_rate (float): The learning rate for SGD.
    iterations (int): Number of passes over the rated items.
    regularization (float): L2 penalty applied to the user features. Defaults
        to 0.02, the value that used to be hard-coded.

    Returns:
    array: The inferred latent features of the new user, shape (num_features,).

    Raises:
    IndexError: If ``user_ratings`` has fewer columns than there are items.
    """
    num_features, num_items = item_features.shape
    # Drawn from the global NumPy RNG, so the result depends on its current state.
    user_features = np.random.rand(num_features)

    if user_ratings.shape[1] < num_items:
        raise IndexError(
            f"user_ratings has {user_ratings.shape[1]} columns but item_features has {num_items} items"
        )

    # Find the rated items once instead of re-scanning every item with .iloc
    # on each iteration; the set of rated items never changes inside the loop.
    ratings_row = user_ratings.iloc[0, :num_items].to_numpy(dtype=float)
    rated_items = [
        (ratings_row[i], item_features[:, i])
        for i in np.flatnonzero(~np.isnan(ratings_row))
    ]

    for _ in range(iterations):
        for rating, item_vector in rated_items:
            error = rating - np.dot(user_features, item_vector)
            user_features += learning_rate * (error * item_vector - regularization * user_features)

    return user_features


# Fit the new user's latent vector against the item factors Q, using the
# function's default learning rate and iteration count.
new_user_features = infer_new_user_features(user_ratings_df_T, Q)


# predicting ratings

    Predict ratings for each item based on the user's features.

    Parameters:
    user_features (array): The inferred latent features of the user.
    item_features (array): The item feature matrix (Q) from matrix factorization.

    Returns:
    array: Predicted ratings for each item.
    

In [11]:
def predict_ratings(user_features, item_features):
    """Score every item for one user.

    Parameters:
    user_features (array): Latent feature vector of the user.
    item_features (array): Item feature matrix, one column per item.

    Returns:
    array: Dot product of the user vector with each item column, i.e. one
    predicted rating per item (a new row of the user-item matrix).
    """
    return np.dot(user_features, item_features)


def reccomended_restaraunts(n):
    """Return the names of the n restaurants with the highest predicted rating.

    Reads the notebook globals ``new_user_features``, ``Q``,
    ``item_index_to_id`` and ``id_to_restaurant``.
    """
    scores = predict_ratings(new_user_features, Q)

    # argsort is ascending, so reverse it to get the best-scoring columns first.
    best_first = np.argsort(scores)[::-1]

    recommended_business_names = []
    for item_idx in best_first[:n]:
        # column position -> business id -> restaurant name
        business_id = item_index_to_id[item_idx]
        recommended_business_names.append(id_to_restaurant[business_id])

    return recommended_business_names


In [12]:
# Display the names of the 10 highest-scoring restaurants for the new user.
reccomended_restaraunts(10)

['Carrillo Dining Commons',
 "Maudet's",
 'European Deli',
 'Palihouse Santa Barbara',
 'Baba Small Batch',
 'The Mill',
 'Red Pepper Chinese Food Express',
 'The Beach Grill at Padaro',
 "L's Kitchen",
 'Le Bon Cafe']

# Implementing hybrid model -- Note: the code below was used as a rough draft. See script 6 for the final version of the hybrid model.


In [13]:
def predict_ratings(user_features, item_features):
    """Score every item for one user.

    Parameters:
    user_features (array): Latent feature vector of the user.
    item_features (array): Item feature matrix, one column per item.

    Returns:
    array: Dot product of the user vector with each item column, i.e. one
    predicted rating per item (a new row of the user-item matrix).
    """
    return np.dot(user_features, item_features)



def hybrid_reccomendations(keyword, n):
    """Recommend restaurant names that match a keyword, ranked by predicted rating.

    Parameters:
    keyword (str): Key to look up in ``../data/keywords_restaurants.json``.
    n (int): Maximum number of names to return.

    Returns:
    list: Up to ``n`` restaurant names, best predicted rating first.

    Raises:
    KeyError: If ``keyword`` is not a key of the keyword file.

    Reads the notebook globals ``new_user_features``, ``Q``,
    ``item_index_to_id`` and ``id_to_restaurant``.
    """
    # JSON text is UTF-8; be explicit rather than relying on the platform default.
    with open('../data/keywords_restaurants.json', 'r', encoding='utf-8') as file:
        keyword_to_restaurant_ids = json.load(file)

    # Business ids tagged with the keyword. A set keeps each membership test
    # constant-time instead of scanning a list once per restaurant.
    suitable_restaurants = set(keyword_to_restaurant_ids[keyword])

    # Predicted score of every restaurant from the matrix factorisation model.
    predicted_ratings = predict_ratings(new_user_features, Q)

    # Column positions ordered from highest to lowest predicted score.
    restaurant_indices_sorted = np.argsort(predicted_ratings)[::-1]

    # Keep the ranking order, dropping every business not tagged with the keyword.
    filtered_recommendations = [
        business_id
        for business_id in (item_index_to_id[idx] for idx in restaurant_indices_sorted)
        if business_id in suitable_restaurants
    ]

    # Cut to the top n first so names are only looked up for what is returned.
    top_n_recommended_business_names = [
        id_to_restaurant[business_id] for business_id in filtered_recommendations[:n]
    ]

    return top_n_recommended_business_names




# Top 10 recommendations restricted to restaurants listed under the 'rooftop' keyword.
print(hybrid_reccomendations('rooftop', 10))



['Book Ends Cafe', "Yoichi's", 'Barbareño', 'Modern Times Academy of Recreational Sciences', 'Finch & Fork', 'Oku', 'The Ritz-Carlton Bacara, Santa Barbara', 'The Honey B', 'Crushcakes & Cafe', 'Stearns Wharf']


In [14]:
# Example input: names of restaurants the user already likes.
# Re-assigning this list here does not by itself change the output below:
# hybrid_reccomendations reads the global new_user_features.
liked_restaurants = [
    "Lily's Taco",
    "Himalayan Kitchen",
    "Freebirds",
    "Barbareño",
    "Habit Burger",
    "Padaro Beach Grill",
    "Thario’s",
    "Tre Lune",
]

# Example input. Options:
#   ["romantic", "intimate", "date", "birthday", "friends", "date-night",
#    "anniversary", "unique", "celebration", "music"]
desired_atmosphere = "romantic"

print(hybrid_reccomendations(desired_atmosphere, 10))

['Khao Kaeng by Empty Bowl Gourmet Noodle Bar', 'Sevilla', 'University Club of Santa Barbara', 'OPPI’Z Bistro And Natural Pizza', "Downey's", 'Bar 29', 'The Revere Room', "Ruth's Chris Steak House", 'Tydes Restaurant', 'Intermezzo By Wine Cask']


# Verifying that the recommendations worked

In [17]:
def predict_ratings(user_features, item_features):
    """Score every item for one user.

    Parameters:
    user_features (array): Latent feature vector of the user.
    item_features (array): Item feature matrix, one column per item.

    Returns:
    array: Dot product of the user vector with each item column, i.e. one
    predicted rating per item (a new row of the user-item matrix).
    """
    return np.dot(user_features, item_features)



def hybrid_reccomendations(keyword, n):
    """Return the business IDs of the top keyword-matching recommendations.

    Verification variant: unlike the names-returning definition of the same
    function earlier in this notebook, this one returns raw business IDs so
    they can be checked against the keyword file. Running this cell rebinds
    the name ``hybrid_reccomendations`` to this version.

    Parameters:
    keyword (str): Key to look up in ``../data/keywords_restaurants.json``.
    n (int): Maximum number of IDs to return.

    Returns:
    list: Up to ``n`` business IDs, best predicted rating first.

    Raises:
    KeyError: If ``keyword`` is not a key of the keyword file.

    Reads the notebook globals ``new_user_features``, ``Q`` and
    ``item_index_to_id``.
    """
    # JSON text is UTF-8; be explicit rather than relying on the platform default.
    with open('../data/keywords_restaurants.json', 'r', encoding='utf-8') as file:
        keyword_to_restaurant_ids = json.load(file)

    # Business ids tagged with the keyword. A set keeps each membership test
    # constant-time instead of scanning a list once per restaurant.
    suitable_restaurants = set(keyword_to_restaurant_ids[keyword])

    # Predicted score of every restaurant from the matrix factorisation model.
    predicted_ratings = predict_ratings(new_user_features, Q)

    # Column positions ordered from highest to lowest predicted score.
    restaurant_indices_sorted = np.argsort(predicted_ratings)[::-1]

    # Keep the ranking order, dropping every business not tagged with the keyword.
    filtered_recommendations = [
        business_id
        for business_id in (item_index_to_id[idx] for idx in restaurant_indices_sorted)
        if business_id in suitable_restaurants
    ]

    # These are IDs, not names, so the local is named accordingly.
    top_n_recommended_business_ids = filtered_recommendations[:n]

    return top_n_recommended_business_ids



# Single best match for 'live music'; the function defined in this cell returns
# the business ID rather than the restaurant name.
print(hybrid_reccomendations('live music', 1))

['aYMpjij5ShtEoZueMrQPRw']


In [18]:
# Keyword / business-id pair to verify.
keyword = 'live music'
restaurant_id = 'aYMpjij5ShtEoZueMrQPRw'

# Load the keyword -> business ids mapping from disk.
with open('../data/keywords_restaurants.json', 'r') as file:
    keyword_to_restaurant_ids = json.load(file)

# An unknown keyword counts as "no restaurants" rather than raising.
ids_for_keyword = keyword_to_restaurant_ids.get(keyword, [])

# Despite its name, this flag refers to whatever `keyword` is set to above.
is_in_romantic = restaurant_id in ids_for_keyword

print(is_in_romantic)


True


### This was a quick test to verify that the recommended business is associated with the requested keyword