In [1]:
import pandas as pd
import numpy as np
# source data: https://www.kaggle.com/uciml/restaurant-data-with-consumer-ratings
df = pd.read_csv('/Users/DPalinggi/OneDrive - indikaenergy/PET PROJECTS/Recommender System/rating_final.csv')
df_location = pd.read_csv('/Users/DPalinggi/OneDrive - indikaenergy/PET PROJECTS/Recommender System/geoplaces2.csv')
data = pd.merge(df, df_location, how='left', on=['placeID'])
data = data[['userID','name','rating']]

In [2]:
data

Unnamed: 0,userID,name,rating
0,U1077,Tortas Locas Hipocampo,2
1,U1077,Restaurant la Chalita,2
2,U1077,puesto de tacos,2
3,U1077,Restaurante Marisco Sam,1
4,U1068,vips,1
...,...,...,...
1156,U1043,palomo tec,1
1157,U1011,tacos de la estacion,1
1158,U1068,Little Cesarz,1
1159,U1068,tacos de barbacoa enfrente del Tec,1


In [3]:
data['rating'].unique()

array([2, 1, 0])

## Train the SVD model 

In [4]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split


# Set rating scale between 0 to 2
reader = Reader(rating_scale=(0, 2))

In [5]:
# The DataFrame needs to have 3 columns in this specific order: [user_id, product_id, rating]
data = Dataset.load_from_df(data, reader)

In [6]:
# keep 25% of your trainset for testing
trainset, testset = train_test_split(data, test_size=.25)

In [7]:
# train a new SVD with 100 latent features
model = SVD(n_factors=100)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x117f5cc90>

In [8]:
model.qi.shape

(128, 100)

## Identifying Certain Restaurant 

In [9]:
item_to_row_idx = model.trainset._raw2inner_id_items

In [10]:
restaurant_row_idx : int = item_to_row_idx['puesto de tacos']

In [11]:
model.qi[restaurant_row_idx]

array([-0.06482757,  0.0464706 , -0.08641418,  0.08569937,  0.09121836,
        0.0366324 ,  0.03448812,  0.11941822, -0.07333014, -0.01700724,
        0.12394586,  0.12528807,  0.00427896,  0.13780554,  0.06194164,
        0.10269031,  0.02224996,  0.18139599, -0.15381884, -0.01220896,
       -0.23125479, -0.00081094,  0.00117855,  0.24475117, -0.12044532,
        0.03084069, -0.03716773,  0.05842001, -0.02344612, -0.04281169,
        0.01910757,  0.08916615, -0.02874976, -0.00447974,  0.01541525,
        0.12151091,  0.06392766,  0.03159135,  0.04725584, -0.0808508 ,
       -0.04993413,  0.00767312, -0.05813281, -0.01781769,  0.13240371,
       -0.12734975, -0.1036991 ,  0.00454455,  0.08014567,  0.12312516,
        0.09424687,  0.10080174,  0.0079399 , -0.09313194,  0.03377077,
       -0.21246711,  0.05239199,  0.0861674 ,  0.07441308, -0.02881045,
        0.06775418, -0.16545758,  0.07024731,  0.0852682 ,  0.04108448,
       -0.09977365, -0.09252084,  0.08782523,  0.00927094,  0.12

In [12]:
print(f"Every restaurant has {model.qi[restaurant_row_idx].shape[0]} features")

Every restaurant has 100 features


## Recommendation via Matrix Reconstruction 

### Predict a rating between any combination of user and restaurant

In [13]:
a_user = "U1043"
a_restaurant = "puesto de tacos"
model.predict(a_user, a_restaurant)

Prediction(uid='U1043', iid='puesto de tacos', r_ui=None, est=1.215769357936859, details={'was_impossible': False})

So based on the model, user U1043 will give 1.2 rating to puesto de tacos

## Recommendation via Item Similarity

In [14]:
from scipy.spatial.distance import cosine


def get_vector_by_restaurant_name(restaurant_name: str, trained_model: SVD) -> np.array:
    """Returns the latent features of a restaurant in the form of a numpy array"""
    restaurant_row_idx = trained_model.trainset._raw2inner_id_items[restaurant_name]
    return trained_model.qi[restaurant_row_idx]


def cosine_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Returns a float indicating the similarity between two vectors"""
    return cosine(vector_a, vector_b)

### Recommendations via Item Similarity: Finding similarity between vectors 

In [15]:
# Fetch indices for 3 random profile_id
id_1_idx = model.trainset._raw2inner_id_items['puesto de tacos']
id_2_idx = model.trainset._raw2inner_id_items['tacos de la estacion']
id_3_idx = model.trainset._raw2inner_id_items['Little Cesarz']

# Get vectors for those 3 profile_id
id_1_vector = model.qi[id_1_idx]
id_2_vector = model.qi[id_2_idx]
id_3_vector = model.qi[id_3_idx]

In [16]:
# Distance between two profiles_id
cosine_distance(id_1_vector, id_2_vector)

1.0468584519935034

In [17]:
cosine_distance(id_1_vector, id_3_vector)

0.8009875763445398

From result above we can say that puesto de tacos has more similarity to tacos de la estacion than Little Cesarz

### Finding similar restaurant by ranking

In [18]:
def display(similarity_table):
    similarity_table = pd.DataFrame(
        similarity_table,
        columns=['vector cosine distance', 'restaurant name']
    ).sort_values('vector cosine distance', ascending=True)
    return similarity_table.iloc[:4]

In [19]:
def get_top_similarities(restaurant_name: str, model: SVD) -> pd.DataFrame:
    """Returns the top 5 most similar restaurant to a specified restaurant
    
    """
    
    # Get the first restaurant vector
    restaurant_vector: np.array = get_vector_by_restaurant_name(restaurant_name, model)
    similarity_table = []
    
    # Iterate over every possible restaurant and calculate similarity
    for other_restaurant_name in model.trainset._raw2inner_id_items.keys():
        other_restaurant_vector = get_vector_by_restaurant_name(other_restaurant_name, model)
        
        # Get the second restaurant vector, and calculate distance
        similarity_score = cosine_distance(other_restaurant_vector, restaurant_vector)
        similarity_table.append((similarity_score, other_restaurant_name))
    
    # sort restaurant by ascending similarity
    return display(sorted(similarity_table))

In [20]:
get_top_similarities('Tortas Locas Hipocampo', model)

Unnamed: 0,vector cosine distance,restaurant name
0,0.0,Tortas Locas Hipocampo
1,0.751496,KFC
2,0.760581,Restaurant los Compadres
3,0.766727,Mariscos Tia Licha
