In [1]:
import pandas as pd
import numpy as np
# source data: https://www.kaggle.com/uciml/restaurant-data-with-consumer-ratings
df = pd.read_csv('/Users/DPalinggi/OneDrive - indikaenergy/PET PROJECTS/Recommender System/rating_final.csv')
df_location = pd.read_csv('/Users/DPalinggi/OneDrive - indikaenergy/PET PROJECTS/Recommender System/geoplaces2.csv')
data = pd.merge(df, df_location, how='left', on=['placeID'])
data = data[['userID','name','rating']]

In [2]:
data

Unnamed: 0,userID,name,rating
0,U1077,Tortas Locas Hipocampo,2
1,U1077,Restaurant la Chalita,2
2,U1077,puesto de tacos,2
3,U1077,Restaurante Marisco Sam,1
4,U1068,vips,1
...,...,...,...
1156,U1043,palomo tec,1
1157,U1011,tacos de la estacion,1
1158,U1068,Little Cesarz,1
1159,U1068,tacos de barbacoa enfrente del Tec,1


In [3]:
data['rating'].unique()

array([2, 1, 0])

## Train the SVD model 

In [4]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split


# Set rating scale between 0 to 2
reader = Reader(rating_scale=(0, 2))

In [5]:
# The DataFrame needs to have 3 columns in this specific order: [user_id, product_id, rating]
data = Dataset.load_from_df(data, reader)

In [6]:
# keep 25% of your trainset for testing
trainset, testset = train_test_split(data, test_size=.25)

In [7]:
# train a new SVD with 100 latent features
model = SVD(n_factors=100)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1192e9890>

In [8]:
model.qi.shape

(129, 100)

## Identifying Certain Restaurant 

In [9]:
item_to_row_idx = model.trainset._raw2inner_id_items

In [10]:
restaurant_row_idx : int = item_to_row_idx['puesto de tacos']

In [11]:
model.qi[restaurant_row_idx]

array([-0.06996235, -0.11050709, -0.08535958, -0.26132919, -0.09304525,
        0.02136104,  0.00072789, -0.07916805, -0.00750625,  0.02128208,
        0.03173453, -0.07122026, -0.00772305,  0.23108256, -0.03265572,
        0.11869963, -0.07110369, -0.03856744,  0.07628338, -0.26168568,
        0.17456632,  0.20758931,  0.01741186,  0.10830809, -0.25560216,
       -0.03749603, -0.06791776,  0.01020548,  0.09120739, -0.15948837,
       -0.11915999, -0.10412451,  0.09506549,  0.04381587, -0.05135084,
       -0.01958727,  0.02347313,  0.05014179, -0.0199047 , -0.1529212 ,
        0.05952342,  0.0122865 ,  0.04970749,  0.1103009 ,  0.11261657,
        0.01210724,  0.03845564,  0.06595873, -0.13994793,  0.0879375 ,
        0.05613562,  0.18047029,  0.01040207, -0.23587283,  0.03571689,
        0.35168202,  0.03805785, -0.0431482 , -0.04922971,  0.0178598 ,
        0.0469731 , -0.02378197,  0.03180127, -0.11215197, -0.05839327,
       -0.09912808, -0.03265221,  0.04325252, -0.0371544 , -0.18

In [12]:
print(f"Every restaurant has {model.qi[restaurant_row_idx].shape[0]} features")

Every restaurant has 100 features


## Recommendation via Matrix Reconstruction 

### Predict a rating between any combination of user and restaurant

In [13]:
a_user = "U1043"
a_restaurant = "puesto de tacos"
model.predict(a_user, a_restaurant)

Prediction(uid='U1043', iid='puesto de tacos', r_ui=None, est=1.1705585966297143, details={'was_impossible': False})

So based on the model, user U1043 will give 1.2 rating to puesto de tacos

## Recommendation via Item Similarity

In [14]:
from scipy.spatial.distance import cosine


def get_vector_by_restaurant_name(restaurant_name: str, trained_model: SVD) -> np.array:
    """Returns the latent features of a restaurant in the form of a numpy array"""
    restaurant_row_idx = trained_model.trainset._raw2inner_id_items[restaurant_name]
    return trained_model.qi[restaurant_row_idx]


def cosine_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Returns a float indicating the similarity between two vectors"""
    return cosine(vector_a, vector_b)

### Recommendations via Item Similarity: Finding similarity between vectors 

In [15]:
# Fetch indices for 3 random profile_id
id_1_idx = model.trainset._raw2inner_id_items['puesto de tacos']
id_2_idx = model.trainset._raw2inner_id_items['tacos de la estacion']
id_3_idx = model.trainset._raw2inner_id_items['Little Cesarz']

# Get vectors for those 3 profile_id
id_1_vector = model.qi[id_1_idx]
id_2_vector = model.qi[id_2_idx]
id_3_vector = model.qi[id_3_idx]

In [16]:
# Distance between two profiles_id
cosine_distance(id_1_vector, id_2_vector)

1.1646743682662306

In [17]:
cosine_distance(id_1_vector, id_3_vector)

1.235096822455009

From result above we can say that puesto de tacos has more similarity to tacos de la estacion than Little Cesarz

### Finding similar restaurant by ranking

In [18]:
def get_top_similarities(restaurant_name: str, model: SVD) -> pd.DataFrame:
    """Returns the top 5 most similar movies to a specified movie
    
    This function iterates over every possible movie in MovieLens and calculates
    distance between `movie_title` vector and that movie's vector.
    """
    
    # Get the first restaurant vector
    restaurant_vector: np.array = get_vector_by_restaurant_name(restaurant_name, model)
    similarity_table = []
    
    # Iterate over every possible restaurant and calculate similarity
    for other_restaurant_name in model.trainset._raw2inner_id_items.keys():
        other_restaurant_vector = get_vector_by_restaurant_name(other_restaurant_name, model)
        
        # Get the second restaurant vector, and calculate distance
        similarity_score = cosine_distance(other_restaurant_vector, restaurant_vector)
        similarity_table.append((similarity_score, other_restaurant_name))
    
    # sort restaurant by ascending similarity
    return display(sorted(similarity_table))

In [19]:
get_top_similarities('Tortas Locas Hipocampo', model)

[(0.0, 'Tortas Locas Hipocampo'),
 (0.7633109883373155, 'la Cochinita Pibil Restaurante Yucateco'),
 (0.7793092663427041, 'Gorditas Doa Gloria'),
 (0.7875319059515782, 'La Posada del Virrey'),
 (0.7998526995224852, 'la parroquia'),
 (0.8065624790992547, 'Cabana Huasteca'),
 (0.8196942422522094, 'Restaurante la Cantina'),
 (0.8230630899408701, 'La Cantina Restaurante'),
 (0.8337684206371934, 'palomo tec'),
 (0.8353440911387255, 'Michiko Restaurant Japones'),
 (0.8468851280978684, 'Preambulo Wifi Zone Cafe'),
 (0.8614904134410094, 'McDonalds Centro'),
 (0.8617162277904179, 'Carnitas Mata  Calle 16 de Septiembre'),
 (0.8704350681051349, 'Gorditas Dona Tota'),
 (0.8706654370345099, 'Rincon Huasteco'),
 (0.8766416936949386, 'el pueblito'),
 (0.8777004149787215, 'Restaurant and Bar and Clothesline Carlos N Charlies'),
 (0.8811854001581518, 'Sanborns Casa Piedra'),
 (0.8860403560861538, 'Restaurant Wu Zhuo Yi'),
 (0.8904689937684906, 'El angel Restaurante'),
 (0.9000284867994949, 'tacos abi')