In [2]:
import surprise
from surprise import Dataset, Reader
from surprise.similarities import pearson, pearson_baseline
from surprise.prediction_algorithms.knns import KNNBaseline
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from surprise import accuracy
import numpy as np
import pandas as pd

In [3]:
ratings_df = pd.read_csv("new_sample_data.csv", usecols = ["user_id", "restaurant_id","Rating_x", "title", "clicked"])
ratings_df.head()

Unnamed: 0,user_id,restaurant_id,Rating_x,title,clicked
0,0,363,0.0,Chartreuse Moose Cappuccino Bar & Bistro,0
1,0,363,0.0,Chartreuse Moose Cappuccino Bar & Bistro,1
2,0,363,0.0,Chartreuse Moose Cappuccino Bar & Bistro,1
3,0,363,0.0,Chartreuse Moose Cappuccino Bar & Bistro,0
4,0,363,0.0,Chartreuse Moose Cappuccino Bar & Bistro,1


In [4]:
reader2 = Reader(line_format='user item rating', sep=',', rating_scale = (1,5))
tot_data2 = Dataset.load_from_df(ratings_df[["user_id", "restaurant_id", "Rating_x"]], reader=reader2)

In [5]:
trainset, testset = train_test_split(tot_data2, test_size=0.2)

In [6]:
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  20 

Number of items:  15 



In [7]:
trainset_iids = list(trainset.all_items())
iid_converter = lambda x: trainset.to_raw_iid(x)
trainset_raw_iids = list(map(iid_converter, trainset_iids))

In [8]:
trainsetfull = tot_data2.build_full_trainset()
print('Number of users: ', trainsetfull.n_users, '\n')
print('Number of items: ', trainsetfull.n_items, '\n')

Number of users:  20 

Number of items:  15 



In [9]:
trainsetfull_iids = list(trainsetfull.all_items())
iid_converter = lambda x: trainsetfull.to_raw_iid(x)
trainsetfull_raw_iids = list(map(iid_converter, trainsetfull_iids))

In [10]:
sim_options = {"name":"pearson", "user_based": False}

In [11]:
def return_top_similar_dataframe(similarity_matrix, raw_ids, top_x):
    length = similarity_matrix.shape[0]
    closest_ids = np.zeros((top_x,length))
    for item in range(0,length):    
        similarity_metrics = similarity_matrix[item]        
        sorted_metrics, sorted_raw_ids = zip(*sorted(zip(similarity_metrics, raw_ids)))        
        for index in range(0, top_x):
            closest_ids[index][item] = sorted_raw_ids[-2-index]
        similarity_df = pd.DataFrame()
        similarity_df['restaurant_id'] = raw_ids
        for index in range(0, top_x):
            similarity_df['similar_res_' + str(index + 1)] = closest_ids[index].astype(int)
    return similarity_df

In [12]:
id_to_name_dict = {}
res_names = ratings_df["title"]
res_ids = ratings_df["restaurant_id"]
for index in range(0, len(res_ids)):
    id_to_name_dict[str(res_ids[index])] = res_names[index]

def get_res_name_from_id(res_id):
    return id_to_name_dict[str(res_id)]

In [13]:
def save_similar_res(similarity_matrix, raw_iids, top_x):
    sim_df = return_top_similar_dataframe(similarity_matrix, raw_iids, top_x)
    for column in sim_df.columns:
        for i in range(len(sim_df[column])):
            sim_df[column][i] = get_res_name_from_id(sim_df[column][i])
    return sim_df

In [14]:
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNBaseline
model1 = KNNBasic(sim_options = sim_options, verbose = False)
model1.fit(trainsetfull)
model2 = KNNWithMeans(sim_options = sim_options, verbose = False)
model2.fit(trainsetfull)
model3 = KNNBaseline(k = 10, sim_options = sim_options, bsl_options = {"method":"sgd","learning_rate":0.00006})
model3.fit(trainsetfull)
sim_basic_df = save_similar_res(model1.sim, trainsetfull_raw_iids, 10)
sim_means_df = save_similar_res(model2.sim, trainsetfull_raw_iids, 10)
sim_base_df = save_similar_res(model3.sim, trainsetfull_raw_iids, 10)

Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


###### Cross Validation on the models

In [15]:
results_basic = cross_validate(model1, tot_data2, measures=['RMSE'], cv=3, return_train_measures=True);
results_means = cross_validate(model2, tot_data2, measures=['RMSE'], cv=3, return_train_measures=True);
results_base = cross_validate(model3, tot_data2, measures=['RMSE'], cv = 3, return_train_measures=True);

Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [16]:
results_basic_1 = cross_validate(model1, tot_data2, measures=['MAE'], cv=3, return_train_measures=True);
results_means_1 = cross_validate(model2, tot_data2, measures=['MAE'], cv=3, return_train_measures=True);
results_base_1 = cross_validate(model3, tot_data2, measures=['MAE'], cv = 3, return_train_measures=True)

Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [17]:
print("RMSE")
print(results_basic)
print(results_means)
print(results_base)
print("MAE")
print(results_basic_1)
print(results_means_1)
print(results_base_1)

RMSE
{'test_rmse': array([0.63678083, 0.54838893, 0.53947175]), 'train_rmse': array([0.51471054, 0.55801623, 0.58871097]), 'fit_time': (0.0359044075012207, 0.03291153907775879, 0.03291130065917969), 'test_time': (0.17180252075195312, 0.15356206893920898, 0.19251489639282227)}
{'test_rmse': array([0.62541256, 0.51291705, 0.6066273 ]), 'train_rmse': array([0.52935222, 0.572769  , 0.52450938]), 'fit_time': (0.036904096603393555, 0.032883405685424805, 0.03388333320617676), 'test_time': (0.17652535438537598, 0.21442794799804688, 0.16754889488220215)}
{'test_rmse': array([0.53815502, 0.66432412, 0.73069653]), 'train_rmse': array([0.54520317, 0.66390148, 0.60904702]), 'fit_time': (0.03886222839355469, 0.03989267349243164, 0.04587674140930176), 'test_time': (0.11668944358825684, 0.12566256523132324, 0.12070798873901367)}
MAE
{'test_mae': array([0.24735981, 0.22273703, 0.25354995]), 'train_mae': array([0.22576754, 0.23846987, 0.21769771]), 'fit_time': (0.04188251495361328, 0.032883644104003906,

In [118]:
# using KNNBasic
res_name = "Sushi and Noodle" 
rec1 = sim_basic_df[sim_basic_df["restaurant_id"] == res_name]
rec1 

Unnamed: 0,restaurant_id,similar_res_1,similar_res_2,similar_res_3,similar_res_4,similar_res_5,similar_res_6,similar_res_7,similar_res_8,similar_res_9,similar_res_10
9,Sushi and Noodle,Sushi and Noodle,Smitty's 100 Mile House,The Great Wok,P Bass Fish and Chips,Jake's,Blue Sky Chinese Restaurant,Farrier pub,BJ's Donuts & Eatery,Chartreuse Moose Cappuccino Bar & Bistro,Dairy Queen Grill & Chill


###### Evaluation of the model

In [19]:
pred = model1.test(testset)

In [20]:
# get top 10 predictions
from collections import defaultdict
def GetTopN(predictions, n=10, minimumRating=4.0):
    topN = defaultdict(list)
    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        if(estimatedRating >= minimumRating):
            topN[int(userID)].append((int(movieID), estimatedRating)) #note parenthesis
    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]
    return topN
topn = GetTopN(pred)

In [21]:
# the rate at which new restaurants are shown to users
def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
        hits = 0
        for userID in topNPredicted.keys():
            hit = False
            for movieID, predictedRating in topNPredicted[userID]:
                if (predictedRating >= ratingThreshold):
                    hit = True
                    break
            if (hit):
                hits += 1
        return hits / numUsers
cov1 = UserCoverage(topn, 20, 0)
cov1

0.5

In [22]:
keys = []
keys = list(topn.keys())
res_id_1 = []
r1 = ratings_df[ratings_df["restaurant_id"] == 217][["user_id", "clicked", "restaurant_id"]]
for i in range(len(keys)):
    l = []
    l = topn[i]
    if len(l) > 0:
        for i in l:
            if i[0] not in res_id_1:
                res_id_1.append(i[0])
hit = 0
for i in keys:
    if (r1[r1["user_id"] == i]["clicked"].sum()) > 0:
        hit += 1
hitrate = hit/len(keys)
print(hitrate)

0.3


In [23]:
from surprise import Dataset
from surprise.model_selection import GridSearchCV


param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],'reg_all': [0.4, 0.6]}
# grid search on KNNBasic
gs_basic = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)
gs_basic.fit(tot_data2)

# grid search on KNNMeans
gs_means = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=3)
gs_means.fit(tot_data2)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [24]:
# best RMSE score
print(gs_basic.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs_basic.best_params['rmse'])

# best RMSE score
print(gs_means.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs_means.best_params['rmse'])

0.5945464411137332
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
0.6379752233346644
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}


###### Evaluation on the Deep Learning model

In [1]:
import keras
from keras.layers import Embedding, Reshape,Concatenate,Add
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import dot
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
import pandas as pd
from keras.utils import plot_model
ratings = pd.read_csv("new_sample_data.csv", usecols = ["user_id", "restaurant_id","Rating_x"]) 
len(ratings.user_id.unique()), len(ratings.restaurant_id.unique())
ratings["Rating_x"].fillna("Nan")

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
2779    5.0
2780    5.0
2781    5.0
2782    5.0
2783    5.0
Name: Rating_x, Length: 2784, dtype: float64

In [3]:
ratings.user_id = ratings.user_id.astype('category').cat.codes.values
ratings.restaurant_id = ratings.restaurant_id.astype('category').cat.codes.values

In [4]:
train, test = train_test_split(ratings, test_size=0.2)

In [5]:
train.head()

Unnamed: 0,user_id,restaurant_id,Rating_x
664,16,3,3.0
1899,13,0,3.0
1525,9,0,2.5
953,4,3,5.0
483,7,3,1.0


In [6]:
n_users, n_res = len(ratings.user_id.unique()), len(ratings.restaurant_id.unique())

In [7]:
from keras.layers import *
from keras.models import Model
from keras.layers import concatenate

In [8]:
def neural_network_model(n_latent_factors_user, n_latent_factors_res):
    
    res_input = keras.layers.Input(shape=[1],name='Item')
    res_embedding = keras.layers.Embedding(n_res + 1, n_latent_factors_res, name='res-Embedding')(res_input)
    res_vec = keras.layers.Flatten(name='FlattenMovies')(res_embedding)
    res_vec = keras.layers.Dropout(0.2)(res_vec)


    user_input = keras.layers.Input(shape=[1],name='User')
    user_vec = keras.layers.Flatten(name='FlattenUsers')(keras.layers.Embedding(n_users + 1, n_latent_factors_user,name='User-Embedding')(user_input))
    user_vec = keras.layers.Dropout(0.2)(user_vec)

    concat =keras.layers.merge.concatenate([res_vec, user_vec],name='Concat')
    concat_dropout = keras.layers.Dropout(0.2)(concat)
    dense = keras.layers.Dense(100,name='FullyConnected')(concat)
    dropout_1 = keras.layers.Dropout(0.2,name='Dropout')(dense)
    dense_2 = keras.layers.Dense(50,name='FullyConnected-1')(concat)
    dropout_2 = keras.layers.Dropout(0.2,name='Dropout')(dense_2)
    dense_3 = keras.layers.Dense(20,name='FullyConnected-2')(dense_2)
    dropout_3 = keras.layers.Dropout(0.2,name='Dropout')(dense_3)
    dense_4 = keras.layers.Dense(10,name='FullyConnected-3', activation='relu')(dense_3)


    result = keras.layers.Dense(1, activation='relu',name='Activation')(dense_4)
    adam = Adam(lr=0.005)
    model = keras.Model([user_input, res_input], result)
    model.compile(optimizer=adam,loss= 'mean_absolute_error', metrics = ["MAE"])
    return model

In [9]:
model3 = neural_network_model(100,180)

In [10]:
history_neural_network = model3.fit([train.user_id, train.restaurant_id], train.Rating_x, epochs=50, verbose=0)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:
import numpy as np
y_hat = np.round(model3.predict([test.user_id, test.restaurant_id]),0)
y_true = test.Rating_x

In [12]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true, y_hat)

0.3123877917414722

In [13]:
y_true

1874    3.0
1625    2.5
704     3.5
472     1.0
1152    0.5
       ... 
1204    0.5
561     1.5
2658    5.0
529     1.5
1427    2.0
Name: Rating_x, Length: 557, dtype: float64

In [14]:
res_embedding_learnt = model3.get_layer(name='res-Embedding').get_weights()[0]
pd.DataFrame(res_embedding_learnt).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,170,171,172,173,174,175,176,177,178,179
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,...,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,-0.070646,0.015588,0.089832,-0.020254,0.021213,0.082819,-0.054955,-0.029187,0.08796,0.092193,...,0.049175,-0.110811,-0.038737,0.025809,-0.027601,-0.031487,0.025028,-0.043475,0.041744,-0.030726
std,0.164668,0.20381,0.190705,0.047496,0.188593,0.205018,0.142392,0.136575,0.13749,0.208349,...,0.101353,0.187064,0.15029,0.11639,0.069215,0.138285,0.08788,0.187181,0.188469,0.132935
min,-0.43055,-0.264661,-0.269351,-0.145226,-0.30978,-0.215572,-0.335304,-0.258213,-0.126276,-0.431518,...,-0.107516,-0.436859,-0.311657,-0.148603,-0.155468,-0.22565,-0.192433,-0.371817,-0.280478,-0.210246
25%,-0.195339,-0.10752,-0.041191,-0.041376,-0.104306,-0.058437,-0.11494,-0.122483,-0.01502,-0.005182,...,2.6e-05,-0.276571,-0.149898,-0.031763,-0.071355,-0.145531,-0.004775,-0.143518,-0.071294,-0.118005
50%,-0.047345,-0.032297,0.085736,-0.002466,0.025302,0.029375,-0.064496,-0.004184,0.094669,0.067336,...,0.03863,-0.031683,-0.028445,0.032769,-0.006123,-0.049229,0.033142,-0.028943,0.013363,-0.029543
75%,0.05854,0.122887,0.218182,0.010875,0.180384,0.240458,0.023485,0.029077,0.1896,0.227609,...,0.102704,0.032084,0.0472,0.085524,0.024381,0.056109,0.066893,0.065764,0.191721,0.043151
max,0.146543,0.541225,0.459268,0.042616,0.370844,0.387585,0.211002,0.244285,0.332157,0.454277,...,0.254974,0.105383,0.261225,0.239421,0.056178,0.200605,0.162997,0.26318,0.418284,0.33987


In [15]:
user_embedding_learnt = model3.get_layer(name='User-Embedding').get_weights()[0]
pd.DataFrame(user_embedding_learnt).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,...,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,-0.012802,0.022213,0.018498,0.038347,-0.011903,0.016431,0.004669,0.000797,0.065129,-0.040186,...,-0.016344,-0.003926,0.027122,0.05599,0.003543,-0.070781,0.030242,0.056383,0.001433,-0.037341
std,0.185706,0.21232,0.17493,0.239397,0.228618,0.171553,0.173927,0.176298,0.198083,0.186887,...,0.198851,0.230565,0.193753,0.204558,0.204432,0.248487,0.276421,0.218101,0.174091,0.215863
min,-0.405901,-0.407112,-0.35126,-0.371004,-0.364743,-0.410343,-0.304619,-0.264183,-0.255757,-0.383426,...,-0.498326,-0.503703,-0.356709,-0.369798,-0.330918,-0.59803,-0.572505,-0.399862,-0.366219,-0.50191
25%,-0.0724,-0.08342,-0.037893,-0.14907,-0.227892,-0.057912,-0.093918,-0.110325,-0.046992,-0.165503,...,-0.049334,-0.121536,-0.05009,-0.078216,-0.092578,-0.21156,-0.108219,-0.082695,-0.050609,-0.090302
50%,0.013285,0.013817,0.000725,0.004223,-0.01753,0.015742,-0.019642,0.001882,0.029642,-0.044932,...,0.0092,0.034538,0.011833,0.04027,-0.008002,-0.037247,0.032052,-0.000568,0.007641,-0.05361
75%,0.095411,0.145176,0.170652,0.122836,0.096061,0.091461,0.062696,0.043687,0.108068,0.044488,...,0.091463,0.098669,0.169071,0.221603,0.052571,0.07802,0.194478,0.135371,0.102613,0.035618
max,0.293341,0.363233,0.26612,0.490825,0.442539,0.369016,0.353354,0.474306,0.484598,0.342454,...,0.289626,0.367689,0.363169,0.441197,0.463331,0.34417,0.593283,0.489565,0.239579,0.448213


In [16]:
Restaurant_list = pd.read_csv("restaurants.csv")
Restaurant_list = Restaurant_list[["resId","title"]]
Restaurant_list.head()
#choose a user ID
user_id = 0
#get movies rated by this user id
users_res = ratings.loc[ratings["user_id"]==user_id]
#print how many ratings user has made 
print("User ID : " + str(user_id) + " has already rated " + str(len(users_res)) + " res")
#list movies that have been rated
pd.merge(users_res,Restaurant_list,left_on = "restaurant_id", right_on = "resId")

User ID : 0 has already rated 124 res


Unnamed: 0,user_id,restaurant_id,Rating_x,resId,title
0,0,5,0.0,5,Pizza Pzazz
1,0,5,0.0,5,Pizza Pzazz
2,0,5,0.0,5,Pizza Pzazz
3,0,5,0.0,5,Pizza Pzazz
4,0,5,0.0,5,Pizza Pzazz
...,...,...,...,...,...
119,0,3,1.5,3,Didi's Greek
120,0,3,1.5,3,Didi's Greek
121,0,3,1.5,3,Didi's Greek
122,0,3,1.5,3,Didi's Greek


In [17]:
mf_pred = pd.DataFrame(user_embedding_learnt)
mf_pred.head()
user_index = train.loc[train["user_id"]==user_id]['user_id'][:1].values[0]
#print(user_index)
#get movie ratings predicted for this user and sort by highest rating prediction
sorted_user_predictions = pd.DataFrame(mf_pred.iloc[user_index].sort_values(ascending=False))
#rename the columns
sorted_user_predictions.columns=['Ratings']
#save the index values as res id
sorted_user_predictions['Restaurant_id']=sorted_user_predictions.index
print("Top 10 predictions for User " + str(user_id))
#display the top 10 predictions for this user
#print(sorted_user_predictions)
topn3 = pd.merge(sorted_user_predictions,Restaurant_list,left_on="Restaurant_id", right_on = "resId")[:10]
#df_names = pd.merge(ratings,Restaurant_list,left_on='Restaurant_id',right_on='resId')
topn3

Top 10 predictions for User 0


Unnamed: 0,Ratings,Restaurant_id,resId,title
0,0.4754,35,35,Viet Sub
1,0.440175,86,86,Sun Yee Cafe
2,0.396927,34,34,Tayybeh
3,0.375394,12,12,The Lemon Square
4,0.337384,10,10,Domino's Pizza
5,0.332815,45,45,Takis Taverna
6,0.325336,75,75,Kamei Royal
7,0.312246,6,6,Cactus Club Cafe
8,0.299674,23,23,Truong Thanh Vietnamese Restaurant
9,0.294317,65,65,Indian Delicacy


In [18]:
mf_pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.348315,-0.18617,-0.347851,-0.179136,0.078442,-0.161684,0.312246,0.151119,-0.053012,0.129749,...,-0.29946,0.065613,-0.28598,-0.012746,0.031147,0.100736,-0.108219,-0.107366,-0.28266,0.137227
1,-0.169769,-0.090411,-0.35126,-0.137184,-0.036906,0.086217,0.353354,0.01452,0.108068,-0.260935,...,0.038147,0.367689,-0.089343,0.084327,-0.330918,-0.59803,-0.152331,0.091033,0.102064,-0.079911
2,-0.030965,-0.050982,-0.037893,0.018136,0.001626,-0.003923,0.056139,0.001882,-0.046992,0.013678,...,-0.062513,0.040376,-0.012191,0.04027,6.9e-05,0.009438,0.032052,-0.012965,-0.012461,0.027925
3,0.013285,-0.020615,0.180415,-0.371004,0.418155,-0.052517,-0.071278,-0.110325,-0.255757,0.191341,...,0.091463,-0.223967,0.053962,-0.369798,-0.084856,0.277954,-0.361596,-0.399862,0.044384,0.06323
4,-0.405901,-0.231913,-0.232042,0.122836,-0.070353,-0.149428,0.242661,0.298739,0.068657,0.015644,...,-0.498326,-0.121536,-0.356709,0.100465,0.433454,0.07802,0.194478,0.135371,-0.366219,0.035618


In [19]:
from sklearn.model_selection import StratifiedKFold
import numpy
seed = 7
numpy.random.seed(seed)
#dataset = numpy.loadtxt("res3.csv", delimiter=",")
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
scores = model3.evaluate([test.user_id, test.restaurant_id], test.Rating_x, verbose=0)
cvscores.append(scores*100)
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

0.23% (+/- 0.00%)


In [20]:
from keras.constraints import non_neg
def matrix_factorisation_model_with_n_latent_factors_and_non_negative_embedding(n_latent_factors) :
    res_input = keras.layers.Input(shape=[1],name='Item')
    res_embedding = keras.layers.Embedding(n_res + 1, n_latent_factors, name='Non-Negative-res-Embedding',embeddings_constraint=non_neg())(res_input)
    res_vec = keras.layers.Flatten(name='Flattenres')(res_embedding)

    user_input = keras.layers.Input(shape=[1],name='User')
    user_vec = keras.layers.Flatten(name='FlattenUsers')(keras.layers.Embedding(n_users + 1, n_latent_factors,name='Non-Negative-User-Embedding',embeddings_constraint=non_neg())(user_input))
    prod =dot([res_vec, user_vec], axes=1, normalize=False,name='DotProduct')
    
    model = keras.Model([user_input, res_input], prod)
    #print(model)
    model.compile('adam', 'mean_squared_error')
    
    return model

In [21]:
model2 = matrix_factorisation_model_with_n_latent_factors_and_non_negative_embedding(10)

In [22]:
history_nonneg = model2.fit([train.user_id, train.restaurant_id], train.Rating_x, epochs=50, verbose=0)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [23]:
y_hat = np.round(model2.predict([test.user_id, test.restaurant_id]),0)
y_true = test.Rating_x

In [24]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true, y_hat)

0.3985637342908438

In [25]:
res_embedding_learnt_2 = model2.get_layer(name = "Non-Negative-res-Embedding").get_weights()[0]
#pd.DataFrame(res_embedding_learnt_2).describe()
res_embedding_learnt_2

array([[ 0.42904505,  0.60288525,  0.60749084,  0.19550571, -0.        ,
         0.92121977,  1.2067546 ,  0.40468898,  0.59408426,  0.1146347 ],
       [ 0.78364193,  0.80105406,  1.0226455 ,  0.91428983,  0.9486882 ,
         0.9810385 ,  0.7062515 ,  0.8040179 ,  0.7992881 ,  0.83794147],
       [ 0.40131703,  0.1944141 ,  0.5690541 ,  0.38655972,  0.4845404 ,
         0.28249827,  0.30311587,  0.5294493 ,  0.2924208 ,  0.4917919 ],
       [ 0.81704545,  0.6809631 ,  0.6520003 ,  0.96473217,  1.0089903 ,
         0.3861151 ,  0.08740885,  0.732016  ,  0.6051296 ,  0.95359075],
       [ 0.65199614,  0.62938935,  0.04416304,  0.51247776,  0.4130313 ,
         0.26778018,  0.6423992 ,  0.65347284,  0.69633746,  0.6420616 ],
       [ 0.61496997,  0.666567  ,  0.63168806,  0.54499733,  0.49403125,
         0.7108486 ,  0.9318057 ,  0.75091547,  0.6545116 ,  0.5853988 ],
       [ 0.535559  ,  0.48753458,  0.4877365 ,  0.5631468 ,  0.5214618 ,
         0.5350216 ,  0.4726816 ,  0.55078626

In [26]:
user_embedding_learnt_2 = model2.get_layer(name = "Non-Negative-User-Embedding").get_weights()[0]
#pd.DataFrame(user_embedding_learnt_2).describe()
user_embedding_learnt_2

array([[ 2.00233117e-01,  1.46537691e-01,  1.34536132e-01,
         2.25316420e-01,  2.16447413e-01,  8.59405994e-02,
        -0.00000000e+00,  1.01087481e-01,  1.57355160e-01,
         2.24003196e-01],
       [ 1.36115521e-01,  7.12857291e-05,  4.77748245e-01,
         7.04955459e-02,  1.64236754e-01,  1.95743382e-01,
         1.43879265e-01,  1.59282535e-01,  7.62967742e-04,
         8.85925293e-02],
       [ 2.91345060e-01,  3.62605870e-01,  3.39002073e-01,
         1.94371536e-01,  1.34829447e-01,  4.48413491e-01,
         5.36631107e-01,  2.22414955e-01,  3.35386038e-01,
         1.35907918e-01],
       [ 6.45203769e-01,  6.03056967e-01,  8.99741799e-02,
         4.82692033e-01,  4.02062863e-01,  2.57292688e-01,
         6.67648256e-01,  6.37483120e-01,  6.65460050e-01,
         6.17404103e-01],
       [ 3.46900284e-01,  2.28999689e-01,  2.19798833e-01,
         5.17127812e-01,  6.46185815e-01, -0.00000000e+00,
        -0.00000000e+00,  3.43479067e-01,  1.90112531e-01,
         5.

In [27]:
mf_pred_2 = pd.DataFrame(user_embedding_learnt_2)
mf_pred_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.200233,0.146538,0.134536,0.225316,0.216447,0.085941,-0.0,0.101087,0.157355,0.224003
1,0.136116,7.1e-05,0.477748,0.070496,0.164237,0.195743,0.143879,0.159283,0.000763,0.088593
2,0.291345,0.362606,0.339002,0.194372,0.134829,0.448413,0.536631,0.222415,0.335386,0.135908
3,0.645204,0.603057,0.089974,0.482692,0.402063,0.257293,0.667648,0.637483,0.66546,0.617404
4,0.3469,0.229,0.219799,0.517128,0.646186,-0.0,-0.0,0.343479,0.190113,0.560905


In [28]:
user_index = train.loc[train["user_id"]==user_id]['user_id'][:1].values[0]
#print(user_index)
#get movie ratings predicted for this user and sort by highest rating prediction
sorted_user_predictions = pd.DataFrame(mf_pred_2.iloc[user_index].sort_values(ascending=False))
#rename the columns
print(sorted_user_predictions)
sorted_user_predictions.columns=['Ratings']
#save the index values as res id
sorted_user_predictions['Restaurant_id']=sorted_user_predictions.index
print("Top 10 predictions for User " + str(user_id))
#display the top 10 predictions for this user
#print(sorted_user_predictions)
topn2 = pd.merge(sorted_user_predictions,Restaurant_list,left_on='Restaurant_id' ,right_on = "resId")[:10]
#df_names = pd.merge(ratings,Restaurant_list,left_on='Restaurant_id',right_on='resId')
topn2

          0
3  0.225316
9  0.224003
4  0.216447
0  0.200233
8  0.157355
1  0.146538
2  0.134536
7  0.101087
5  0.085941
6 -0.000000
Top 10 predictions for User 0


Unnamed: 0,Ratings,Restaurant_id,resId,title
0,0.225316,3,3,Didi's Greek
1,0.224003,9,9,J Crepe
2,0.216447,4,4,Juliet's Cafe and Catering
3,0.200233,0,0,La Taqueria Pinche Taco Shop
4,0.157355,8,8,HY Tea Lounge
5,0.146538,1,1,Sirloiner Restaurants
6,0.134536,2,2,Pizza Garden
7,0.101087,7,7,Tim Hortons
8,0.085941,5,5,Pizza Pzazz
9,-0.0,6,6,Cactus Club Cafe


In [29]:
from sklearn.model_selection import StratifiedKFold
import numpy
seed = 7
numpy.random.seed(seed)
#dataset = numpy.loadtxt("res3.csv", delimiter=",")
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
scores = model2.evaluate([test.user_id, test.restaurant_id], test.Rating_x, verbose=0)
cvscores.append(scores*100)
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

47.40% (+/- 0.00%)


In [30]:
def matrix_factorisation_model_with_n_latent_factors(n_latent_factors) :
    res_input = keras.layers.Input(shape=[1],name='Item')
    res_embedding = keras.layers.Embedding(n_res + 1, n_latent_factors, name='res-Embedding')(res_input)
    res_vec = keras.layers.Flatten(name='Flattenres')(res_embedding)
    user_input = keras.layers.Input(shape=[1],name='User')
    user_vec = keras.layers.Flatten(name='FlattenUsers')(keras.layers.Embedding(n_users + 1, n_latent_factors,name='User-Embedding')(user_input))
    prod =dot([res_vec, user_vec], axes=1, normalize=False,name='DotProduct')
    model = keras.Model([user_input, res_input], prod)
    model.compile('adam', 'mean_squared_error')
    
    return model

In [31]:
model = matrix_factorisation_model_with_n_latent_factors(10)

In [32]:
history = model.fit([train.user_id, train.restaurant_id], train.Rating_x, epochs=50, verbose=0)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [33]:
y_hat = np.round(model.predict([test.user_id, test.restaurant_id]),0)
y_true = test.Rating_x
mean_absolute_error(y_true, y_hat)

0.3267504488330341

In [34]:
res_embedding_learnt_1 = model.get_layer(name='res-Embedding').get_weights()[0]
pd.DataFrame(res_embedding_learnt_1).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,0.601827,-0.482782,-0.16731,-0.314411,-0.355624,0.401221,0.469343,0.645775,0.370054,-0.424186
std,0.252989,0.403587,0.503819,0.426502,0.395817,0.485357,0.387299,0.233176,0.428513,0.380991
min,0.009834,-1.04326,-0.963401,-0.90972,-0.799795,-0.825953,-0.579352,-0.034735,-0.530604,-0.871073
25%,0.559159,-0.679969,-0.639738,-0.657783,-0.694993,0.227881,0.310157,0.607263,0.230423,-0.656695
50%,0.65288,-0.580809,-0.137749,-0.382376,-0.398474,0.539919,0.575566,0.725094,0.370833,-0.539759
75%,0.753922,-0.400651,0.213183,-0.007368,-0.170336,0.617752,0.744624,0.784815,0.725514,-0.37075
max,0.876081,0.759676,0.595024,0.507644,0.511217,1.162595,0.927368,0.876222,0.846723,0.567127


In [35]:
user_embedding_learnt_1 = model.get_layer(name = "User-Embedding").get_weights()[0]
pd.DataFrame(user_embedding_learnt_1).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,0.555291,-0.436316,-0.149151,-0.268522,-0.388578,0.376147,0.357141,0.564912,0.450059,-0.328028
std,0.240238,0.399602,0.605382,0.476816,0.398806,0.4322,0.433882,0.254172,0.356962,0.433816
min,-0.020152,-0.885736,-0.927681,-0.944138,-0.902585,-0.768972,-0.744758,-0.031463,-0.18472,-0.908418
25%,0.404914,-0.702999,-0.695178,-0.710761,-0.769094,0.149004,0.174872,0.409925,0.233139,-0.672084
50%,0.581045,-0.606599,-0.354156,-0.386687,-0.423861,0.549985,0.482521,0.65671,0.445294,-0.419941
75%,0.729627,-0.220426,0.405914,0.212904,-0.1989,0.695691,0.684384,0.757553,0.753611,-0.167975
max,0.862783,0.674383,0.968941,0.501241,0.690117,0.84264,0.950562,0.940655,1.026828,0.786549


In [36]:
mf_pred_1 = pd.DataFrame(user_embedding_learnt_1)
mf_pred_1.head()
user_index = train.loc[train["user_id"]==user_id]['user_id'][:1].values[0]
#print(user_index)
#get movie ratings predicted for this user and sort by highest rating prediction
sorted_user_predictions = pd.DataFrame(mf_pred_1.iloc[user_index].sort_values(ascending=False))
#rename the columns
sorted_user_predictions.columns=['Ratings']
#save the index values as res id
sorted_user_predictions['Restaurant_id']=sorted_user_predictions.index
print("Top 10 predictions for User " + str(user_id))
#display the top 10 predictions for this user
#print(sorted_user_predictions)
topn = pd.merge(sorted_user_predictions,Restaurant_list,left_on='Restaurant_id' ,right_on = "resId")[:10]
#df_names = pd.merge(ratings,Restaurant_list,left_on='Restaurant_id',right_on='resId')
topn

Top 10 predictions for User 0


Unnamed: 0,Ratings,Restaurant_id,resId,title
0,0.501241,3,3,Didi's Greek
1,0.482833,2,2,Pizza Garden
2,0.233139,8,8,HY Tea Lounge
3,0.174872,6,6,Cactus Club Cafe
4,0.103543,5,5,Pizza Pzazz
5,0.102309,7,7,Tim Hortons
6,0.083717,0,0,La Taqueria Pinche Taco Shop
7,-0.104813,4,4,Juliet's Cafe and Catering
8,-0.148539,1,1,Sirloiner Restaurants
9,-0.215959,9,9,J Crepe


In [37]:
from sklearn.model_selection import StratifiedKFold
import numpy
seed = 7
numpy.random.seed(seed)
#dataset = numpy.loadtxt("res3.csv", delimiter=",")
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
scores = model.evaluate([test.user_id, test.restaurant_id], test.Rating_x, verbose=0)
cvscores.append(scores*100)
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

34.87% (+/- 0.00%)


In [53]:
details = pd.read_csv("new_sample_data.csv", usecols = ["user_id", "restaurant_id","title", "Details", "Tags"])
s_df = details[details["user_id"] == 0]
ll = []
ll = list(s_df["title"].unique())
for i in ll:
    print(i)

Chartreuse Moose Cappuccino Bar & Bistro
Jake's
A&W Restaurant
P Bass Fish and Chips


In [57]:
pd.set_option("display.max_colwidth", 300)
s_df[s_df["title"] == "Jake's"]["Details"]

327    {'CUISINES': 'Bar, Pub, Canadian', 'Meals': 'Dinner', 'FEATURES': 'Seating, Television, Wheelchair Accessible, Table Service'}
Name: Details, dtype: object

In [62]:
pd.set_option("display.max_colwidth", 300)
s_df[s_df["title"] == "A&W Restaurant"]["Details"]

395                                                                                                                     {'CUISINES': 'Canadian'}
396                                           {'CUISINES': 'Fast food, Canadian', 'Meals': 'Lunch, Dinner', 'FEATURES': 'Wheelchair Accessible'}
397    {'CUISINES': 'American, Fast food, Canadian', 'Meals': 'Lunch, Dinner, Breakfast', 'FEATURES': 'Takeout, Seating, Wheelchair Accessible'}
398                                 {'CUISINES': 'Fast food, Canadian', 'Meals': 'Lunch', 'FEATURES': 'Takeout, Seating, Wheelchair Accessible'}
399                                 {'CUISINES': 'Fast food, Canadian', 'Meals': 'Lunch', 'FEATURES': 'Takeout, Seating, Wheelchair Accessible'}
                                                                         ...                                                                    
564                                           {'CUISINES': 'American, Fast food', 'Meals': 'Lunch, Dinner', 'FEATURES': 'Wheelchai