In [2]:
##### IMPORT LIBRARIES 

# Main library 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm # Track progress 
from sklearn.metrics import mean_squared_error

# Similarity metric
from sklearn.metrics.pairwise import cosine_similarity 

# Recommender systems
import surprise as sp
from surprise import Reader, Dataset, SVD, evaluate, accuracy  
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

# To create deep learning models
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To create sparse matrices
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from scipy.sparse import vstack # To stack sparse matrices
import scipy.sparse as sp

# To light fm
# from lightfm import LightFM
# from lightfm.evaluation import precision_at_k

Using TensorFlow backend.


In [3]:
##### DATASETS 

# Normal train and test set (filtered and split)
df_train= pd.read_csv('../../data/generated/inter_train.csv')
df_train = df_train.dropna()
df_test= pd.read_csv('../../data/generated/inter_test.csv') 
df_test = df_test.dropna()


# Read as matrix format of train and test set
X_train = pd.read_csv('../../data/generated/inter_train.csv').pivot_table(
                            index='u', columns='i', values='rating', dropna=False)
X_test = pd.read_csv('../../data/generated/inter_test.csv').pivot_table(
                            index='u', columns='i', values='rating', dropna=False)

# Fill in missing values with -1 and add 1 to each rating (want NaN to 0)
train = X_train.fillna(-1)
train = train.add(1)
train2 = train.T  # make us save a few seconds in similarity computation 


# Full data - no train/test split 
full_data = pd.read_csv('../../data/generated/full_data_filtered.csv')
full_data_matrix = full_data.pivot_table(index='u', columns='i', values='rating', dropna=False)


# Create a dataframe giving the recipe's name along with its id 'i'
names = pd.read_csv('../../data/generated/names.csv')
names.columns=['i','name']
# names = names.set_index('i') # set 'i' as index, useful later. 

## Matrix facto using keras 

In [76]:
# TRAIN / TEST SET 


# Create user- & recipe-id mapping
df_filter= pd.read_csv('../../data/generated/inter_train.csv')
user_id_mapping = {id:i for i, id in enumerate(df_filter['u'].unique())}
recipe_id_mapping = {id:i for i, id in enumerate(df_filter['i'].unique())}


# Create correctly mapped train & testset
train_user_data = df_train['u'].map(user_id_mapping)
train_recipe_data = df_train['i'].map(recipe_id_mapping)

test_user_data = df_test['u'].map(user_id_mapping)
test_recipe_data = df_test['i'].map(recipe_id_mapping)

full_user_data = full_data['u'].map(user_id_mapping)
full_recipe_data = full_data['i'].map(recipe_id_mapping)


# Get input variable-sizes
users = len(user_id_mapping)  #10007
recipes = len(recipe_id_mapping) #11120
embedding_size = 20 # equivalent to number of latent factors 


In [77]:
##### Create model
# use keras (refer to documentation)

# Set input layers. We input respectively 10007 and 11120 items of size 1 (u and i) 
user_id_input = Input(shape=[1], name='user')  # shape (None,1)
recipe_id_input = Input(shape=[1], name='recipe') # shape (None,1)

# Create embedding layers for users and recipes
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)  # shape (None, 1, embedding_size)
recipe_embedding = Embedding(output_dim=embedding_size, 
                            input_dim=recipes,
                            input_length=1, 
                            name='item_embedding')(recipe_id_input) # shape (None, 1, embedding_size)

# Reshape the embedding layers (- like flattening)
user_vector = Reshape([embedding_size])(user_embedding) # shape (None, embedding_size)
recipe_vector = Reshape([embedding_size])(recipe_embedding) # shape (None, embedding_size)

# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, recipe_vector])  # shape (None,1)

# Instantiate a model given input and output layers. 
model = Model(inputs=[user_id_input, recipe_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')

In [78]:
# Fit model
model.fit([train_user_data, train_recipe_data],
          df_train['rating'],
          batch_size=400, 
          epochs=24,
          validation_data = ([test_user_data, test_recipe_data],df_test['rating']),
          shuffle=True)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 225338 samples, validate on 55925 samples
Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


<keras.callbacks.callbacks.History at 0x1a80ba6208>

In [89]:
### Investigate predictions on test set (unseen data)
# How our model generalises

# Compute predictions and compare them to the true value
y_pred2 = model.predict([test_user_data, test_recipe_data])

# Add names
# df_test = df_test.merge(names,on='i')
y_pred2 = pd.DataFrame(y_pred2)
df_test_copy = df_test.reset_index(drop=True)
y_pred2 = y_pred2.join(df_test_copy)
y_pred2 = y_pred2.rename(columns={0:'pred_rating'})

# Look at a user in particular
y_pred2 = y_pred2.drop(['Unnamed: 0'], axis=1)
y_pred2[y_pred2['u']==87]

Unnamed: 0,pred_rating,rating,u,i
1877,4.317087,4.0,87,78551
1878,4.284936,5.0,87,142774
1879,4.834454,5.0,87,103884
1880,4.67774,3.0,87,149422
1881,4.655463,5.0,87,114081
1882,4.556254,3.0,87,148228
1883,4.478168,5.0,87,172148
1884,4.853331,5.0,87,112084
1885,4.649054,5.0,87,72975
1886,4.794844,5.0,87,115658


In [208]:
### Print best recommendation 

user_index = 87 
# List of unrated recipes for a user - include train and test
unrated_recipes = full_data_matrix.iloc[user_index][full_data_matrix.iloc[user_index].isna()].index.tolist()

#len(lst)
unrated_recipes = pd.Series(unrated_recipes)
ur = unrated_recipes.map(recipe_id_mapping)
lst = [87] * len(unrated_recipes)
y_pred = model.predict([lst, ur])

y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.join(names)
y_pred = y_pred.rename(columns ={0:'score'})
y_pred.sort_values('score', ascending=False)[:10]

Unnamed: 0,score,i,name
4512,5.767901,70003,blasted chicken
10661,5.580947,170534,chocolate toffee squares
1642,5.52352,24672,delicious bourbon chicken glaze
3560,5.481469,54875,soutzoukakia greek meatballs in a tomato sauce
3786,5.465487,58488,moroccan spaghetti very low fat and healthy
7689,5.457678,122130,art s roast beef anderson indiana copycat
3064,5.44136,46762,vicki s apple coffee cake
7342,5.440838,115884,green beans to impress
9541,5.419204,151830,lentil soup with sausage
8273,5.41291,131439,delicious southern style deviled eggs


In [222]:
## Rerun model above while full data instead of a train/test split
# Only a few lines to modify in the code above


##### Create model
# use keras (refer to documentation)

# Set input layers. We input respectively 10007 and 11120 items of size 1 (u and i) 
user_id_input = Input(shape=[1], name='user')  # shape (None,1)
recipe_id_input = Input(shape=[1], name='recipe') # shape (None,1)

# Create embedding layers for users and recipes
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)  # shape (None, 1, embedding_size)
recipe_embedding = Embedding(output_dim=embedding_size, 
                            input_dim=recipes,
                            input_length=1, 
                            name='item_embedding')(recipe_id_input) # shape (None, 1, embedding_size)

# Reshape the embedding layers (- like flattening)
user_vector = Reshape([embedding_size])(user_embedding) # shape (None, embedding_size)
recipe_vector = Reshape([embedding_size])(recipe_embedding) # shape (None, embedding_size)

# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, recipe_vector])  # shape (None,1)


# Instantiate a model given input and output layers. 
model_bis = Model(inputs=[user_id_input, recipe_id_input], outputs=y)
model_bis.compile(loss='mse', optimizer='adam')

# Fit model
model_bis.fit([full_user_data, full_recipe_data],
          full_data['rating'],
          batch_size=400, 
          epochs=8,
          validation_split = 0.1,
          shuffle=True)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 253136 samples, validate on 28127 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.callbacks.History at 0x1b001a5080>

In [220]:
### Print best recommendation 

user_index = 87 
# List of unrated recipes for a user - include train and test
unrated_recipes = full_data_matrix.iloc[user_index][full_data_matrix.iloc[user_index].isna()].index.tolist()

#len(lst)
unrated_recipes = pd.Series(unrated_recipes)
ur = unrated_recipes.map(recipe_id_mapping)
lst = [87] * len(unrated_recipes)
y_pred = model.predict([lst, ur])

y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.join(names)
y_pred = y_pred.rename(columns ={0:'score'})
y_pred.sort_values('score', ascending=False)[:10]

Unnamed: 0,score,i,name
3451,5.363759,53086,chicken teriyaki
8886,5.312301,141651,tortellini spinach salad with sesame dressing
9120,5.301273,145644,i can cook yan s velvet corn soup
5464,5.283924,85426,french onion burgers
3786,5.283672,58488,moroccan spaghetti very low fat and healthy
8912,5.272288,141969,canadian date squares
8514,5.265934,135832,rich and bold bloody mary
10685,5.259289,170802,bea s roasted red potatoes
8778,5.254735,139680,baked chicken drumsticks
3301,5.25456,50821,lemon muffins


## Replace dot product by a dense layer

In [210]:
###### Full data

embed_size_user = 20
embed_size_recipe = 20

# Create user- & recipe-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_filter['u'].unique())}
recipe_id_mapping = {id:i for i, id in enumerate(df_filter['i'].unique())}

# Create correctly mapped train & testset
train_user_data = df_train['u'].map(user_id_mapping)
train_recipe_data = df_train['i'].map(recipe_id_mapping)

test_user_data = df_test['u'].map(user_id_mapping)
test_recipe_data = df_test['i'].map(recipe_id_mapping)

# Get input variable-sizes
users = len(user_id_mapping)  #10007
recipes = len(recipe_id_mapping) #11120

In [211]:
##### Create model
# use keras (refer to documentation)

# Set input layers. We input respectively 10007 and 11120 items of size 1 (u and i) 
user_id_input = Input(shape=[1], name='user')  # shape (None,1)
recipe_id_input = Input(shape=[1], name='recipe') # shape (None,1)

# Create embedding layers for users and recipes
user_embedding = Embedding(output_dim=embed_size_user, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)  # shape (None, 1, embedding_size)
recipe_embedding = Embedding(output_dim=embed_size_recipe, 
                            input_dim=recipes,
                            input_length=1, 
                            name='item_embedding')(recipe_id_input) # shape (None, 1, embedding_size)

# Reshape the embedding layers (- like flattening)
user_vector = Reshape([embed_size_user])(user_embedding) # shape (None, embedding_size)
recipe_vector = Reshape([embed_size_recipe])(recipe_embedding) # shape (None, embedding_size)


# Concatenate the reshaped embedding layers
concat = Concatenate()([user_vector, recipe_vector])

# Combine with dense layers
dense = Dense(256)(concat)
y = Dense(1)(dense)


# Instantiate a model given input and output layers. 
model2 = Model(inputs=[user_id_input, recipe_id_input], outputs=y)
model2.compile(loss='mse', optimizer='adam')

# Fit model  
model2.fit([train_user_data, train_recipe_data],
          df_train['rating'],    # df_train
          batch_size= 500, 
          epochs=6,
          validation_data = ([test_user_data, test_recipe_data],df_test['rating']),
          shuffle=True)



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 225338 samples, validate on 55925 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.callbacks.History at 0x1a465c84e0>

In [None]:
### Investigate predictions on test set (unseen data)
# How our model generalises

# Compute predictions and compare them to the true value
y_pred2 = model.predict([test_user_data, test_recipe_data])

# Add names
# df_test = df_test.merge(names,on='i')
y_pred2 = pd.DataFrame(y_pred2)
df_test_copy = df_test.reset_index(drop=True)
y_pred2 = y_pred2.join(df_test_copy)
y_pred2 = y_pred2.rename(columns={0:'pred_rating'})

# Look at a user in particular
y_pred2 = y_pred2.drop(['Unnamed: 0'], axis=1)
y_pred2[y_pred2['u']==87]

In [215]:
### Print best recommendation 

user_index = 87 
# List of unrated recipes for a user - include train and test
unrated_recipes = full_data_matrix.iloc[user_index][full_data_matrix.iloc[user_index].isna()].index.tolist()

#len(lst)
unrated_recipes = pd.Series(unrated_recipes)
ur = unrated_recipes.map(recipe_id_mapping)
lst = [87] * len(unrated_recipes)
y_pred = model2.predict([lst, ur])

y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.join(names)
y_pred = y_pred.rename(columns ={0:'score'})
y_pred.sort_values('score', ascending=False)[:10]

Unnamed: 0,score,i,name
2292,5.358158,35020,cool as a cucumber cheese cucumber and chiv...
658,5.356962,9462,kittencal s greek couscous
1330,5.356533,19692,instant gingerbread coffee
3812,5.352034,58814,pumpkin pie made with tofu no milk or eggs
6356,5.349389,100343,peaches and cream cake
10170,5.347365,162686,chocolate chip peanut butter ball cookies
6932,5.345346,109205,chili s egg rolls lightened up
8886,5.338517,141651,tortellini spinach salad with sesame dressing
4512,5.334653,70003,blasted chicken
5464,5.329917,85426,french onion burgers


In [227]:
## Rerun model above while full data instead of a train/test split
# Only a few lines to modify in the code above

# Instantiate a model given input and output layers. 
model2_bis = Model(inputs=[user_id_input, recipe_id_input], outputs=y)
model2_bis.compile(loss='mse', optimizer='adam')

# Fit model  
model2_bis.fit([full_user_data, full_recipe_data],
          full_data['rating'],    # df_train
          batch_size= 500, 
          epochs=8,
          shuffle=True)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/8
Epoch 2/8
Epoch 3/8
 24000/281263 [=>............................] - ETA: 11s - loss: 0.5995

KeyboardInterrupt: 

In [228]:
### Print best recommendation 

user_index = 87 
# List of unrated recipes for a user - include train and test
unrated_recipes = full_data_matrix.iloc[user_index][full_data_matrix.iloc[user_index].isna()].index.tolist()

#len(lst)
unrated_recipes = pd.Series(unrated_recipes)
ur = unrated_recipes.map(recipe_id_mapping)
lst = [87] * len(unrated_recipes)
y_pred = model2.predict([lst, ur])

y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.join(names)
y_pred = y_pred.rename(columns ={0:'score'})
y_pred.sort_values('score', ascending=False)[:10]

Unnamed: 0,score,i,name
3451,5.335119,53086,chicken teriyaki
9120,5.251676,145644,i can cook yan s velvet corn soup
8912,5.233258,141969,canadian date squares
8886,5.23232,141651,tortellini spinach salad with sesame dressing
10015,5.231939,159543,sour cream chive bread bread machine
2681,5.227715,40667,fruity grilled cheese sandwich
5464,5.224542,85426,french onion burgers
3786,5.214517,58488,moroccan spaghetti very low fat and healthy
8514,5.209369,135832,rich and bold bloody mary
7109,5.20855,112115,camping meal in one packages


# SVD using Surprise library 

In [75]:
# Parse file containing ratings in the specific format (u,i,r) 
reader = Reader(rating_scale=(0, 5)) 

# Returns a dataset - input file contains ratings only
data = Dataset.load_from_df(df_train[['u', 'i', 'rating']], reader)  
# data = Dataset.load_from_df(full_data[['u', 'i', 'rating']], reader)

# Use the SVD algorithm - optimised param
svd = SVD(n_factors=10, n_epochs =20, biased=True, lr_all=0.005, reg_all=0.05)

# Evaluate performance - 2 very similar ways - we obtain great results and can then train the model 
evaluate(svd, data, measures=['RMSE', 'MAE'])
# cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8347
MAE:  0.4829
------------
Fold 2
RMSE: 0.8335
MAE:  0.4824
------------
Fold 3
RMSE: 0.8377
MAE:  0.4833
------------
Fold 4
RMSE: 0.8462
MAE:  0.4873
------------
Fold 5
RMSE: 0.8389
MAE:  0.4840
------------
------------
Mean RMSE: 0.8382
Mean MAE : 0.4840
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8347003555832817,
                             0.8335394965424772,
                             0.8377180199751019,
                             0.8462227901312087,
                             0.8389334940328221],
                            'mae': [0.48285541358160333,
                             0.4824336365226422,
                             0.48330804734828897,
                             0.48733130490065457,
                             0.4840436833664003]})

In [7]:
# Do not split the dataset into folds and just use the trainset as it is.
trainset = data.build_full_trainset() 

# Train the svd classifier 
svd.fit(trainset)
# svd.train(trainset)

# Evaluate
# evaluate(svd, data, measures=['RMSE', 'MAE'])
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8418  0.8437  0.8370  0.8283  0.8358  0.8373  0.0054  
MAE (testset)     0.4878  0.4882  0.4870  0.4819  0.4850  0.4860  0.0023  
Fit time          4.23    4.14    4.23    4.24    4.00    4.17    0.09    
Test time         0.90    0.67    0.70    0.69    0.85    0.76    0.09    


{'test_rmse': array([0.84181245, 0.84367297, 0.83698633, 0.82832628, 0.83582201]),
 'test_mae': array([0.48778039, 0.48821628, 0.48702706, 0.48187228, 0.48501283]),
 'fit_time': (4.225776195526123,
  4.142586708068848,
  4.230924129486084,
  4.2449939250946045,
  3.997856855392456),
 'test_time': (0.9006788730621338,
  0.6716551780700684,
  0.6973147392272949,
  0.6921169757843018,
  0.8490839004516602)}

In [237]:
# Evaluate performance on the testset
testset = trainset.build_testset()
predictions = svd.test(testset)

# Compute accuracy score 
accuracy.rmse(predictions, verbose=True)

RMSE: 0.8287


0.8287466864506491

In [55]:
df_train[df_train['u'] == 13]

Unnamed: 0.1,Unnamed: 0,rating,u,i
1500,95334,5.0,13,144909
1501,100489,5.0,13,66140
1502,206918,4.0,13,104684
1503,211262,4.0,13,22426
1504,218053,4.0,13,69972
1505,227099,5.0,13,112069
1506,286542,4.0,13,164320
1507,324376,5.0,13,41800
1508,347734,5.0,13,96766
1509,364899,5.0,13,155279


In [13]:
# Compute predictions for user 0 (train set) - compare to table above 
print(svd.predict(87, 946, r_ui=4)) # user 87, recipe 32541
print(svd.predict(87, 87717, r_ui=5)) # user 87, recipe 24204
print(svd.predict(87, 89385, r_ui=3)) # user 87, recipe 24204

user: 87         item: 946        r_ui = 4.00   est = 4.53   {'was_impossible': False}
user: 87         item: 87717      r_ui = 5.00   est = 4.78   {'was_impossible': False}
user: 87         item: 89385      r_ui = 3.00   est = 4.54   {'was_impossible': False}


In [18]:
df_test[df_test['u'] == 87]

Unnamed: 0.1,Unnamed: 0,rating,u,i
9387,174738,4.0,87,78551
9388,238734,5.0,87,142774
9389,268995,5.0,87,103884
9390,275263,3.0,87,149422
9391,281036,5.0,87,114081
9392,281094,3.0,87,148228
9393,292278,5.0,87,172148
9394,349418,5.0,87,112084
9395,367410,5.0,87,72975
9396,371050,5.0,87,115658


In [24]:
# Compute predictions for user 0 (test set) - compare to table above
print(svd.predict(87, 99509, r_ui=3)) # user 87, recipe 99509
print(svd.predict(87, 115658, r_ui=5)) # user 87, recipe 115658


user: 87         item: 99509      r_ui = 3.00   est = 4.38   {'was_impossible': False}
user: 87         item: 115658     r_ui = 5.00   est = 4.85   {'was_impossible': False}


In [96]:
# Check generalisation and accuracy of the predictions (using df_test)

"""
# Predict ratings of targeted new recipes 
print(svd.predict(0, 7)) # user 0, recipe 7 
print(svd.predict(5, 7))
"""

# Predict ratings of all recipes in the test set for a given user 
user_index= 87  # 84,170,25,19,87 are interesting

l=r=[]
l = df_test[df_test['u'] == user_index]['i'].tolist()
r = df_test[df_test['u'] == user_index]['rating'].tolist()

j=0
for j in range(len(l)):
    print(svd.predict(user_index, l[j], r_ui = r[j]))

user: 87         item: 78551      r_ui = 4.00   est = 4.50   {'was_impossible': False}
user: 87         item: 142774     r_ui = 5.00   est = 4.45   {'was_impossible': False}
user: 87         item: 103884     r_ui = 5.00   est = 4.82   {'was_impossible': False}
user: 87         item: 149422     r_ui = 3.00   est = 4.69   {'was_impossible': False}
user: 87         item: 114081     r_ui = 5.00   est = 4.71   {'was_impossible': False}
user: 87         item: 148228     r_ui = 3.00   est = 4.60   {'was_impossible': False}
user: 87         item: 172148     r_ui = 5.00   est = 4.46   {'was_impossible': False}
user: 87         item: 112084     r_ui = 5.00   est = 4.82   {'was_impossible': False}
user: 87         item: 72975      r_ui = 5.00   est = 4.66   {'was_impossible': False}
user: 87         item: 115658     r_ui = 5.00   est = 4.81   {'was_impossible': False}
user: 87         item: 99509      r_ui = 3.00   est = 4.49   {'was_impossible': False}


In [231]:
#### Print list of N-recommendations 


# Consider user 0 again
user_index = 87 

# List of unrated recipes for a user - include train and test
unrated_recipes = full_data_matrix.iloc[user_index][full_data_matrix.iloc[user_index].isna()].index.tolist()

# Set to 4 the rating of each user wlog, it is not used below 
testset= [[user_index, iid, 4] for iid in unrated_recipes]

# Compute predictions for every recipe not tried already by user 0  
predictions = svd.test(testset)
pred_ratings = np.array([pred.est for pred in predictions])

# Note that 724 ratings have a predicted value of 5.
import collections
collections.Counter(pred_ratings==5)

# Return 10 best recommendations for this user. (bias towards order in the dataset)
j=0
index_recipes=[]
predicted_rating=[]
while j < 10: 
    i_max = pred_ratings.argmax()
    iid = unrated_recipes[i_max]
    print('for user {0}, we recommend recipe {1} whose predicted ratings is {2}'.format(user_index, iid, pred_ratings[i_max]))
    index_recipes.append(iid)
    predicted_rating.append(pred_ratings[i_max])
    pred_ratings = np.delete(pred_ratings, i_max)
    j+=1
    
# Create dataframe of recommendations
d = {'i': index_recipes, 'predicted ratings': predicted_rating}
reco = pd.DataFrame(d)
reco['u']=user_index

# In this particular case, I could randomly sample recipes from all the ones having a predicted rating of 5 

for user 87, we recommend recipe 11933 whose predicted ratings is 5.0
for user 87, we recommend recipe 57778 whose predicted ratings is 5.0
for user 87, we recommend recipe 74189 whose predicted ratings is 5.0
for user 87, we recommend recipe 35297 whose predicted ratings is 4.998671447665432
for user 87, we recommend recipe 96746 whose predicted ratings is 4.994138387556054
for user 87, we recommend recipe 80222 whose predicted ratings is 4.9937240392867555
for user 87, we recommend recipe 148675 whose predicted ratings is 4.993216012800818
for user 87, we recommend recipe 58698 whose predicted ratings is 4.980812423252135
for user 87, we recommend recipe 136173 whose predicted ratings is 4.97959445756168
for user 87, we recommend recipe 37440 whose predicted ratings is 4.9730508982095305


In [232]:
reco.merge(names,on='i')

Unnamed: 0,i,predicted ratings,u,name
0,11933,5.0,87,peaches cream pie
1,57778,5.0,87,died and gone to heaven chocolate cake
2,74189,5.0,87,crunchy tossed salad
3,35297,4.998671,87,paradise mango lemonade
4,96746,4.994138,87,cole slaw with beans and bacon
5,80222,4.993724,87,bacon mushroom chicken
6,148675,4.993216,87,paula s el paso burgers
7,58698,4.980812,87,baked falafel balls
8,136173,4.979594,87,strawberry yogurt pancakes
9,37440,4.973051,87,cinnamon pumpkin banana bread


In [None]:
# Tune parameters (lambda, gamma)

param_grid = {'n_factors'=[5,10,20], 'n_epochs': [15], 'lr_all': [0.01], 'reg_all': [0.5]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=4)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

## Use the full matrix as training set (no test set)

In [233]:
# Parse file containing ratings in the specific format (u,i,r) 
reader = Reader(rating_scale=(0, 5)) 

# Returns a dataset - input file contains ratings only
f_data = Dataset.load_from_df(full_data[['u', 'i', 'rating']], reader)  

# Create train/test
# trainset, testset = train_test_split(f_data, test_size=.2)
trainset = f_data.build_full_trainset() 

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a807a4630>

In [100]:
# Predict ratings for all pairs (u, i) that are in the training set.
# predictions = svd.test(testset)
testset = trainset.build_testset()
predictions = algo.test(testset)

# Compute accuracy score 
accuracy.rmse(predictions, verbose=True)

# Evaluate
evaluate(algo, f_data, measures=['RMSE', 'MAE'])

# Requires too much computing power to compute predictions for all cells of the full matrix

# Predict ratings for all pairs (u, i) that are NOT in the training set.
# testset = trainset.build_anti_testset()
# predictions = svd.test(testset)

RMSE: 0.6437
Evaluating RMSE, MAE of algorithm SVD.





------------
Fold 1
RMSE: 0.8858
MAE:  0.5046
------------
Fold 2
RMSE: 0.8797
MAE:  0.5018
------------
Fold 3
RMSE: 0.8730
MAE:  0.5025
------------
Fold 4
RMSE: 0.8832
MAE:  0.5067
------------
Fold 5
RMSE: 0.8893
MAE:  0.5072
------------
------------
Mean RMSE: 0.8822
Mean MAE : 0.5046
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.885821260758604,
                             0.8796562348714713,
                             0.8730036212331056,
                             0.8831846656270435,
                             0.8892815641275748],
                            'mae': [0.5046333426212304,
                             0.5018213831745001,
                             0.5024743697274335,
                             0.5067071961372774,
                             0.5072378306704332]})

In [235]:
#### Print list of N-recommendations 


# Consider user 0 again
user_index = 87

# List of unrated recipes for a user - include train and test
unrated_recipes = full_data_matrix.iloc[user_index][full_data_matrix.iloc[user_index].isna()].index.tolist()

# Set to 4 the rating of each user wlog, it is not used below 
testset= [[user_index, iid, 4] for iid in unrated_recipes]

# Compute predictions for every recipe not tried already by user 0  
predictions = algo.test(testset)
pred_ratings = np.array([pred.est for pred in predictions])

# Note that numerous ratings have a predicted value of 5.
import collections
collections.Counter(pred_ratings==5)

# Return 10 best recommendations for this user. (bias towards order in the dataset)
j=0
index_recipes=[]
predicted_rating=[]
while j < 10: 
    i_max = pred_ratings.argmax()
    iid = unrated_recipes[i_max]
    print('for user {0}, we recommend recipe {1} whose predicted ratings is {2}'.format(user_index, iid, pred_ratings[i_max]))
    index_recipes.append(iid)
    predicted_rating.append(pred_ratings[i_max])
    pred_ratings = np.delete(pred_ratings, i_max)
    j+=1
    
# Create dataframe of recommendations
d = {'i': index_recipes, 'predicted ratings': predicted_rating}
reco = pd.DataFrame(d)
reco['u']=user_index
reco 

# In this particular case, I could randomly sample recipes from all the ones having a predicted rating of 5 

for user 87, we recommend recipe 3083 whose predicted ratings is 5.0
for user 87, we recommend recipe 3510 whose predicted ratings is 5.0
for user 87, we recommend recipe 6687 whose predicted ratings is 5.0
for user 87, we recommend recipe 9489 whose predicted ratings is 5.0
for user 87, we recommend recipe 10899 whose predicted ratings is 5.0
for user 87, we recommend recipe 15491 whose predicted ratings is 5.0
for user 87, we recommend recipe 15775 whose predicted ratings is 5.0
for user 87, we recommend recipe 17129 whose predicted ratings is 5.0
for user 87, we recommend recipe 17696 whose predicted ratings is 5.0
for user 87, we recommend recipe 17817 whose predicted ratings is 5.0


Unnamed: 0,i,predicted ratings,u
0,3083,5.0,87
1,3510,5.0,87
2,6687,5.0,87
3,9489,5.0,87
4,10899,5.0,87
5,15491,5.0,87
6,15775,5.0,87
7,17129,5.0,87
8,17696,5.0,87
9,17817,5.0,87


In [236]:
reco.merge(names,on='i')

Unnamed: 0,i,predicted ratings,u,name
0,3083,5.0,87,everyday french breakfast baguette and jam wi...
1,3510,5.0,87,good n easy macaroni salad
2,6687,5.0,87,ancho chile fries
3,9489,5.0,87,honey baked ham copycat
4,10899,5.0,87,olive garden pollo limone lemon chicken
5,15491,5.0,87,garlic roasted broccoli drizzled with balsamic...
6,15775,5.0,87,cake batter ice cream
7,17129,5.0,87,maple cinnamon coffee
8,17696,5.0,87,italian meatball hoagies
9,17817,5.0,87,thai fragrant vegetable soup


In [None]:
# Tune parameters (lambda, gamma)

param_grid = {'n_factors':[2, 10],'n_epochs': [15]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=4)
gs.fit(f_data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])