In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
import time
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, accuracy
from surprise.prediction_algorithms import KNNBasic, KNNWithMeans, SVD, NMF, SlopeOne, NormalPredictor
from surprise.model_selection import  GridSearchCV, ShuffleSplit

In [5]:
# Load the user/recipe interactions (user_id, recipe_id, date, rating, review).
# `infer_datetime_format=True` was removed: the argument is deprecated since pandas 2.0
# and only triggers a warning; `parse_dates` alone parses the `date` column.
interactions = pd.read_csv(
    "./food-com-recipes-and-user-interactions/RAW_interactions.csv",
    parse_dates=["date"],
)
interactions.head()


Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [6]:
# Load the recipe metadata (name, id, minutes, tags, nutrition, steps, ingredients, ...).
# `infer_datetime_format=True` was removed: the argument is deprecated since pandas 2.0
# and only triggers a warning; `parse_dates` alone parses the `submitted` column.
recipes = pd.read_csv(
    "./food-com-recipes-and-user-interactions/RAW_recipes.csv",
    parse_dates=["submitted"],
)
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [19]:
# Reduce the interactions to the three columns used for collaborative filtering:
# who rated (user_id), what was rated (recipe_id) and the rating itself.
rating_columns = ["user_id", "recipe_id", "rating"]
ratings = interactions[rating_columns]

In [21]:
# Keep users with more than 10 and fewer than 1000 ratings, then recipes with more than 10 ratings.
# The cell starts from `interactions` again, so it is idempotent: it can be re-run after
# changing a threshold without first re-running the cell that created `ratings`.
ratings = interactions[["user_id", "recipe_id", "rating"]]

# Filter out the 0 ratings
ratings = ratings[ratings.rating != 0]

# Number of ratings per user, broadcast to every row of that user
# (vectorised replacement for groupby().filter(lambda ...), same rows are kept).
ratings_per_user = ratings.groupby("user_id")["rating"].transform("size")
ratings = ratings[(ratings_per_user > 10) & (ratings_per_user < 1000)]

# Number of ratings per recipe, counted AFTER the user filter has been applied
ratings_per_recipe = ratings.groupby("recipe_id")["rating"].transform("size")
ratings = ratings[ratings_per_recipe > 10]

n_users_left = ratings.user_id.nunique()
n_recipes_left = ratings.recipe_id.nunique()

# Users left
print(f"Number of users: {n_users_left}\n")
# Recipes left
print(f"Number of recipes: {n_recipes_left}\n")
# Fraction of users that is left
print(f"Fraction of users that is left: {round(n_users_left / interactions.user_id.nunique(), 2)}\n")
# Fraction of recipes that is left (we already know that all recipes are in the interaction file)
print(f"Fraction of recipes that is left: {round(n_recipes_left / interactions.recipe_id.nunique(), 2)}\n")

Number of users: 10895

Number of recipes: 8577

Fraction of users that is left: 0.05

Fraction of recipes that is left: 0.04



# Experimenting

In [69]:
# Create the train/test split (no separate dev set is created here):
# 30% of the ratings are held out, stratified on the rating value, with a fixed seed.
ratings_train, ratings_test = train_test_split(
    ratings,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=ratings["rating"],
)

In [50]:
# Reshape the long training table into a recipe x user matrix.
# Cells without a rating are filled with 0.
ratings_train_p = (
    ratings_train
    .pivot(index="recipe_id", columns="user_id", values="rating")
    .fillna(0)
)
ratings_train_p.head()

user_id,1533,1535,1634,1676,1792,1891,1962,2178,2310,2312,...,2001297534,2001329932,2001330613,2001356926,2001359614,2001362355,2001410644,2001415211,2001436530,2001704911
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# Create a NumPy representation of the pivoted data frame for the similarity computations.
# Orientation is unchanged: rows are recipes, columns are users.
# NOTE(review): this rebinds `ratings_train_p` from a DataFrame to an ndarray, so the
# recipe/user labels are no longer attached to the matrix from here on.
ratings_train_p = np.array(ratings_train_p)
ratings_train_p

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [52]:
# Compute similarity
def compute_similarity(ratings, metric='correlation', type='user'):
    """Return a square user-user or item-item similarity matrix.

    The similarity is computed as ``1 - pairwise_distances(...)``, so ``metric`` should be
    a distance for which that conversion is meaningful (e.g. 'correlation' or 'cosine').

    Parameters
    ----------
    ratings : numpy.ndarray
        Ratings matrix with items as rows and users as columns.
    metric : str, default 'correlation'
        Distance metric forwarded to ``sklearn.metrics.pairwise.pairwise_distances``.
    type : {'user', 'item'}, default 'user'
        'user' compares the columns of ``ratings`` with each other,
        'item' compares the rows.

    Returns
    -------
    numpy.ndarray
        Square similarity matrix; NaN entries are replaced by 0.

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.
    """
    # Validate first so that a typo fails fast instead of after an expensive computation.
    if type == 'user':
        # pairwise_distances compares rows, so users (columns) have to become rows.
        samples, label = ratings.T, 'User'
    elif type == 'item':
        samples, label = ratings, 'Item'
    else:
        raise ValueError(f"type must be 'user' or 'item', got {type!r}")

    # 1 - pairwise distance to get the similarity!
    similarity = 1 - pairwise_distances(samples, metric=metric, n_jobs=-1)
    # Undefined similarities (NaN) are treated as "no similarity".
    similarity[np.isnan(similarity)] = 0
    print(f'Shape of {label} Similarity Matrix:', similarity.shape)
    return similarity

In [53]:
# Build both similarity matrices and report how long each one takes.
# time.perf_counter() is used for the measurement because it is a monotonic,
# high-resolution clock; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
# User Similarity Matrix
user_correlation = compute_similarity(ratings_train_p, type='user')
print("Runtime of user correlation: --- %s seconds ---" % (time.perf_counter() - start_time))
# Item Similarity Matrix
start_time = time.perf_counter()
item_correlation = compute_similarity(ratings_train_p, type='item')
print("Runtime of item correlation: --- %s seconds ---" % (time.perf_counter() - start_time))

Shape of User Similarity Matrix: (8037, 8037)
Runtime of user correlation: --- 110.37169122695923 seconds ---
Shape of Item Similarity Matrix: (5450, 5450)
Runtime of item correlation: --- 76.8810408115387 seconds ---


In [54]:
#the rating matrix is of shape |users|x|items|
def predict(ratings, similarity, type='user'):
    """Predict ratings from a ratings matrix and a similarity matrix.

    Parameters
    ----------
    ratings : numpy.ndarray
        Ratings matrix of shape (n_users, n_items).
    similarity : numpy.ndarray
        Square similarity matrix: (n_users, n_users) for ``type='user'``,
        (n_items, n_items) for ``type='item'`` and ``type='content'``.
    type : {'user', 'item', 'content'}, default 'user'
        * 'user': user mean + similarity-weighted average of the other users'
          deviations from their own mean.
        * 'item': item mean + similarity-weighted average of the deviations from
          the item means.
        * 'content': similarity-weighted average of the raw ratings.

    Returns
    -------
    numpy.ndarray
        Predictions with the same shape as ``ratings``, clipped below at 0.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported modes.

    Notes
    -----
    Means are taken over all entries of ``ratings``. With a zero-filled matrix
    (0 = "not rated") this pulls the means, and therefore the predictions, towards 0.
    """
    if type not in ('user', 'item', 'content'):
        raise ValueError(f"type must be 'user', 'item' or 'content', got {type!r}")

    # Total absolute similarity per row of the similarity matrix. The absolute value is
    # used because similarities (e.g. correlations) can be negative; without it positive
    # and negative weights would cancel out in the normaliser.
    # For a symmetric similarity matrix the row sums equal the column sums.
    weights = np.abs(similarity).sum(axis=1)

    def _normalise(weighted_sum, norm):
        # Element-wise weighted_sum / norm. Entries whose norm is 0 (no similar
        # neighbours at all) stay 0 instead of becoming NaN/inf, so the prediction
        # falls back to the mean (or to 0 in 'content' mode).
        result = np.zeros(np.broadcast(weighted_sum, norm).shape, dtype=float)
        np.divide(weighted_sum, norm, out=result, where=(norm != 0))
        return result

    if type == 'user':
        # Column vector (n_users, 1) so that it broadcasts across the items of each user.
        mean_user_rating = ratings.mean(axis=1)[:, np.newaxis]
        ratings_diff = ratings - mean_user_rating
        # Each row of similarity.dot(ratings_diff) is normalised by that user's total similarity.
        pred = mean_user_rating + _normalise(similarity.dot(ratings_diff), weights[:, np.newaxis])
    elif type == 'item':
        # Same computation as the user case, but on the item axis: the item means are a
        # row vector (1, n_items) and the similarity matrix is multiplied from the right.
        mean_item_rating = ratings.mean(axis=0)[np.newaxis, :]
        ratings_diff = ratings - mean_item_rating
        pred = mean_item_rating + _normalise(ratings_diff.dot(similarity), weights[np.newaxis, :])
    else:  # type == 'content'
        pred = _normalise(ratings.dot(similarity), weights[np.newaxis, :])
    return pred.clip(min=0)

In [55]:
# Predict the full users x recipes rating matrix with both neighbourhood models.
# predict() expects users as rows, hence the transpose of the recipes x users array.
user_prediction = predict(ratings=ratings_train_p.T, similarity=user_correlation, type='user')
item_prediction = predict(ratings=ratings_train_p.T, similarity=item_correlation, type='item')

In [57]:
# Wrap the raw prediction arrays in labelled data frames (rows: users, columns: recipes).
# The labels are the sorted unique ids of the training ratings.
# NOTE(review): this assumes the same ordering as the axes of the pivot table the
# prediction arrays were derived from - confirm if the pivot step changes.
user_ids = list(ratings_train.user_id.sort_values().unique())
recipe_ids = list(ratings_train.recipe_id.sort_values().unique())
user_pred_df = pd.DataFrame(user_prediction, index=user_ids, columns=recipe_ids)
item_pred_df = pd.DataFrame(item_prediction, index=user_ids, columns=recipe_ids)

In [58]:
# Preview the first rows of the user-based predictions (index: user ids, columns: recipe ids).
user_pred_df.head()

Unnamed: 0,62,153,198,246,376,432,445,519,536,607,...,495275,495577,495967,496573,496591,497261,514423,518143,518145,518229
1533,0.0,0.009017,0.000951,0.016077,0.002655,0.017425,0.008058,0.010821,0.0,0.0,...,0.000302,0.000176,0.00114,0.001622,0.001811,0.001557,0.00143,0.001841,0.002026,0.001909
1535,0.128486,0.200609,0.128111,0.128446,0.130043,0.129912,0.1308,0.133141,0.132463,0.13144,...,0.122688,0.122502,0.123929,0.124642,0.124923,0.124547,0.124358,0.124966,0.125241,0.125067
1634,0.0,0.009994,0.0,0.032062,0.002379,0.005889,0.00328,0.010985,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1676,0.000867,0.001732,0.00348,0.003278,0.020084,0.0,0.0,0.0,0.0,0.009962,...,0.002644,0.002483,0.003722,0.004342,0.004586,0.004259,0.004096,0.004624,0.004863,0.004712
1792,0.0,0.0,0.0,0.009377,0.0,0.0,0.026468,0.0,0.0,0.025835,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
# Preview the first rows of the item-based predictions (index: user ids, columns: recipe ids).
item_pred_df.head()

Unnamed: 0,62,153,198,246,376,432,445,519,536,607,...,495275,495577,495967,496573,496591,497261,514423,518143,518145,518229
1533,0.000314,0.0,0.004915,0.011885,0.0,0.013553,0.003899,0.009053,0.001172,0.017279,...,0.01558,0.015247,0.012557,0.011194,0.010222,0.012413,0.011991,0.011922,0.011035,0.012223
1535,0.019037,1.498534,0.0,0.024015,0.069824,0.02731,0.04323,0.089509,0.05325,0.046252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1634,0.001353,0.003892,0.006356,0.01353,0.0,0.008225,0.004595,0.009637,0.001885,0.018792,...,0.017108,0.016735,0.014016,0.012633,0.011623,0.013917,0.013454,0.013438,0.012525,0.01377
1676,0.000993,0.0,0.005856,0.008337,0.001576,0.002009,0.000659,0.004928,0.006905,0.0203,...,0.016577,0.016218,0.013509,0.012134,0.011136,0.013395,0.012946,0.012912,0.012008,0.013233
1792,0.010189,0.0,0.012746,0.014608,0.0,0.009085,0.014854,0.006325,0.007282,0.021415,...,0.014642,0.014334,0.011662,0.010312,0.009363,0.011491,0.011093,0.010992,0.010121,0.011274


# Evaluation of different CF-methods

In [116]:
# Evaluate collaborative filtering:
# preprocessing -> train/test split -> grid search per algorithm on the training part
# -> RMSE of the refitted best model on the held-out test part.

# --- Preprocessing -------------------------------------------------------------------
# Starts from `interactions`, so this cell can be re-run on its own after changing a threshold.
ratings = interactions[["user_id", "recipe_id", "rating"]]

# Filter out the 0 ratings
ratings = ratings[ratings.rating != 0]

# Keep users with more than 10 and fewer than 1000 ratings
# (vectorised replacement for groupby().filter(lambda ...), same rows are kept).
ratings_per_user = ratings.groupby("user_id")["rating"].transform("size")
ratings = ratings[(ratings_per_user > 10) & (ratings_per_user < 1000)]

# Keep recipes with more than 1 rating, counted after the user filter.
# Note: this is a looser threshold than the "> 10" used in the experimenting section.
ratings_per_recipe = ratings.groupby("recipe_id")["rating"].transform("size")
ratings = ratings[ratings_per_recipe > 1]

n_users_left = ratings.user_id.nunique()
n_recipes_left = ratings.recipe_id.nunique()

# Users left
print(f"Number of users: {n_users_left}\n")
# Recipes left
print(f"Number of recipes: {n_recipes_left}\n")
# Fraction of users that is left
print(f"Fraction of users that is left: {round(n_users_left / interactions.user_id.nunique(), 2)}\n")
# Fraction of recipes that is left (we already know that all recipes are in the interaction file)
print(f"Fraction of recipes that is left: {round(n_recipes_left / interactions.recipe_id.nunique(), 2)}\n")


# --- Split into train, test ----------------------------------------------------------
ratings_train, ratings_test = train_test_split(ratings, test_size=0.3, random_state=42, shuffle=True, stratify=ratings.rating)

# `trainset` is a surprise Dataset built from the training ratings; it is what GridSearchCV.fit receives.
# `testset` is built from the held-out ratings and is only used for the final score.
trainset = Dataset.load_from_df(ratings_train, Reader())
testset = Dataset.load_from_df(ratings_test, Reader()).build_full_trainset().build_testset()


# --- Prep grid search ----------------------------------------------------------------

# Params: one grid per entry in `algos`. An empty grid means "default configuration only".
# Every algorithm needs an entry (or the `{}` fallback used below): passing None as a grid
# makes GridSearchCV fail.
# The KNNBasic / KNNWithMeans grids (k in [20, 40, 60], cosine similarity, user_based in
# [True, False]) are disabled; to enable them add the classes to `algos` and their grids here.
param_grid = {
    "SVD": {"n_factors": [5, 15, 25, 50]},
    "NMF": {"n_factors": [5, 15, 25, 50]},
    "SlopeOne": {},
    "Baseline": {},
}

# Cross-validation
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42, shuffle=True)

# Algos
algos = {"SVD": SVD, "NMF": NMF, "SlopeOne": SlopeOne, "Baseline": NormalPredictor}

# Benchmark: one [name, test RMSE, best params] row per algorithm
benchmark = []


# --- Evaluate the algorithms ---------------------------------------------------------
for name, algo in algos.items():
    # refit=True is required so that gs.test() can score the best configuration on `testset`.
    gs = GridSearchCV(algo, param_grid.get(name, {}), measures=['rmse'], cv=cv, refit=True, n_jobs=-1)
    gs.fit(trainset)
    predictions = gs.test(testset)
    final_score = accuracy.rmse(predictions)
    benchmark.append([name, final_score, gs.best_params])

results = (
    pd.DataFrame(benchmark, columns=["Algorithm", "Final_RMSE", "Params"])
    .sort_values("Final_RMSE")
    .set_index("Algorithm")
)
results

Number of users: 10935

Number of recipes: 20428

Fraction of users that is left: 0.05

Fraction of recipes that is left: 0.09

RMSE: 0.5854
RMSE: 0.6327
RMSE: 0.6619
RMSE: 0.7930


Unnamed: 0_level_0,Final_RMSE,Params
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
SVD,0.585396,{'rmse': {'n_factors': 5}}
NMF,0.632712,{'rmse': {'n_factors': 25}}
SlopeOne,0.661855,{'rmse': {}}
Baseline,0.793001,{'rmse': {}}
