<h3> Import libraries </h3>

In [50]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp,NMF
from surprise import SVD
from surprise import accuracy
from surprise import SlopeOne,CoClustering
from sklearn.preprocessing import normalize

In [51]:
train_df  = pd.read_csv('interactions_train.csv')# read csv into ratings_df dataframe
train_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [52]:
train_df = train_df.sample(10000)
train_df.shape

(10000, 6)

In [53]:
train_df.drop(["u","i","date"],axis=1,inplace=True)

In [54]:
reader = Reader(rating_scale=(0,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(train_df,reader) #load dataset into Surprise datastructure Dataset

In [55]:
#create training set
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=None, shuffle=True)

<h3> Matrix Factorization </h3>

In [56]:
param_grid = {'n_factors': [50],
              'reg_all': [0.1],
              'n_epochs': [10,20]
              }

In [57]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5) 

In [58]:
gs.fit(data)

In [59]:
# best RMSE score
print(gs.best_score['rmse'])

0.9380775933043173


In [60]:
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

{'n_factors': 50, 'reg_all': 0.1, 'n_epochs': 20}


<h3> User based CF </h3>

In [61]:
param_grid = {'k': [ 5],
              'sim_options': {'name': ['pearson', 'cosine'],
                              'min_support': [1, 5],   #the minimum number of common items needed between users to consider them for similarity. For the item-based approach, this corresponds to the minimum number of common users for two items.
                              'user_based': [True]}
              }

In [62]:
gs_knn = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5) 

In [63]:
gs_knn.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Co

In [64]:
# best RMSE score
print(gs_knn.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs_knn.best_params['rmse'])

0.9474602533214433
{'k': 5, 'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': True}}


<h3> Item based CF </h3>

In [65]:
param_grid = {'k': [3, 5, 10, 20],
              'sim_options': {'name': ['pearson', 'cosine'],
                              'min_support': [1, 5],   #the minimum number of common items needed between users to consider them for similarity. For the item-based approach, this corresponds to the minimum number of common users for two items.
                              'user_based': [False]}
              }

In [66]:
gs_knn_item = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5) 

In [None]:
gs_knn_item.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Co

In [None]:
# best RMSE score
print(gs_knn_item.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs_knn_item.best_params['rmse'])

<h3> Generating predictions using MF </h3>

In [64]:
trainset = data.build_full_trainset()
#algo.fit(trainset)
svd = gs.best_estimator['rmse']
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24053d836d8>

In [65]:
test_df = pd.read_csv('interactions_test.csv')
test_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,8937,44551,2005-12-23,4.0,2,173538
1,56680,126118,2006-10-07,4.0,16,177847
2,349752,219596,2008-04-12,0.0,26,89896
3,628951,82783,2007-11-13,2.0,45,172637
4,92816,435013,2013-07-31,3.0,52,177935


In [66]:
test_df.drop(["u","i","date"],axis=1,inplace=True)

In [67]:
user_list=[]
recipe_list=[]
est_ratings=[]
for i, row in test_df.iterrows():
    u=row['user_id'];
    i=row['recipe_id'];
    user_list.append(str(u))
    recipe_list.append(str(i))
    pred=round(svd.predict(u,i).est,1)
    est_ratings.append(pred)
    #print (u, i,pred)

In [68]:
df_submission = pd.DataFrame(list(zip(user_list ,recipe_list, est_ratings)), 
           columns =['user_id','recipe_id', 'Rating'])

In [69]:
df_submission.head()

Unnamed: 0,user_id,recipe_id,Rating
0,8937.0,44551.0,4.6
1,56680.0,126118.0,4.6
2,349752.0,219596.0,4.6
3,628951.0,82783.0,4.6
4,92816.0,435013.0,4.6


In [141]:
df_submission.to_csv("predictions.csv", index=False)

<h3> Nutrition </h3>

In [154]:
raw_recipes  = pd.read_csv('RAW_recipes.csv',index_col=1)# read csv into ratings_df dataframe
raw_recipes.head()

Unnamed: 0_level_0,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
137739,arriba baked winter squash mexican style,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
31490,a bit different breakfast pizza,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
112140,all in the kitchen chili,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
59389,alouette potatoes,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
44061,amish tomato ketchup for canning,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [155]:
raw_recipes = raw_recipes.sample(100000)
raw_recipes.shape

(100000, 11)

In [156]:
raw_recipes = raw_recipes.drop(columns=['name','minutes', 'contributor_id', 'submitted','steps','description'])
raw_recipes.head()

Unnamed: 0_level_0,tags,nutrition,n_steps,ingredients,n_ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
166712,"['15-minutes-or-less', 'time-to-make', 'course...","[436.6, 45.0, 32.0, 33.0, 27.0, 51.0, 9.0]",10,"['hardboiled egg', 'green onion', 'salt', 'fre...",7
66082,"['30-minutes-or-less', 'time-to-make', 'course...","[347.9, 30.0, 35.0, 33.0, 48.0, 34.0, 6.0]",6,"['cucumbers', 'tuna', 'hard-boiled eggs', 'che...",10
429530,"['weeknight', '60-minutes-or-less', 'time-to-m...","[225.5, 9.0, 59.0, 21.0, 11.0, 9.0, 13.0]",8,"['all-bran cereal', 'quick-cooking oats', 'rai...",12
283115,"['time-to-make', 'course', 'main-ingredient', ...","[635.7, 52.0, 9.0, 18.0, 67.0, 78.0, 5.0]",14,"['chicken breasts', 'truffle oil', 'shallots',...",18
324197,"['30-minutes-or-less', 'time-to-make', 'course...","[743.9, 42.0, 45.0, 43.0, 84.0, 54.0, 27.0]",6,"['dry penne pasta', 'pasta sauce', 'tuna in ve...",7


In [157]:
raw_recipes.shape


(100000, 5)

In [158]:
raw_recipes[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']] = raw_recipes.nutrition.str.split(",",expand=True) 
raw_recipes['calories'] = raw_recipes['calories'].apply(lambda x: x.replace("[" ,""))
raw_recipes['carbohydrates'] = raw_recipes['carbohydrates'].apply(lambda x: x.replace("]" ,""))
raw_recipes[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']] =  raw_recipes[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']].astype(float)

In [159]:
raw_recipes.head()

Unnamed: 0_level_0,tags,nutrition,n_steps,ingredients,n_ingredients,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
166712,"['15-minutes-or-less', 'time-to-make', 'course...","[436.6, 45.0, 32.0, 33.0, 27.0, 51.0, 9.0]",10,"['hardboiled egg', 'green onion', 'salt', 'fre...",7,436.6,45.0,32.0,33.0,27.0,51.0,9.0
66082,"['30-minutes-or-less', 'time-to-make', 'course...","[347.9, 30.0, 35.0, 33.0, 48.0, 34.0, 6.0]",6,"['cucumbers', 'tuna', 'hard-boiled eggs', 'che...",10,347.9,30.0,35.0,33.0,48.0,34.0,6.0
429530,"['weeknight', '60-minutes-or-less', 'time-to-m...","[225.5, 9.0, 59.0, 21.0, 11.0, 9.0, 13.0]",8,"['all-bran cereal', 'quick-cooking oats', 'rai...",12,225.5,9.0,59.0,21.0,11.0,9.0,13.0
283115,"['time-to-make', 'course', 'main-ingredient', ...","[635.7, 52.0, 9.0, 18.0, 67.0, 78.0, 5.0]",14,"['chicken breasts', 'truffle oil', 'shallots',...",18,635.7,52.0,9.0,18.0,67.0,78.0,5.0
324197,"['30-minutes-or-less', 'time-to-make', 'course...","[743.9, 42.0, 45.0, 43.0, 84.0, 54.0, 27.0]",6,"['dry penne pasta', 'pasta sauce', 'tuna in ve...",7,743.9,42.0,45.0,43.0,84.0,54.0,27.0


In [160]:
raw_recipes = raw_recipes.drop(columns=['tags','nutrition', 'n_steps', 'ingredients','n_ingredients'])
raw_recipes.head()

Unnamed: 0_level_0,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
166712,436.6,45.0,32.0,33.0,27.0,51.0,9.0
66082,347.9,30.0,35.0,33.0,48.0,34.0,6.0
429530,225.5,9.0,59.0,21.0,11.0,9.0,13.0
283115,635.7,52.0,9.0,18.0,67.0,78.0,5.0
324197,743.9,42.0,45.0,43.0,84.0,54.0,27.0


In [161]:
raw_recipes.reset_index('id')
raw_recipes.head()

Unnamed: 0_level_0,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
166712,436.6,45.0,32.0,33.0,27.0,51.0,9.0
66082,347.9,30.0,35.0,33.0,48.0,34.0,6.0
429530,225.5,9.0,59.0,21.0,11.0,9.0,13.0
283115,635.7,52.0,9.0,18.0,67.0,78.0,5.0
324197,743.9,42.0,45.0,43.0,84.0,54.0,27.0


In [163]:
raw_recipes_normalized = pd.DataFrame(normalize(raw_recipes, axis=0))
raw_recipes_normalized.columns = raw_recipes.columns
raw_recipes_normalized.index = raw_recipes.index
raw_recipes_normalized.head()

Unnamed: 0_level_0,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
166712,0.001418,0.001487,0.000333,0.00082,0.001199,0.001412,0.00074
66082,0.00113,0.000991,0.000365,0.00082,0.002131,0.000941,0.000493
429530,0.000732,0.000297,0.000615,0.000522,0.000488,0.000249,0.001068
283115,0.002064,0.001718,9.4e-05,0.000447,0.002974,0.002159,0.000411
324197,0.002415,0.001388,0.000469,0.001069,0.003729,0.001495,0.002219
