<h3> Import libraries </h3>

In [1]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp,NMF
from surprise import SVD
from surprise import accuracy
from surprise import SlopeOne,CoClustering
from sklearn.preprocessing import normalize

In [2]:
train_df  = pd.read_csv('interactions_train.csv')# read csv into ratings_df dataframe
train_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [3]:
train_df = train_df.sample(10000)
train_df.shape

(10000, 6)

In [4]:
train_df.drop(["u","i","date"],axis=1,inplace=True)

In [5]:
reader = Reader(rating_scale=(0,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(train_df,reader) #load dataset into Surprise datastructure Dataset

In [6]:
#create training set
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=None, shuffle=True)

<h3> Matrix Factorization </h3>

In [7]:
param_grid = {'n_factors': [50],
              'reg_all': [0.1],
              'n_epochs': [10,20]
              }

In [8]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5) 

In [58]:
gs.fit(data)

In [59]:
# best RMSE score
print(gs.best_score['rmse'])

0.9380775933043173


In [60]:
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

{'n_factors': 50, 'reg_all': 0.1, 'n_epochs': 20}


<h3> User based CF </h3>

In [61]:
param_grid = {'k': [ 5],
              'sim_options': {'name': ['pearson', 'cosine'],
                              'min_support': [1, 5],   #the minimum number of common items needed between users to consider them for similarity. For the item-based approach, this corresponds to the minimum number of common users for two items.
                              'user_based': [True]}
              }

In [62]:
gs_knn = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5) 

In [63]:
gs_knn.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Co

In [64]:
# best RMSE score
print(gs_knn.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs_knn.best_params['rmse'])

0.9474602533214433
{'k': 5, 'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': True}}


<h3> Item based CF </h3>

In [65]:
param_grid = {'k': [3, 5, 10, 20],
              'sim_options': {'name': ['pearson', 'cosine'],
                              'min_support': [1, 5],   #the minimum number of common items needed between users to consider them for similarity. For the item-based approach, this corresponds to the minimum number of common users for two items.
                              'user_based': [False]}
              }

In [66]:
gs_knn_item = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5) 

In [67]:
gs_knn_item.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Co

In [68]:
# best RMSE score
print(gs_knn_item.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs_knn_item.best_params['rmse'])

0.9466099005674307
{'k': 3, 'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': False}}


<h3> Generating predictions using MF </h3>

In [64]:
trainset = data.build_full_trainset()
#algo.fit(trainset)
svd = gs.best_estimator['rmse']
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24053d836d8>

In [65]:
test_df = pd.read_csv('interactions_test.csv')
test_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,8937,44551,2005-12-23,4.0,2,173538
1,56680,126118,2006-10-07,4.0,16,177847
2,349752,219596,2008-04-12,0.0,26,89896
3,628951,82783,2007-11-13,2.0,45,172637
4,92816,435013,2013-07-31,3.0,52,177935


In [66]:
test_df.drop(["u","i","date"],axis=1,inplace=True)

In [67]:
user_list=[]
recipe_list=[]
est_ratings=[]
for i, row in test_df.iterrows():
    u=row['user_id'];
    i=row['recipe_id'];
    user_list.append(str(u))
    recipe_list.append(str(i))
    pred=round(svd.predict(u,i).est,1)
    est_ratings.append(pred)
    #print (u, i,pred)

In [68]:
df_submission = pd.DataFrame(list(zip(user_list ,recipe_list, est_ratings)), 
           columns =['user_id','recipe_id', 'Rating'])

In [69]:
df_submission.head()

Unnamed: 0,user_id,recipe_id,Rating
0,8937.0,44551.0,4.6
1,56680.0,126118.0,4.6
2,349752.0,219596.0,4.6
3,628951.0,82783.0,4.6
4,92816.0,435013.0,4.6


In [141]:
df_submission.to_csv("predictions.csv", index=False)