<h3> Import libraries </h3>

In [74]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp,NMF
from surprise import SVD
from surprise import accuracy
from surprise import SlopeOne,CoClustering
from sklearn.preprocessing import normalize

In [75]:
# Load the raw user-recipe interactions into `train_df`; this is the frame that is
# later paired with the 0-5 rating-scale reader.
train_df  = pd.read_csv('interactions_train.csv')
# Preview the first rows to confirm the columns that were read.
train_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [76]:
# Load the interactions file that carries the `without_0_rating` column and keep
# only the (user_id, recipe_id, rating) triple that Surprise's `load_from_df` expects.
train_new_df = pd.read_csv('interactions_train_new.csv')
train_new_df = train_new_df.drop(columns=["u", "i", "date", "rating"])
# Fixed seed so the 30,000-row subsample (and every score computed from it) is
# reproducible after Restart & Run All.
train_new_df = train_new_df.sample(30000, random_state=42)
train_new_df.head()

Unnamed: 0,user_id,recipe_id,without_0_rating
433332,858860,53914,5.0
678700,1530179,155290,5.0
499624,542159,351657,5.0
58135,68715,96342,5.0
497433,262312,251137,5.0


In [77]:
# Subsample 30,000 interactions to keep the grid searches tractable; the fixed seed
# makes the subsample (and the downstream RMSE figures) reproducible.
train_df = train_df.sample(30000, random_state=42)
train_df.shape

(30000, 6)

In [78]:
# Keep only user_id, recipe_id and rating: the three columns Surprise consumes.
train_df = train_df.drop(columns=["u", "i", "date"])

In [79]:
# Reader describing the rating bounds of the raw data (0 through 5).
reader = Reader(rating_scale=(0, 5))
# Wrap the (user_id, recipe_id, rating) frame in a Surprise Dataset.
data = Dataset.load_from_df(df=train_df, reader=reader)

In [80]:
# Shuffle and hold out 20% of the 0-5 ratings as a test set (remaining arguments
# are left at Surprise's defaults, which match the values previously spelled out).
trainingSet, testSet = train_test_split(data, test_size=0.2)

In [81]:
# The cleaned ratings (column `without_0_rating`) span 1-5, so this dataset gets its
# own reader.
reader_0 = Reader(rating_scale=(1, 5))
# Pass `reader_0`, not the 0-5 `reader`: the reader's rating_scale is what Surprise
# uses to clip predictions, so the wrong reader would allow estimates below 1.
data_0 = Dataset.load_from_df(train_new_df, reader_0)
# Shuffle and hold out 20% of the 1-5 ratings as a test set.
trainingSet_0, testSet_0 = train_test_split(data_0, test_size=0.2, train_size=None, random_state=None, shuffle=True)

<h3> Matrix Factorization </h3>
<br/> <b>1. On training set with rating scale: 0-5</b>

In [82]:
# Empty results table; one row is appended per (rating scale, algorithm) experiment.
model_metrics = pd.DataFrame().assign(
    **{'Rating Scale': "", 'Algorithm': "", 'RMSE': 0}
)

# Hyperparameter grid for the SVD (matrix factorization) search: only the number of
# epochs is varied.
param_grid = dict(
    n_factors=[50],
    reg_all=[0.1],
    n_epochs=[10, 20],
)

In [83]:
# Grid search over the SVD hyperparameters, scored by RMSE and MAE with 5-fold CV.
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=5,
)

In [84]:
# Run the SVD grid search on the dataset built with the 0-5 rating scale.
gs.fit(data)

In [85]:
# Lowest mean cross-validated RMSE found by the grid search; print it and append it
# to the comparison table as the next row.
mf_rmse_0_5 = gs.best_score['rmse']
print(mf_rmse_0_5)
model_metrics.loc[len(model_metrics)] = ['0-5', "Matrix Factorization", mf_rmse_0_5]


0.9587901705722924


In [86]:
# Hyperparameter combination that achieved the best (lowest) RMSE in the grid search.
print(gs.best_params['rmse'])

{'n_factors': 50, 'reg_all': 0.1, 'n_epochs': 20}



<b> 2. On training set with rating scale: 1-5 </b>

In [87]:
# Re-run the same SVD grid search on the 1-5 dataset; this overwrites the results
# that `gs` held from the 0-5 run.
gs.fit(data_0)
mf_rmse_1_5 = gs.best_score['rmse']
print(mf_rmse_1_5)
model_metrics.loc[len(model_metrics)] = ['1-5', "Matrix Factorization", mf_rmse_1_5]

0.6259739042062784


In [88]:
# Show the comparison table after the matrix factorization runs.
model_metrics

Unnamed: 0,Rating Scale,Algorithm,RMSE
0,0-5,Matrix Factorization,0.95879
1,1-5,Matrix Factorization,0.625974


<b>The data with rating scale 1-5 (ratings of 0 removed) gives a much lower RMSE: 0.626 versus 0.959 on the 0-5 scale.</b>

<h3> User based CF </h3>

<b> 1. On training set with rating scale: 0-5 </b>

In [91]:
# Grid for user-based KNN.
# min_support is the minimum number of common items needed between users to consider
# them for similarity (for the item-based approach: the minimum number of common
# users for two items).
param_grid = dict(
    k=[5],
    sim_options=dict(
        name=['pearson', 'cosine'],
        min_support=[1, 5],
        user_based=[True],
    ),
)

In [92]:
# Grid search over the user-based KNN settings, scored by RMSE and MAE with 5-fold CV.
gs_knn = GridSearchCV(
    algo_class=KNNBasic,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=5,
)

In [93]:
# Run the user-based KNN grid search on the 0-5 dataset (each fit logs its
# similarity-matrix computation).
gs_knn.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [94]:
# Print the best RMSE and the parameters that produced it, then append the score to
# the comparison table.
user_cf_rmse_0_5 = gs_knn.best_score['rmse']
print(user_cf_rmse_0_5)
print(gs_knn.best_params['rmse'])
model_metrics.loc[len(model_metrics)] = ['0-5', "User based CF", user_cf_rmse_0_5]


0.9744892472307862
{'k': 5, 'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': True}}


<b> 2. On training set with rating scale: 1-5 </b>

In [95]:
# Repeat the user-based KNN grid search on the 1-5 dataset and log its best RMSE.
gs_knn.fit(data_0)
user_cf_rmse_1_5 = gs_knn.best_score['rmse']
print(user_cf_rmse_1_5)
model_metrics.loc[len(model_metrics)] = ['1-5', "User based CF", user_cf_rmse_1_5]


Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.6425339258317753


In [97]:
# Show the comparison table after the user-based CF runs.
model_metrics

Unnamed: 0,Rating Scale,Algorithm,RMSE
0,0-5,Matrix Factorization,0.95879
1,1-5,Matrix Factorization,0.625974
2,0-5,User based CF,0.974489
3,1-5,User based CF,0.642534


<h3> Item based CF </h3>

<b> 1. On training set with rating scale: 0-5 </b>

In [104]:
# Grid for item-based KNN (user_based=False).
# min_support is the minimum number of common items needed between users to consider
# them for similarity; for the item-based approach this corresponds to the minimum
# number of common users for two items.
param_grid = dict(
    k=[3, 5, 10, 20],
    sim_options=dict(
        name=['pearson', 'cosine'],
        min_support=[1, 5],
        user_based=[False],
    ),
)

In [100]:
# Grid search over the item-based KNN settings, scored by RMSE and MAE with 5-fold CV.
gs_knn_item = GridSearchCV(
    algo_class=KNNBasic,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=5,
)

In [101]:
# Run the item-based KNN grid search on the 0-5 dataset.
gs_knn_item.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

In [102]:
# Print the best RMSE and the parameters that produced it, then append the score to
# the comparison table.
item_cf_rmse_0_5 = gs_knn_item.best_score['rmse']
print(item_cf_rmse_0_5)
print(gs_knn_item.best_params['rmse'])
model_metrics.loc[len(model_metrics)] = ['0-5', "Item based CF", item_cf_rmse_0_5]


0.9744427970606091
{'k': 3, 'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': False}}


<b> 2. On training set with rating scale: 1-5 </b>

In [105]:
# Repeat the item-based KNN grid search on the 1-5 dataset and log its best RMSE.
gs_knn_item.fit(data_0)
item_cf_rmse_1_5 = gs_knn_item.best_score['rmse']
print(item_cf_rmse_1_5)
model_metrics.loc[len(model_metrics)] = ['1-5', "Item based CF", item_cf_rmse_1_5]


Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

In [106]:
# Show the full comparison table across all three algorithms and both rating scales.
model_metrics

Unnamed: 0,Rating Scale,Algorithm,RMSE
0,0-5,Matrix Factorization,0.95879
1,1-5,Matrix Factorization,0.625974
2,0-5,User based CF,0.974489
3,1-5,User based CF,0.642534
4,0-5,Item based CF,0.974443
5,1-5,Item based CF,0.642462


<b>Matrix factorization is the best model, with the lowest RMSE on both rating scales. Removing the '0' ratings also lowers the RMSE by roughly 35% for every algorithm (e.g. 0.959 → 0.626 for matrix factorization). </b>

<h3> Generating predictions using MF </h3>

In [118]:
# Reload the complete cleaned interactions (no subsampling) and reduce them to the
# (user_id, recipe_id, rating) triple, with the rating stored as an integer.
train_new_df = (
    pd.read_csv('interactions_train_new.csv')
    .drop(columns=["u", "i", "date", "rating"])
    .astype({'without_0_rating': int})
)
# The cleaned ratings span 1-5, so use a 1-5 reader here; the 0-5 `reader` would let
# Surprise clip predictions to a range that includes 0.
data_0 = Dataset.load_from_df(train_new_df, Reader(rating_scale=(1, 5)))
# Train on every available rating rather than on a CV fold.
trainset = data_0.build_full_trainset()
# `gs` was last fitted on the 1-5 data, so this is the SVD configured with the best
# RMSE parameters for that scale; fit it on the full training set.
svd = gs.best_estimator['rmse']
final_model = svd.fit(trainset)

In [116]:
# Preview the frame the final model was trained on.
train_new_df.head()

Unnamed: 0,user_id,recipe_id,without_0_rating
0,2046,4684,5
1,2046,517,5
2,1773,7435,5
3,1773,278,4
4,2046,3431,5


In [128]:
# Load the held-out interactions and shape them like the training frame:
# (user_id, recipe_id, integer rating).
test_df = (
    pd.read_csv('interactions_test_new.csv')
    .drop(columns=["u", "i", "date", "rating"])
    .astype({'without_0_rating': int})
)
test_df.head()

Unnamed: 0,user_id,recipe_id,without_0_rating
0,8937,44551,4
1,56680,126118,4
2,349752,219596,5
3,628951,82783,2
4,92816,435013,3


In [138]:
# Score every (user, recipe) pair of the test set with the fitted model, collecting
# ids, actual ratings and predictions in parallel lists.
user_list = []
recipe_list = []
act_rating = []
est_ratings = []
# itertuples keeps each column's own dtype (iterrows would upcast the ids to float if
# any column were float, which breaks the raw-id lookup) and avoids building a Series
# per row. Distinct names also avoid overwriting the loop variable.
for row in test_df.itertuples(index=False):
    user_id = row.user_id
    recipe_id = row.recipe_id
    user_list.append(str(user_id))
    recipe_list.append(str(recipe_id))
    act_rating.append(row.without_0_rating)
    # `.est` is the model's estimated rating; it is rounded to one decimal for the
    # submission file.
    est_ratings.append(round(final_model.predict(user_id, recipe_id).est, 1))

In [139]:
# Assemble the per-interaction results into a single table.
df_submission = pd.DataFrame({
    'user_id': user_list,
    'recipe_id': recipe_list,
    'Actual Rating': act_rating,
    'Predicted Rating': est_ratings,
})

In [140]:
# Preview actual versus predicted ratings.
df_submission.head()

Unnamed: 0,user_id,recipe_id,Actual Rating,Predicted Rating
0,8937,44551,4,4.3
1,56680,126118,4,4.8
2,349752,219596,5,4.7
3,628951,82783,2,4.8
4,92816,435013,3,4.6


In [141]:
from sklearn.metrics import mean_squared_error
# RMSE of the predictions on the held-out test set. The `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the square root of the
# MSE explicitly; this works on every scikit-learn version.
# Note: 'Predicted Rating' holds values already rounded to one decimal.
rms = mean_squared_error(df_submission['Actual Rating'], df_submission['Predicted Rating']) ** 0.5

In [142]:
# Report the test-set RMSE of the final matrix factorization model.
print(rms)

0.878753176793466


In [143]:
# Write the predictions to disk; index=False leaves the DataFrame index out of the CSV.
df_submission.to_csv("predictions_MF.csv", index=False)