In [20]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np
from surprise import Dataset, Reader, KNNWithMeans, SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split


## Reading Data

In [2]:
reviews_train = pd.read_csv('../data/reviews_train.csv', usecols=['RecipeId', 'AuthorId', 'Rating'])
reviews_test = pd.read_csv('../data/reviews_test.csv', usecols=['RecipeId', 'AuthorId', 'Rating'])


## Constructing base model

### SVD model

In [21]:
svd = SVD()
reader = Reader(rating_scale=(1,5))
# Loads Pandas dataframe
data = Dataset.load_from_df(reviews_train.loc[:10000, ["AuthorId", "RecipeId", "Rating"]], reader)
#data = data.build_full_trainset()
#data_test = Dataset.load_from_df(reviews_test.loc[:100, ["AuthorId", "RecipeId", "Rating"]], reader)
trainset, testset = train_test_split(data, test_size=0.2)

In [9]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6137  0.6574  0.5945  0.6319  0.5808  0.6157  0.0271  
MAE (testset)     0.4335  0.4495  0.4278  0.4405  0.4217  0.4346  0.0097  
Fit time          0.20    0.22    0.15    0.14    0.14    0.17    0.04    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.61371582, 0.65735064, 0.59453465, 0.63192718, 0.58084318]),
 'test_mae': array([0.43353038, 0.44946592, 0.42775216, 0.44050977, 0.42167406]),
 'fit_time': (0.19733452796936035,
  0.22482752799987793,
  0.1524062156677246,
  0.136946439743042,
  0.13508152961730957),
 'test_time': (0.010045051574707031,
  0.008120059967041016,
  0.007194995880126953,
  0.008453130722045898,
  0.006524324417114258)}

In [22]:

svd.fit(trainset)
predictions = svd.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6007


0.6006939724617784

In [33]:
svd.predict(1000,2000)

Prediction(uid=1000, iid=2000, r_ui=None, est=4.69875, details={'was_impossible': False})

### KNN model

In [28]:
reader = Reader(rating_scale=(1, 5))
# Loads Pandas dataframe
data_train = Dataset.load_from_df(reviews_train.loc[:10000, ["AuthorId", "RecipeId", "Rating"]], reader)
data_test = Dataset.load_from_df(reviews_test.loc[:100, ["AuthorId", "RecipeId", "Rating"]], reader)

# To use user-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
trainingSet = data_train.build_full_trainset()
testingSet = data_test.build_full_trainset()
algo = KNNWithMeans(sim_options=sim_options)

In [29]:
# fitting the model
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fb927ae2c90>

In [31]:
#prediction = algo.predict(1634, 5466)
algo.test(testingSet)

TypeError: 'Trainset' object is not iterable

In [43]:
trainingSet.ur

defaultdict(list,
            {0: [(0, 5.0),
              (51, 5.0),
              (85, 5.0),
              (597, 5.0),
              (2817, 5.0),
              (2946, 3.0),
              (3830, 5.0),
              (4011, 5.0),
              (4196, 5.0),
              (4498, 5.0),
              (4736, 4.0),
              (5912, 5.0),
              (6215, 4.0),
              (7007, 4.0),
              (5757, 5.0),
              (4688, 5.0),
              (7893, 5.0)],
             1: [(1, 5.0),
              (1651, 5.0),
              (3113, 5.0),
              (3326, 4.0),
              (4552, 5.0),
              (6645, 5.0),
              (7728, 5.0)],
             2: [(2, 5.0), (5044, 5.0)],
             3: [(3, 4.0),
              (1814, 5.0),
              (3026, 5.0),
              (4151, 4.0),
              (6417, 4.0),
              (6455, 4.0),
              (7030, 5.0),
              (1010, 4.0)],
             4: [(4, 5.0)],
             5: [(5, 4.0)],
             6: [(6, 5.

In [49]:
reviews_test[reviews_test['AuthorId']==2946]

Unnamed: 0,RecipeId,AuthorId,Rating


## Parameter Tuning

In [None]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV

data = Dataset.load_builtin("ml-100k")
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])