In [1]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate, train_test_split
from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, recall_at_k, get_top_k_items
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from surprise import accuracy

In [71]:
ratings= pickle.load(open("svd_ratings_data.pkl","rb" ))

In [3]:
ratings

Unnamed: 0,userId,itemId,rating
0,1,110,1.0
1,11,110,3.5
2,22,110,5.0
3,24,110,5.0
4,29,110,3.0
...,...,...,...
1099674,25806,167858,5.0
1099675,25808,8452,2.0
1099676,25808,43828,4.0
1099677,25808,116973,3.5


In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings, reader=reader)

train_set, test_set = train_test_split(data, test_size=.10)

In [5]:
from surprise.model_selection import KFold
from collections import defaultdict

def precision_recall_at_k(predictions, k=5, threshold=3.5):

    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        user_ratings.sort(key=lambda x: x[0], reverse=True)

        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls




algo = SVDpp(n_factors=300, n_epochs=30)

algo.fit(train_set)
predictions = algo.test(test_set)
precisions, recalls = precision_recall_at_k(predictions, threshold=4)

print("RMSE: {}".format(accuracy.rmse(predictions)))
print("MAE: {}".format(accuracy.mae(predictions)))
print("Precision value : {}".format(sum(prec for prec in precisions.values()) / len(precisions)))
print("Recalls value : {}".format(sum(rec for rec in recalls.values()) / len(recalls)))


RMSE: 0.8446
RMSE: 0.844614628618436
MAE:  0.6456
MAE: 0.6456302662171232
Precision value : 0.4605856583719385
Recalls value : 0.3277470396869814


In [6]:
pickle.dump(algo, open("model_svd++.pkl","wb" ))

In [7]:
model = pickle.load(open("model_svd++.pkl","rb" ))

In [8]:
content_based_data = pickle.load(open("content_based_data.pkl","rb" ))
content_based_similaritiy = pickle.load(open("content_based_similaritiy.pkl","rb" ))

In [35]:
train = train_set.all_ratings()
train = pd.DataFrame(train, columns=['uid', 'iid', 'rating'])
train.rename(columns={"uid": "userId", "iid":"itemId", "rating":"ratings"}, inplace=True)
train

Unnamed: 0,userId,itemId,ratings
0,0,0,3.0
1,0,204,4.0
2,0,932,3.0
3,0,1143,3.0
4,0,111,3.0
...,...,...,...
989706,25385,916,3.0
989707,25386,38,5.0
989708,25387,88,5.0
989709,25388,404,1.5


In [36]:
all_predictions = compute_ranking_predictions(model, train, usercol='userId', itemcol='itemId', remove_seen=True)

In [38]:
pickle.dump(all_predictions, open("all_predictions_svd++.pkl","wb" ))