# Surprise homework

In [1]:
var = 31

In [2]:
import io 
import numpy as np
from collections import defaultdict
from surprise import Dataset 
from surprise import accuracy 
from surprise import get_dataset_dir
from surprise.model_selection import cross_validate 
from surprise.model_selection import train_test_split

### Functions

In [3]:
# используйте полезные функции из FAQ

from surprise import SVD
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k, threshold):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [4]:
def get_top_n(predictions, n):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [5]:
def read_item_info():
    """Read the u.item file from MovieLens 100-k dataset and return 
    mapping to convert raw ids into movie info.
    """

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_info = {}

    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_info[line[0]] = {'name' : line[1], 'date' : line[2]}

    return rid_to_info

### Load data

In [6]:
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25)

print('trainset:', trainset)
print('testset len: ', len(testset))

trainset: <surprise.trainset.Trainset object at 0x119ed9710>
testset len:  25000


In [7]:
rid_to_info = read_item_info()
rid_to_info['100'] # example

{'name': 'Fargo (1996)', 'date': '14-Feb-1997'}

### Define algorithms

In [8]:
# внимательно изучите документацию по метрикам и алгоритмам
from surprise import SVD

from surprise import NormalPredictor
from surprise import KNNWithMeans
from surprise import KNNBaseline

### Select algorithm

1. Оценить по метрике RMSE с помощью функции cross_validate следующие алгоритмы:

In [9]:
results = []

In [10]:
def test_algo(algo, metrica = None):
    print("Проверяем", type(algo).__name__ + ('по метрике ' + metrica if metrica != None else ''), '\n')
    
    algo.fit(trainset)
    
    predictions = algo.test(testset)
    
    cross_validate(algo, 
                   data, 
                   measures=['RMSE'], 
                   cv=5, 
                   verbose=True)
    
    return [accuracy.rmse(predictions), algo, metrica]
    
def test_knn(metrica):
    sim_options = {'name': metrica }
    algo = KNNWithMeans(k=30,sim_options=sim_options)
    return test_algo(algo, metrica)

In [11]:
# прогнозирование случайного рейтинга на основе распределения всех рейтингов в наборе;
results += [test_algo(NormalPredictor())]

Проверяем NormalPredictor 

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5147  1.5202  1.5242  1.5269  1.5135  1.5199  0.0052  
Fit time          0.12    0.14    0.13    0.15    0.14    0.14    0.01    
Test time         0.19    0.17    0.12    0.18    0.18    0.17    0.03    
RMSE: 1.5159


In [12]:
# user-based коллаборативную фильтрацию, метод kNN, k = 30, метрика косинуса;
results += [test_knn('cosine')]

Проверяем KNNWithMeansпо метрике cosine 

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9492  0.9584  0.9639  0.9582  0.9548  0.9569  0.0048  
Fit time          0.78    0.78    0.82    0.79    0.80    0.79    0.01    
Test time         2.95    2.91    2.94    2.92    2.99    2.94    0.03    
RMSE: 0.9590


In [13]:
# user-based коллаборативную фильтрацию, метод kNN, k = 30, метрика Mean Squared Difference ;
results += [test_knn('msd')]

Проверяем KNNWithMeansпо метрике msd 

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9464  0.9526  0.9573  0.9525  0.9529  0.9523  0.0035  
Fit time          0.27    0.33    0.37    0.36    0.30    0.32    0.04    
Test time         2.94    2.94    2.86    2.92    2.94    2.92    0.03    
RMSE: 0.9532


In [14]:
# user-based коллаборативную фильтрацию, метод kNN, k = 30, метрика корреляция Пирсона;
results += [test_knn('pearson')]

Проверяем KNNWithMeansпо метрике pearson 

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9610  0.9430  0.9468  0.9577  0.9528  0.9523  0.0067  
Fit time          1.16    1.14    1.13    1.13    1.11    1.13    0.02    
Test time         3.04    3.01    2.94    2.85    2.87    2.94    0.07    
RMSE: 0.9543


In [15]:
# SVD алгоритм.
results += [test_algo(SVD())]

Проверяем SVD 

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9380  0.9380  0.9331  0.9442  0.9346  0.9376  0.0038  
Fit time          3.92    3.79    3.69    3.93    3.74    3.82    0.10    
Test time         0.13    0.20    0.20    0.12    0.20    0.17    0.04    
RMSE: 0.9368


In [16]:
results.sort(key = lambda x: x[0])
for rmse, algo, metrica in results:
    print(round(rmse, 3), "= rmse для алгоритма", type(algo).__name__, "с метрикой", metrica) 
    
print()
best_rmse, best_algo, best_metrica = results[0]
print("Лучший алгоритм по rmse:", type(best_algo).__name__, "с метрикой", best_metrica)

0.937 = rmse для алгоритма SVD с метрикой None
0.953 = rmse для алгоритма KNNWithMeans с метрикой msd
0.954 = rmse для алгоритма KNNWithMeans с метрикой pearson
0.959 = rmse для алгоритма KNNWithMeans с метрикой cosine
1.516 = rmse для алгоритма NormalPredictor с метрикой None

Лучший алгоритм по rmse: SVD с метрикой None


### Calculate precision@k and recall@k

2. Для лучшего алгоритма по метрике RMSE рассчитать метрики precision@k and recall@k для k=5 и порога отсечения 3.52, усредненные по всем пользователям.

In [17]:
k = 5
threshold = 3.52

In [18]:
predictions = best_algo.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k, threshold)

mean_precision = np.mean(list(precisions.values()))
mean_recall = np.mean(list(recalls.values()))

print("precision@k:", round(mean_precision, 3))
print("recall@k:   ", round(mean_recall, 3))

precision@k: 0.864
recall@k:    0.433


### Predict

3. Для заданного пользователя (номер в списке) c помощью лучшего алгоритма по метрике RMSE вывести топ-5 рекомендаций (те фильмы, для которых у пользователя нет оценки) с названиями, датой выхода и рейтингом.

In [19]:
cnt = 5

In [21]:
# обратите внимание на функцию build_anti_testset
# Predict ratings for all pairs (u, i) that are NOT in the training set.

predictions = best_algo.test(trainset.build_anti_testset())

In [22]:
top_n = get_top_n(predictions, n=cnt)

In [23]:
print("var", var)
top_n[str(var)]

var 31


[('89', 4.901065426017591),
 ('127', 4.811790749214374),
 ('603', 4.784981009027717),
 ('169', 4.733130883170019),
 ('318', 4.726462758069867)]

In [26]:
def get_info(item):
    rid, rate = item
    info = rid_to_info[rid]
    return [rid, info['name'], info['date'], rate]
    
result = list(map(get_info, top_n[str(var)]))

In [27]:
print("User", var)

max_name_length = len(max(result, key=lambda x: len(x[1]))[1])

for rid, name, date, rate in result:
    print('   ', rid.ljust(5), name.ljust(max_name_length), date, round(rate, 3))

User 31
    89    Blade Runner (1982)        01-Jan-1982 4.901
    127   Godfather, The (1972)      01-Jan-1972 4.812
    603   Rear Window (1954)         01-Jan-1954 4.785
    169   Wrong Trousers, The (1993) 01-Jan-1993 4.733
    318   Schindler's List (1993)    01-Jan-1993 4.726
