# Рекомендательная система на библиотеке surprise (Домашнее задание)


In [16]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
BASE_DIR = Path('/Users/dev/Рекомендательные системы')
os.chdir(BASE_DIR)

In [2]:
from surprise import Dataset
from surprise import Reader

In [3]:
from surprise import KNNBaseline

In [4]:
from surprise.model_selection import train_test_split

In [5]:
def Precision_at_n(df_ratings, n=3, threshold=5):
    rel_n = set(df_ratings[df_ratings.real_rating>=threshold]['itemID'])
    t=df_ratings.sort_values(by='rating', ascending=False)[0:n]
    ret_n=set(t['itemID'])
    return len(rel_n & ret_n)/n

In [6]:
def Avg_Precision_at_n(df_ratings, n=10,threshold=5):
    avg_p=0
    rel_n = set(df_ratings[df_ratings.real_rating>=threshold]['itemID'])
    
    for i in range(1,n+1):
        t=df_ratings.sort_values(by='rating', ascending=False)[0:i]
        ret_n=set(t['itemID'])
        t2=df_ratings.sort_values(by='rating', ascending=False)[i-1:i]
        ret_n2=set(t2['itemID'])
        #print(ret_n2,ret_n)
        if len((ret_n2 & rel_n))>0:
            avg_p=avg_p+len(rel_n & ret_n)/n
    return avg_p/n

In [7]:
def Recall_at_n(df_ratings, n=3,threshold=5):
    rel_n = set(df_ratings[df_ratings.real_rating>=threshold]['itemID'])
    t=df_ratings.sort_values(by='rating', ascending=False)[0:n]
    ret_n=set(t['itemID'])
    return len(rel_n & ret_n)/len(set(rel_n))

In [8]:
def MAP_at_n(df_ratings_all, n=10, threshold=5):
    users=df_ratings_all['userID'].unique()
    map_at_n=0
    for i in users:
        df_ratings=df_ratings_all[df_ratings_all.userID==i]
        map_at_n=map_at_n+Avg_Precision_at_n(df_ratings,n=n,threshold=threshold)
    return map_at_n/len(users)   

## Легкое задание
Для датасэта

```
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.30)

```

Для каждого k= 10, 20 и 30 выбрать какой вариант KNN лучше item-based или user-based по метрике MAP_at_n (n=5)


In [9]:
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.30)

In [59]:
map_list = []
count = 0
for k in range(10, 31, 10):
    for based in [True, False]:
        algo = KNNBaseline(k=k, sim_options={
                           'user_based': based}, verbose=False)
        predictions = algo.fit(trainset).test(testset)
        df_ratings = pd.DataFrame(
            columns=['userID', 'itemID', 'rating', 'real_rating'])
        for i in predictions:
            df_ratings = df_ratings.append(
                {'userID': i.uid, 'itemID': i.iid, 'rating': i.est, 'real_rating': i.r_ui}, ignore_index=True)
        map_list.append(MAP_at_n(df_ratings, 5))

    if map_list[count+1] > map_list[count]:
        print(
            f'При k = {k} лучше вариант item-based. MAP_at_n_item = {map_list[count+1]:.3f} , MAP_at_n_user = {map_list[count]:.3f}')
    else:
        print(
            f'При k = {k} лучше вариант user_based. MAP_at_n_item = {map_list[count+1]:.3f} , MAP_at_n_user = {map_list[count]:.3f}')
    count += 2

При k = 10 лучше вариант item-based. MAP_at_n_item = 0.145 , MAP_at_n_user = 0.137
При k = 20 лучше вариант item-based. MAP_at_n_item = 0.148 , MAP_at_n_user = 0.148
При k = 30 лучше вариант item-based. MAP_at_n_item = 0.153 , MAP_at_n_user = 0.151


## Сложное задание

Для датасэта

```
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.30)

```

а) Для KNN(k=30) для каждого пользователя найти лучший алгоритм по Avg_Precision_at_n(7)
Cравнивая между item-based, user-based подход, а также разные меры для (косинусную, Пирсона) в KNN.
Докуметация к мерам:
https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measures-configuration

б) Для каждого пользователя расчитайте рейтинги фильмов используя лучший алгоритм из пункта а)

в) Сравнить результат из б) с обычном kNN k=30 по метрике MAP (n=7).


#### а)

Цифровое кодирование алгоритмов:
- 0 - cosine, user-based
- 1 - cosine, item-based
- 2 - pearson baseline, user-based
- 3 - pearson baseline, item-based

In [11]:
apn_list = []
algo_list = []
users_list = []
df_metrics = pd.DataFrame()

for name in ['cosine', 'pearson_baseline']:
    for based in [True, False]:
        column_name = str('name:') + name + str(' user_based:') + str(based)
        algo = KNNBaseline(k=30, sim_options={
                           'name': name, 'user_based': based}, verbose=False)
        predictions = algo.fit(trainset).test(testset)
        df_ratings = pd.DataFrame(
            columns=['userID', 'itemID', 'rating', 'real_rating'])
        for i in predictions:
            df_ratings = df_ratings.append(
                {'userID': i.uid, 'itemID': i.iid, 'rating': i.est, 'real_rating': i.r_ui}, ignore_index=True)

        for users in df_ratings['userID'].unique():
            apn_list.append(Avg_Precision_at_n(
                df_ratings[df_ratings['userID'] == users], 7))
            
        df_metrics[column_name] = apn_list
        apn_list = []
    
for users in df_ratings['userID'].unique():
    users_list.append(users)
        
df_metrics.insert(0, 'userID', users_list)
df_metrics['max_apn_value'] = df_metrics.max(axis=1)

for idx in df_metrics.index:
    algo_list.append(np.where(df_metrics.drop(
        ['userID', 'max_apn_value'], axis=1).iloc[idx] == df_metrics.iloc[idx].max_apn_value)[0][0])
df_metrics['algorithm number'] = algo_list

In [19]:
df_metrics.to_csv(BASE_DIR/'df_metrics.csv', sep=',', index_label='index')

In [None]:
df_metrics = pd.read_csv(BASE_DIR/'df_metrics.csv', sep=',', index_col='index')

In [20]:
df_metrics.head()

Unnamed: 0,userID,name:cosine user_based:True,name:cosine user_based:False,name:pearson_baseline user_based:True,name:pearson_baseline user_based:False,max_apn_value,algorithm number
0,782,0.020408,0.020408,0.020408,0.020408,0.020408,0
1,262,0.061224,0.061224,0.061224,0.061224,0.061224,0
2,656,0.020408,0.020408,0.020408,0.020408,0.020408,0
3,432,0.020408,0.061224,0.061224,0.061224,0.061224,1
4,167,0.122449,0.204082,0.204082,0.122449,0.204082,1


#### б)

In [40]:
idx = 0
count = 0
algo_list = []
best_rating = []
df_full = pd.DataFrame()

for name in ['cosine', 'pearson_baseline']:
    for based in [True, False]:
        algo = KNNBaseline(k=30, sim_options={
                           'name': name, 'user_based': based}, verbose=False)
        predictions = algo.fit(trainset).test(testset)
        df_ratings = pd.DataFrame(
            columns=['userID', 'itemID', 'rating', 'real_rating'])
        for i in predictions:
            df_ratings = df_ratings.append(
                {'userID': i.uid, 'itemID': i.iid, 'rating': i.est, 'real_rating': i.r_ui}, ignore_index=True)
        df_full['rating '+ str(count)] = df_ratings['rating']
        count += 1
        
df_full.insert(0, 'userID', df_ratings.userID)
df_full.insert(1, 'itemID', df_ratings.itemID)

for users in df_full.userID:
    algo_list.append(df_metrics[df_metrics['userID'] == users]['algorithm number'].tolist()[0])

df_full['algorithm number'] = algo_list
df_full['real_rating'] = df_ratings['real_rating']

for alg_num in df_full['algorithm number']:
    best_rating.append(df_full.iloc[idx][str('rating ') + str(alg_num)])
    idx += 1
df_full['best_rating'] = best_rating

df_full

Unnamed: 0,userID,itemID,rating 0,rating 1,rating 2,rating 3,algorithm number,real_rating,best_rating
0,782,1251,3.851205,3.424778,3.701113,3.100972,0,3.0,3.851205
1,262,418,3.364223,3.210887,3.208854,3.089342,0,3.0,3.364223
2,656,347,3.377682,2.594202,3.105893,2.525608,0,4.0,3.377682
3,432,405,3.641523,3.639760,3.748952,3.710684,1,4.0,3.639760
4,167,478,3.596635,3.953934,4.077883,4.215468,1,5.0,3.953934
...,...,...,...,...,...,...,...,...,...
29995,374,1248,2.965181,3.154500,2.643351,2.894588,3,3.0,2.894588
29996,886,709,3.827966,3.715787,3.824292,4.027782,0,3.0,3.827966
29997,184,72,3.261273,3.397545,3.017213,3.134424,0,3.0,3.261273
29998,721,69,3.955801,4.115244,4.211077,4.533745,0,4.0,3.955801


In [41]:
df_full.to_csv(BASE_DIR/'df_full.csv', sep=',', index_label='index')

In [50]:
df_full = pd.read_csv(BASE_DIR/'df_full.csv', sep=',', index_col='index')

#### в)

In [43]:
#обычный kNN k=30 по метрике MAP (n=7)
algo = KNNBaseline(k=30, sim_options={'user_based': True}, verbose=False)
predictions = algo.fit(trainset).test(testset)
df_ratings = pd.DataFrame(columns=['userID', 'itemID', 'rating', 'real_rating'])
for i in predictions:
    df_ratings = df_ratings.append({'userID': i.uid, 'itemID': i.iid, 'rating': i.est, 'real_rating': i.r_ui}, ignore_index=True)
MAP_at_n(df_ratings, 7)

0.1205661479862356

In [52]:
df_full = df_full[['userID','itemID','best_rating','real_rating']]
df_full = df_full.rename(columns={'best_rating': 'rating'})
MAP_at_n(df_full, 7)

0.1557989049278238

Видим, что результат из пункта б) оказался намного лучше.