In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

from tqdm import tqdm

In [2]:
data = pd.read_csv('../../data/rec_sys/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


В рекомендательных системах корректнее использовать train-test split по времени, а не случайно.

Возмём последние 3 недели в качестве теста.

In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]
data_train

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2282320,222,41297772783,635,1120741,1,0.59,304,0.00,1716,91,0.0,0.0
2282321,462,41297773713,635,993339,1,1.99,304,0.00,2040,91,0.0,0.0
2282322,462,41297773713,635,995242,1,1.00,304,-0.89,2040,91,0.0,0.0
2282323,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,0.0,0.0


# Оценивание
За выполнени каждого задания 1 балл

4 балла -> отл

3 балла -> хор

И тд

## Задание 0. Товар 999999
На вебинаре мы использовали товар 999999 - что это за товар?  
Зачем он нужен?  
Используя этот товар мы смещаем качество рекомендаций.
В какую сторону?   
Можно ли удалить этот товар?   
Уберите этот товар и сравните с качеством на семинаре.

**Ответ**:
- В товар 999999 мы группируем все товары, которые не вошли в топ5000 товаров.
- В датасете ~90к товаров, а item-item модели ОЧЕНЬ долго предсказывают. Поэтому для ускорения вычислений мы предсказываем только для топ5000 товаров.

**Дальнейшие ответы - ниже после расчётов.**

Попробуем удалить товар 999999 и посмотреть что из этого выйдет.

Для начала повторим все шаги вместе с товаром 999999.

In [4]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
popularity.head(5)

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [5]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [6]:
top_5000

[6534178,
 6533889,
 6534166,
 6544236,
 1404121,
 397896,
 1426702,
 5703832,
 480014,
 5668996,
 6410462,
 420647,
 5747420,
 731106,
 5716076,
 707683,
 202291,
 1153346,
 1082185,
 5747233,
 6410464,
 545926,
 995242,
 2848087,
 1388206,
 1029743,
 5712216,
 5850988,
 1133018,
 1106523,
 1007195,
 981760,
 5845857,
 883404,
 1127831,
 2690723,
 866227,
 995785,
 860776,
 951590,
 5569230,
 908531,
 916122,
 826249,
 1098066,
 862349,
 1058997,
 1044078,
 904360,
 840361,
 923746,
 1126899,
 849843,
 961554,
 1005186,
 820165,
 1053690,
 844179,
 844165,
 1070820,
 1065593,
 834484,
 994928,
 833715,
 859075,
 1022003,
 1013321,
 938700,
 854852,
 986947,
 5569471,
 927191,
 1071939,
 1096036,
 1004906,
 986912,
 1080414,
 914190,
 908846,
 962229,
 8090521,
 1085604,
 911878,
 1092026,
 866211,
 1068719,
 878996,
 8090537,
 1081177,
 847270,
 929668,
 1024306,
 909894,
 907014,
 903325,
 910032,
 1095275,
 833025,
 862139,
 962568,
 953476,
 847982,
 999971,
 968215,
 1028816,
 976

In [7]:
# Заведем фиктивный item_id (если юзер НЕ покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

# создаём сводную таблицу: в строках - 'user_id', в столбцах - 'item_id'.
# и считаем количество значения 'quantity'
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2497,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2498,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# всё что больше нуля делаем единицей
user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат sparse matrix. sparse matrix позволяют более эффективно хоранить разреженные данные
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Посмотрим, какими числами заданы товары. Они довольно большие.

In [9]:
np.sort(data.item_id.unique())

array([   25671,    26081,    26093, ..., 18000012, 18024155, 18024556],
      dtype=int64)

In [10]:
# перенумеруем пользователей и товары, чтобы они задавались числами от 0 до номера количества товаров.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [11]:
# посмотрим то, как мы переименовали item_id в id
itemid_to_id

{202291: 0,
 397896: 1,
 420647: 2,
 480014: 3,
 545926: 4,
 707683: 5,
 731106: 6,
 818980: 7,
 819063: 8,
 819227: 9,
 819255: 10,
 819304: 11,
 819308: 12,
 819330: 13,
 819518: 14,
 819594: 15,
 819643: 16,
 819765: 17,
 819840: 18,
 819845: 19,
 819927: 20,
 819978: 21,
 820082: 22,
 820122: 23,
 820165: 24,
 820291: 25,
 820301: 26,
 820321: 27,
 820361: 28,
 820486: 29,
 820518: 30,
 820560: 31,
 820701: 32,
 820895: 33,
 821025: 34,
 821083: 35,
 821200: 36,
 821209: 37,
 821219: 38,
 821344: 39,
 821464: 40,
 821556: 41,
 821562: 42,
 821695: 43,
 821730: 44,
 821735: 45,
 821787: 46,
 821867: 47,
 821976: 48,
 822049: 49,
 822073: 50,
 822101: 51,
 822140: 52,
 822178: 53,
 822225: 54,
 822241: 55,
 822339: 56,
 822346: 57,
 822407: 58,
 822517: 59,
 822524: 60,
 822646: 61,
 822677: 62,
 822739: 63,
 822785: 64,
 822936: 65,
 822965: 66,
 823099: 67,
 823176: 68,
 823356: 69,
 823704: 70,
 823721: 71,
 823758: 72,
 823775: 73,
 823862: 74,
 823915: 75,
 823990: 76,
 824005: 

Обучаем модель и получаем рекомендации

In [12]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей и количество потоков (ядер процессора)

## item-item recommender ожидает, что мы подадим в столбцах - user, в строках item. Поэтому транспонируем
model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, # не рекомендуем то, что он уже покупал
                        filter_items=None, 
                        recalculate_user=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))


Wall time: 811 ms


In [13]:
recs

[(2381, 78679.0),
 (3408, 72173.0),
 (2148, 57995.0),
 (3947, 17272.0),
 (3587, 14417.0)]

In [14]:
[id_to_itemid[rec[0]] for rec in recs]

[999999, 1082185, 981760, 1127831, 1098066]

Видим что наш товар 999999 вырвался вперёд.

Создадим датафрейм с покупками юзеров на тестовом датасете (последние 3 недели)

In [15]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

# перевод numpy массива с списки - сериализация
result['actual'] = result['actual'].apply(lambda x: list(x))

result.head(10)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
5,9,"[864335, 990865, 1029743, 9297474, 10457112, 8..."
6,13,"[6534178, 1104146, 829197, 840361, 862070, 884..."
7,14,"[840601, 867293, 933067, 951590, 952408, 96569..."
8,15,"[910439, 1082185, 959076, 1023958, 1082310, 13..."
9,16,"[1062973, 1082185, 13007710]"


Рассчитываем для каждого пользователя его рекомендации и добавляем к итоговой таблице сравнений метрик.

In [16]:
%%time

result['itemitem'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

Wall time: 58 ms


In [17]:
result.head(3)

Unnamed: 0,user_id,actual,itemitem
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[999999, 1082185, 981760, 1127831, 995242]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[999999, 1082185, 981760, 1098066, 995242]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[999999, 1082185, 981760, 1127831, 995242]"


Оценим алгоритм

In [18]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    #print('bought_list =', bought_list)
    recommended_list = np.array(recommended_list)
    #print('recommended_list =', recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    #print('recommended_list k =', recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [19]:
result.apply(lambda x: precision_at_k(x['itemitem'], x['actual'],  5), axis=1).mean()

0.13692458374142857

**Избавимся от 999999**

Перезагружаем данные

In [20]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [21]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head(5)

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [22]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [23]:
# Заведем фиктивный item_id (если юзер НЕ покупал товары из топ-5000, то он "купил" такой товар)
# Не производим замену товара не из top_5000 на 999999
#data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999



# создаём сводную таблицу: в строках - 'user_id', в столбцах - 'item_id'.
# и считаем количество значения 'quantity'
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix

item_id,25671,26081,26093,26190,26355,26426,26540,26601,26636,26691,...,17328742,17329473,17329749,17330255,17330511,17381856,17382205,17383227,17827644,17829232
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# всё что больше нуля делаем единицей
user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат sparse matrix. sparse matrix позволяют более эффективно хоранить разреженные данные
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,25671,26081,26093,26190,26355,26426,26540,26601,26636,26691,...,17328742,17329473,17329749,17330255,17330511,17381856,17382205,17383227,17827644,17829232
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
np.sort(data.item_id.unique())

array([   25671,    26081,    26093, ..., 18000012, 18024155, 18024556],
      dtype=int64)

In [26]:
# перенумеруем пользователей и товары, чтобы они задавались числами от 0 до номера количества товаров.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [27]:
# посмотрим то, как мы переименовали item_id в id
itemid_to_id

{25671: 0,
 26081: 1,
 26093: 2,
 26190: 3,
 26355: 4,
 26426: 5,
 26540: 6,
 26601: 7,
 26636: 8,
 26691: 9,
 26738: 10,
 26889: 11,
 26941: 12,
 27021: 13,
 27030: 14,
 27152: 15,
 27158: 16,
 27159: 17,
 27323: 18,
 27334: 19,
 27346: 20,
 27404: 21,
 27479: 22,
 27491: 23,
 27503: 24,
 27509: 25,
 27510: 26,
 27522: 27,
 27558: 28,
 27633: 29,
 27657: 30,
 27658: 31,
 27686: 32,
 27695: 33,
 27697: 34,
 27732: 35,
 27735: 36,
 27745: 37,
 27754: 38,
 27760: 39,
 27762: 40,
 27764: 41,
 27767: 42,
 27812: 43,
 27860: 44,
 27861: 45,
 27864: 46,
 27894: 47,
 27923: 48,
 27925: 49,
 27933: 50,
 27951: 51,
 27958: 52,
 27960: 53,
 27978: 54,
 28015: 55,
 28018: 56,
 28041: 57,
 28102: 58,
 28116: 59,
 28117: 60,
 28143: 61,
 28157: 62,
 28158: 63,
 28165: 64,
 28186: 65,
 28192: 66,
 28200: 67,
 28208: 68,
 28219: 69,
 28267: 70,
 28268: 71,
 28272: 72,
 28304: 73,
 28326: 74,
 28346: 75,
 28347: 76,
 28376: 77,
 28377: 78,
 28424: 79,
 28453: 80,
 28455: 81,
 28513: 82,
 28573: 83,
 2

In [28]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей и количество потоков (ядер процессора)

## item-item recommender ожидает, что мы подадим в столбцах - user, в строках item. Поэтому транспонируем
model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, # не рекомендуем то, что он уже покупал
                        filter_items=None, 
                        recalculate_user=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86865.0), HTML(value='')))


Wall time: 16.7 s


In [29]:
[id_to_itemid[rec[0]] for rec in recs]

[1082185, 981760, 1098066, 1127831, 995242]

Рассчитываем для каждого пользователя его рекомендации и добавляем к итоговой таблице сравнений метрик.

In [30]:
%%time

result['itemitem without 9999999'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

Wall time: 209 ms


Оценим алгоритм без 999999

In [31]:
result.apply(lambda x: precision_at_k(x['itemitem without 9999999'], x['actual'],  5), axis=1).mean()

0.15406464250734386

**Используя этот товар мы смещаем качество рекомендаций. В какую сторону?**

Мы смещаем качество рекомендаций в худшую сторону.

**Можно ли удалить этот товар?**

Да.

**Уберите этот товар и сравните с качеством на семинаре**

Precision алгоритма с товаром 999999 составил 0.1369. Без этого товара 0.1541. Качество упало на 1.7%

## Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. вес = log(sales_sum товара)
- Придумайте пример 3 весов, посчитайте weighted_random_recommendation для разных весов

Создадим датафрейм с покупками юзеров на тестовом датасете (последние 3 недели)

In [32]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

# перевод numpy массива с списки - сериализация
result['actual'] = result['actual'].apply(lambda x: list(x))

result.head(10)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
5,9,"[864335, 990865, 1029743, 9297474, 10457112, 8..."
6,13,"[6534178, 1104146, 829197, 840361, 862070, 884..."
7,14,"[840601, 867293, 933067, 951590, 952408, 96569..."
8,15,"[910439, 1082185, 959076, 1023958, 1082310, 13..."
9,16,"[1062973, 1082185, 13007710]"


Проверяем не появились ли в тестовом датасете новые пользователи. В этом случае не совпаёт количество строк. Проблема холодного старта.

In [33]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


Предскажем случайные рекомендации

In [34]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""

    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [35]:
%%time

items = data_train.item_id.unique()
print(items)

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['rand rec'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result.head(5)

[ 1004906  1033142  1036325 ... 15722756 17170636 15716393]
Wall time: 4.54 s


Unnamed: 0,user_id,actual,rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[958826, 824399, 98098, 15783020, 15660208]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[13512876, 713618, 944068, 15926456, 13215961]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1669677, 1119563, 877368, 851150, 15630089]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[12604783, 17291554, 12263551, 9188792, 1008610]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1077827, 1767023, 1525303, 1055294, 830593]"


In [36]:
# Проверяем сколько рекомендованных товаров входят в список актуальных товаров
hit_rate = 0
for i in range(result.shape[0]):
    flags = np.isin(result.loc[i,'actual'], result.loc[i,'rand rec'])
    hit_rate += (flags.sum() > 0).astype(int)
hit_rate

11

In [37]:
def weighted_random_recommendation(items, p, n=5):
    """Случайные рекоммендации
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    
    recs = np.random.choice(items, size=n, replace=False, p=p)
    
    return recs.tolist()

In [38]:
# функция для генерации items и массива вероятностей
def data_prep(df, func):
    # создаём список уникальных товаров
    items = df.item_id.unique()
    # print('items 1 = ', items, items.size)

    # переводим в np массив
    items = np.array(items)

    # создаём датафрейм с количеством каждого товара
    count_df = df.groupby('item_id').item_id.agg(['count'])

    # добавляем в датафрейм строчку с пересчитанными товарами в веса
    count_df['weight'] = func(count_df['count'] / count_df['count'].sum())
    # print(count_df.size)

    # перводим items в датафрейм (сохраняем порядок item'ов)
    items_df = pd.DataFrame(items, columns = ['item_id'])
    # print('items_df = ', items_df)

    # создаём датафрейм item - вероятность с сохранением порядка item'ов
    probability_df = pd.merge(items_df, count_df, on='item_id', how='left')

    # заполняем пустые item's нулями
    probability_df.fillna(value=0, inplace=True)
    # print('probability_df = ', probability_df, probability_df.size)

    # берём столбец вероятностей и переводим его в numpy-array
    p = np.array(probability_df.to_numpy()[:, 2])

    # нормализация вероятностей для того чтобы они  были = 1
    p /= p.sum()  # normalize
    
    return items, p

Предсказания

In [39]:
%%time

# ВЕСА - ЛОГАРИФМ от количества покупок
items, p = data_prep(data_train, np.log)

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['weighted log rand rec'] = \
        result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(3)

Wall time: 3.69 s


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[958826, 824399, 98098, 15783020, 15660208]","[9525357, 1253602, 946707, 878651, 6537551]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[13512876, 713618, 944068, 15926456, 13215961]","[6961668, 2846160, 5567328, 66469, 1051542]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1669677, 1119563, 877368, 851150, 15630089]","[904943, 6555362, 956294, 2022364, 822818]"


In [40]:
%%time

# ВЕСА - КВАДРАТНЫЙ КОРЕНЬ от количества покупок
items, p = data_prep(data_train, np.sqrt)

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['weighted sqrt rand rec'] = \
            result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(2)

Wall time: 3.35 s


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[958826, 824399, 98098, 15783020, 15660208]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[13512876, 713618, 944068, 15926456, 13215961]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]"


In [41]:
%%time

# your_code

# ВЕСА - КВАДРАТ от количества покупок
items, p = data_prep(data_train, np.square)

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['weighted square rand rec'] = \
            result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(2)

Wall time: 4.16 s


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[958826, 824399, 98098, 15783020, 15660208]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[13512876, 713618, 944068, 15926456, 13215961]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]"


## Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма (с вебинара и weighted_random_recommendation) с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество? Почему?

Добавим расчёты по алгоритмам из вебинара

In [42]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""

    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [43]:
%%time

items = data_train.item_id.unique()

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['rand rec'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result.head(3)

Wall time: 4.31 s


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[35511, 984861, 8068454, 17169697, 1318175]","[904943, 6555362, 956294, 2022364, 822818]","[908368, 920755, 901067, 1130029, 1224424]","[879755, 1053690, 1106523, 1082185, 1004906]"


In [44]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [45]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = popularity_recommendation(data_train, n=5)

result['popular rec'] = result['user_id'].apply(lambda x: popular_recs)
result.head(3)

Wall time: 143 ms


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[35511, 984861, 8068454, 17169697, 1318175]","[904943, 6555362, 956294, 2022364, 822818]","[908368, 920755, 901067, 1130029, 1224424]","[879755, 1053690, 1106523, 1082185, 1004906]","[6534178, 6533889, 1029743, 6534166, 1082185]"


### Item-Item Recommender / ItemKNN

In [46]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [47]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [48]:
# Заведем фиктивный item_id (если юзер НЕ покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

# создаём сводную таблицу: в строках - 'user_id', в столбцах - 'item_id'.
# и считаем количество значения 'quantity'
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2497,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2498,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# всё что больше нуля делаем единицей
user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат sparse matrix. sparse matrix позволяют более эффективно хоранить разреженные данные
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Посчитаем плотность заполненных ячеек до:

In [50]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [51]:
interactions / (users*items)

0.010770291654185115

Посчитаем плотность заполненных ячеек после:

In [52]:
user_item_matrix.sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100

5.33770796861036

Она увеличилась в 5 раз: был 1% заполненных ячеек, стало 5%.

Посмотрим, какими числами заданы товары. Они довольно большие.

In [53]:
np.sort(data.item_id.unique())

array([   25671,    26081,    26093, ..., 18000012, 18024155, 18024556],
      dtype=int64)

In [54]:
# перенумеруем пользователей и товары, чтобы они задавались числами от 0 до номера количества товаров.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [55]:
# посмотрим то, как мы переименовали item_id в id
itemid_to_id

{202291: 0,
 397896: 1,
 420647: 2,
 480014: 3,
 545926: 4,
 707683: 5,
 731106: 6,
 818980: 7,
 819063: 8,
 819227: 9,
 819255: 10,
 819304: 11,
 819308: 12,
 819330: 13,
 819518: 14,
 819594: 15,
 819643: 16,
 819765: 17,
 819840: 18,
 819845: 19,
 819927: 20,
 819978: 21,
 820082: 22,
 820122: 23,
 820165: 24,
 820291: 25,
 820301: 26,
 820321: 27,
 820361: 28,
 820486: 29,
 820518: 30,
 820560: 31,
 820701: 32,
 820895: 33,
 821025: 34,
 821083: 35,
 821200: 36,
 821209: 37,
 821219: 38,
 821344: 39,
 821464: 40,
 821556: 41,
 821562: 42,
 821695: 43,
 821730: 44,
 821735: 45,
 821787: 46,
 821867: 47,
 821976: 48,
 822049: 49,
 822073: 50,
 822101: 51,
 822140: 52,
 822178: 53,
 822225: 54,
 822241: 55,
 822339: 56,
 822346: 57,
 822407: 58,
 822517: 59,
 822524: 60,
 822646: 61,
 822677: 62,
 822739: 63,
 822785: 64,
 822936: 65,
 822965: 66,
 823099: 67,
 823176: 68,
 823356: 69,
 823704: 70,
 823721: 71,
 823758: 72,
 823775: 73,
 823862: 74,
 823915: 75,
 823990: 76,
 824005: 

Обучаем модель и получаем рекомендации

In [56]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей и количество потоков (ядер процессора)

## item-item recommender ожидает, что мы подадим в столбцах - user, в строках item. Поэтому транспонируем
model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, # не рекомендуем то, что он уже покупал
                        filter_items=None, 
                        recalculate_user=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))


Wall time: 1.02 s


Рассчитываем для каждого пользователя его рекомендации и добавляем к итоговой таблице сравнений метрик.

In [57]:
%%time

result['itemitem'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

Wall time: 101 ms


In [58]:
result.head(3)

Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[35511, 984861, 8068454, 17169697, 1318175]","[904943, 6555362, 956294, 2022364, 822818]","[908368, 920755, 901067, 1130029, 1224424]","[879755, 1053690, 1106523, 1082185, 1004906]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]"


### Косинусное сходство и CosineRecommender

In [59]:
%%time

model = CosineRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))


Wall time: 1.23 s


In [60]:
%%time

result['cosine'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

Wall time: 97.7 ms


In [61]:
result.head(3)

Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[35511, 984861, 8068454, 17169697, 1318175]","[904943, 6555362, 956294, 2022364, 822818]","[908368, 920755, 901067, 1130029, 1224424]","[879755, 1053690, 1106523, 1082185, 1004906]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]"


### TF-IDF взвешивание и TFIDFRecommender

In [62]:
%%time

model = TFIDFRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))


Wall time: 1.22 s


In [63]:
%%time

result['tfidf'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=False)])

Wall time: 115 ms


In [64]:
result.head(3)

Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[35511, 984861, 8068454, 17169697, 1318175]","[904943, 6555362, 956294, 2022364, 822818]","[908368, 920755, 901067, 1130029, 1224424]","[879755, 1053690, 1106523, 1082185, 1004906]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]"


### Оценки моделей по precision@5

In [65]:
for column in result.columns[2:]:
    print(column, round(result.apply(lambda x: precision_at_k(x[column], x['actual'],  5), axis=1).mean(), 5))

rand rec 0.00088
weighted log rand rec 0.0002
weighted sqrt rand rec 0.00362
weighted square rand rec 0.1285
popular rec 0.15524
itemitem 0.13692
cosine 0.13291
tfidf 0.13898


## Задание 3. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.
- Попробуйте стратегии ансамблирования изученных алгоритмов


**Попробуем улучишь precision, считая их на top 5000**

In [66]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,202291,35911
1,397896,1214994
2,420647,168661
3,480014,371107
4,545926,20134


In [67]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [68]:
%%time

items = top_5000

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['rand rec top 5000'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result.head(2)

Wall time: 855 ms


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf,rand rec top 5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1098844, 921406, 847790, 970202, 977873]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[9547021, 1048462, 1018631, 882288, 1137680]"


In [69]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = top_5000[:5]

result['popular rec top 5000'] = result['user_id'].apply(lambda x: popular_recs)
result.head(3)

Wall time: 2 ms


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf,rand rec top 5000,popular rec top 5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1098844, 921406, 847790, 970202, 977873]","[6534178, 6533889, 6534166, 6544236, 1404121]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[9547021, 1048462, 1018631, 882288, 1137680]","[6534178, 6533889, 6534166, 6544236, 1404121]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[35511, 984861, 8068454, 17169697, 1318175]","[904943, 6555362, 956294, 2022364, 822818]","[908368, 920755, 901067, 1130029, 1224424]","[879755, 1053690, 1106523, 1082185, 1004906]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[6514160, 1076608, 870608, 6034966, 962671]","[6534178, 6533889, 6534166, 6544236, 1404121]"


In [70]:
# функция для генерации items и массива вероятностей
def data_prep_top_5000(df, func):
    # создаём список уникальных товаров
    items = df.item_id.unique()
    # print('items 1 = ', items, items.size)

    # переводим в np массив
    items = np.array(items)[:5000]           # < == фильтруем 5000

    # создаём датафрейм с количеством каждого товара
    count_df = df.groupby('item_id').item_id.agg(['count'])

    # добавляем в датафрейм строчку с пересчитанными товарами в веса
    count_df['weight'] = func(count_df['count'] / count_df['count'].sum())
    # print(count_df.size)

    # перводим items в датафрейм (сохраняем порядок item'ов)
    items_df = pd.DataFrame(items, columns = ['item_id'])
    # print('items_df = ', items_df)

    # создаём датафрейм item - вероятность с сохранением порядка item'ов
    probability_df = pd.merge(items_df, count_df, on='item_id', how='left').head(5000) # <= фильтруем 5000

    # заполняем пустые item's нулями
    probability_df.fillna(value=0, inplace=True)
    # print('probability_df = ', probability_df, probability_df.size)

    # берём столбец вероятностей и переводим его в numpy-array
    p = np.array(probability_df.to_numpy()[:, 2])

    # нормализация вероятностей для того чтобы они  были = 1
    p /= p.sum()  # normalize
    
    return items, p

In [71]:
%%time

# ВЕСА - ЛОГАРИФМ от количества покупок
items, p = data_prep_top_5000(data_train, np.log)

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['weighted log rand rec top 5000'] = \
        result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(3)

Wall time: 339 ms


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf,rand rec top 5000,popular rec top 5000,weighted log rand rec top 5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1098844, 921406, 847790, 970202, 977873]","[6534178, 6533889, 6534166, 6544236, 1404121]","[895680, 1121367, 894968, 1005609, 947865]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[9547021, 1048462, 1018631, 882288, 1137680]","[6534178, 6533889, 6534166, 6544236, 1404121]","[889511, 8090550, 912902, 995628, 995965]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[35511, 984861, 8068454, 17169697, 1318175]","[904943, 6555362, 956294, 2022364, 822818]","[908368, 920755, 901067, 1130029, 1224424]","[879755, 1053690, 1106523, 1082185, 1004906]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[6514160, 1076608, 870608, 6034966, 962671]","[6534178, 6533889, 6534166, 6544236, 1404121]","[833940, 950894, 823356, 1012913, 967065]"


In [72]:
%%time

# ВЕСА - КВАДРАТНЫЙ КОРЕНЬ от количества покупок
items, p = data_prep_top_5000(data_train, np.sqrt)

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['weighted sqrt rand rec top 5000'] = \
            result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(2)

Wall time: 323 ms


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf,rand rec top 5000,popular rec top 5000,weighted log rand rec top 5000,weighted sqrt rand rec top 5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1098844, 921406, 847790, 970202, 977873]","[6534178, 6533889, 6534166, 6544236, 1404121]","[895680, 1121367, 894968, 1005609, 947865]","[1059236, 1118598, 893651, 951412, 864964]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[9547021, 1048462, 1018631, 882288, 1137680]","[6534178, 6533889, 6534166, 6544236, 1404121]","[889511, 8090550, 912902, 995628, 995965]","[845814, 836286, 6979160, 1006555, 973630]"


In [73]:
%%time

# your_code

# ВЕСА - КВАДРАТ от количества покупок
items, p = data_prep_top_5000(data_train, np.square)

# создаём столбец рекомендаций: для каждой строчки user_id создаём случайные рекомендации
result['weighted square rand rec top 5000'] = \
            result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(2)

Wall time: 439 ms


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf,rand rec top 5000,popular rec top 5000,weighted log rand rec top 5000,weighted sqrt rand rec top 5000,weighted square rand rec top 5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[1098844, 921406, 847790, 970202, 977873]","[6534178, 6533889, 6534166, 6544236, 1404121]","[895680, 1121367, 894968, 1005609, 947865]","[1059236, 1118598, 893651, 951412, 864964]","[999999, 981760, 893018, 844179, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[9547021, 1048462, 1018631, 882288, 1137680]","[6534178, 6533889, 6534166, 6544236, 1404121]","[889511, 8090550, 912902, 995628, 995965]","[845814, 836286, 6979160, 1006555, 973630]","[999999, 1029743, 883404, 1082185, 1127831]"


In [74]:
for column in result.columns[2:]:
    print(column, round(result.apply(lambda x: precision_at_k(x[column], x['actual'],  5), axis=1).mean(), 5))

rand rec 0.00088
weighted log rand rec 0.0002
weighted sqrt rand rec 0.00362
weighted square rand rec 0.1285
popular rec 0.15524
itemitem 0.13692
cosine 0.13291
tfidf 0.13898
rand rec top 5000 0.00676
popular rec top 5000 0.04613
weighted log rand rec top 5000 0.00529
weighted sqrt rand rec top 5000 0.01166
weighted square rand rec top 5000 0.10754


**Считая бейзлайны на 5000 товарах, метрики улучшились**

**Попробуем улучшить разные варианты ItemItemRecommender, выбирая число соседей  𝐾.**

In [75]:
%%time

for K in tqdm(range(1, 11)):
    model = ItemItemRecommender(K=K, num_threads=8) # K - кол-во билжайших соседей и количество потоков (ядер процессора)

    ## item-item recommender ожидает, что мы подадим в столбцах - user, в строках item. Поэтому транспонируем
    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=False)

    recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, # не рекомендуем то, что он уже покупал
                        filter_items=None, 
                        recalculate_user=True)
    
    result[f'itemitem K={K}'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.07it/s]

Wall time: 9.33 s





In [76]:
for column in result.columns[2:]:
    print(column, round(result.apply(lambda x: precision_at_k(x[column], x['actual'],  5), axis=1).mean(), 5))

rand rec 0.00088
weighted log rand rec 0.0002
weighted sqrt rand rec 0.00362
weighted square rand rec 0.1285
popular rec 0.15524
itemitem 0.13692
cosine 0.13291
tfidf 0.13898
rand rec top 5000 0.00676
popular rec top 5000 0.04613
weighted log rand rec top 5000 0.00529
weighted sqrt rand rec top 5000 0.01166
weighted square rand rec top 5000 0.10754
itemitem K=1 0.17999
itemitem K=2 0.19201
itemitem K=3 0.18609
itemitem K=4 0.14496
itemitem K=5 0.13692
itemitem K=6 0.14202
itemitem K=7 0.14486
itemitem K=8 0.14721
itemitem K=9 0.14848
itemitem K=10 0.15093


**Видим, что лучшая метрика при K=2.**

Далее идут попытки выполнить остальные пункты ДЗ.

In [77]:
recs

[(2381, 78679.0),
 (3408, 72173.0),
 (2148, 60376.0),
 (3587, 46199.0),
 (300, 42769.0)]

In [78]:
result.head(2)

Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf,...,itemitem K=1,itemitem K=2,itemitem K=3,itemitem K=4,itemitem K=5,itemitem K=6,itemitem K=7,itemitem K=8,itemitem K=9,itemitem K=10
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]",...,"[999999, 1082185, 1029743, 995785, 1004906]","[999999, 1082185, 995242, 1029743, 840361]","[999999, 1082185, 981760, 995242, 1029743]","[999999, 1082185, 981760, 995242, 1127831]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 995242, 840361]","[999999, 1082185, 981760, 995242, 840361]","[999999, 1082185, 981760, 995242, 840361]","[999999, 1082185, 981760, 995242, 840361]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]",...,"[999999, 1082185, 1098066, 6534178, 1127831]","[999999, 1082185, 1098066, 6534178, 826249]","[999999, 1082185, 981760, 1098066, 6534178]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 995242]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 826249]"


In [116]:
result.drop(['test'], axis=1)
result.head(5)

Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf,...,itemitem K=2,itemitem K=3,itemitem K=4,itemitem K=5,itemitem K=6,itemitem K=7,itemitem K=8,itemitem K=9,itemitem K=10,test
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1001618, 9298551, 107051, 9878915, 1092149]","[9525357, 1253602, 946707, 878651, 6537551]","[1097957, 1119942, 1063594, 1044877, 1075283]","[12731808, 1082185, 6534178, 859075, 6533765]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]",...,"[999999, 1082185, 995242, 1029743, 840361]","[999999, 1082185, 981760, 995242, 1029743]","[999999, 1082185, 981760, 995242, 1127831]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 995242, 840361]","[999999, 1082185, 981760, 995242, 840361]","[999999, 1082185, 981760, 995242, 840361]","[999999, 1082185, 981760, 995242, 840361]","[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[840839, 12187908, 865018, 1086629, 1068325]","[6961668, 2846160, 5567328, 66469, 1051542]","[856716, 106963, 1013321, 941769, 611498]","[965842, 1029743, 1082185, 826249, 845193]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]",...,"[999999, 1082185, 1098066, 6534178, 826249]","[999999, 1082185, 981760, 1098066, 6534178]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 995242]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 826249]","[999999, 1082185, 981760, 1098066, 826249]","[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[35511, 984861, 8068454, 17169697, 1318175]","[904943, 6555362, 956294, 2022364, 822818]","[908368, 920755, 901067, 1130029, 1224424]","[879755, 1053690, 1106523, 1082185, 1004906]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]",...,"[999999, 1082185, 981760, 995242, 1029743]","[999999, 1082185, 981760, 995242, 1029743]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 995242, 1098066]","[999999, 1082185, 981760, 995242, 1098066]","[999999, 1082185, 981760, 995242, 840361]","[999999, 1082185, 981760, 995242, 840361]","[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[2612148, 7177159, 1383670, 1117401, 1372140]","[944169, 1148035, 13115533, 1888133, 937095]","[6423998, 1035390, 1011303, 940700, 969932]","[1029743, 1022254, 1082185, 833025, 1053690]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]",...,"[999999, 1082185, 995242, 1029743, 826249]","[999999, 1082185, 981760, 995242, 1029743]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 1098066, 995242]","[999999, 1082185, 981760, 1098066, 995242]","[999999, 1082185, 981760, 995242, 1098066]","[999999, 1082185, 981760, 995242, 1098066]","[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[12949813, 852863, 920920, 9526839, 995188]","[15716785, 5980969, 975029, 9491531, 838258]","[9527161, 843013, 1051211, 913785, 995311]","[859075, 1029743, 1082185, 928049, 6534178]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]",...,"[999999, 1082185, 981760, 995242, 1029743]","[999999, 1082185, 981760, 6534178, 995242]","[999999, 1082185, 981760, 1127831, 995242]","[999999, 1082185, 981760, 1127831, 1098066]","[999999, 1082185, 981760, 1098066, 995242]","[999999, 1082185, 981760, 1098066, 995242]","[999999, 1082185, 981760, 1098066, 995242]","[999999, 1082185, 981760, 1098066, 995242]","[999999, 1082185, 981760, 1098066, 995242]","[835098, 872137, 910439, 924610, 992977, 10412..."


In [None]:
for column in result.columns[2:]:
    result['test'] += result[column]

result.head(5)

In [143]:
def n_most_freq_elem(arr, n):
 
    um = {}
    for i in range(len(arr)):
        if arr[i] in um:
            um[arr[i]] += 1
        else:
            um[arr[i]] = 1
    a = [0] * (len(um))
    j = 0
    for i in um:
        a[j] = [i, um[i]]
        j += 1
    a = sorted(a, key=lambda x: x[0],
               reverse=True)
    a = sorted(a, key=lambda x: x[1],
               reverse=True)
 
    # display the top k numbers
    print(n, "numbers with most occurrences are:")
    
    elems = []
    for i in range(n):
        print(a[i][0], end=" ")
        elems.append(a[i][0])
    #print('elems=', elems)
    return elems
        
    
list_1 = [1,1,1,1,1,2,2,2,3,3,3,4,4,5]
n_most_freq_elem(list_1, 3)

3 numbers with most occurrences are:
1 3 2 

[1, 3, 2]

In [None]:
result['stack'] = result['test'].apply(lambda x: n_most_freq_elem(result['test'], 5))
reault.head(5)

In [None]:
result['all items'] = \
            result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

In [None]:
for column in result.columns[2:]:

    result['voting'] = result[column].append(column)

result.head(2)

In [None]:
result['weighted square rand rec top 5000'] = \
            result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

In [None]:
result.head(2)

## Задание 4. Улучшение детерминированных алгоритмов
На семинаре мы рассматривали 



Далее $U \equiv N_i(u) $

$$r_{u,i} =  \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)r_{v, i}$$
$$ S = \sum\limits_{v \in U} \operatorname{sim}(u,v)$$

Предлагается улучшить эту формулу и учесть средние предпочтения всех пользователей

$$r_{u,i} = \mu + \bar{r_u} + \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)(r_{v, i}-\bar{r_{v}} - \mu)$$

Какие смысл имееют $ \mu $ и $ \bar{r_u}$ ?

Реализуйте алгоритм, прогнозирующий рейтинги на основе данной формулы, на numpy (векторизованно!)

В качестве схожести возьмите CosineSimilarity.

Примените к user_item_matrix. В качестве рейтингов возьмите количество или стоимость купленного товара. 
Данный алгоритм предсказывает рейтинги. Как на основании предсказанных рейтингов предсказать факт покупки?

Предложите вариант.
Посчитайте accuracy@5 и сравните с алгоритмами, разобранными на вебинаре.