<a href="https://colab.research.google.com/github/ErmakovAV/rec_sys/blob/main/hw_webinar_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Вебинар 2. Бейзлайны и детерминированные алгоритмы item-item

![recsys_types.png](attachment:recsys_types.png)

In [None]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp39-cp39-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


[Implicit](https://implicit.readthedocs.io/en/latest/quickstart.html) - очень быстрая и 
эффективная библиотека для рекоммендаций

Основные фичи:
    - Cython под капотом - высокая скорость
    - Множество приближенных алгоритмов - быстрее, чем оригинальные
    - Содежрит большинство популярных алгоритмов
    - Есть алгоритмы ранжирования
    - Поиск похожих товаров / юзеров
    - Есть возможность пересчета "холодного" юзера "на лету"
    - Возможность фильтровать товары при рекомендациях (Например, не рекомендовать женские товары мужчинам)
    - Есть метрики качества

In [None]:
import implicit



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [None]:
def str_list_to_int_list(row):
    "Конвертация str значений строк в int"
    list_int = []
    for i in row.strip('][').split():
        list_int.append(int(i.rstrip(',')))
    return list_int

In [None]:
data = pd.read_csv('/content/retail_train.csv')

In [None]:
data.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [None]:
data['week_no'].nunique()

95

In [None]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [None]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity.describe()

Unnamed: 0,item_id,sales_value
count,89051.0,89051.0
mean,5115772.0,83.458481
std,5178973.0,1628.715079
min,25671.0,0.0
25%,966583.0,3.5
50%,1448516.0,10.78
75%,9553042.0,46.105
max,18024560.0,467993.62


In [None]:
popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
popularity.describe()

Unnamed: 0,item_id,user_id
count,89051.0,89051.0
mean,5115772.0,14.759767
std,5178973.0,45.904111
min,25671.0,1.0
25%,966583.0,1.0
50%,1448516.0,2.0
75%,9553042.0,10.0
max,18024560.0,2039.0


**Note:**  
Еще есть данные по характеристикам товаров и пользователей. Они нам пригодятся через несколько вебинаров

In [None]:
item_features = pd.read_csv('/content/product.csv')
item_features.head(2)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
user_features = pd.read_csv('/content/hh_demographic.csv')
user_features.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


### Train-test split

В рекомендательных системах корректнее использовать train-test split по времени, а не случайно  
Я возьму последние 3 недели в качестве теста

In [None]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [None]:
data_train.shape[0], data_test.shape[0]

(2278490, 118314)

# 1. Бейзлайны

Создадим датафрейм с покупками юзеров на тестовом датасете (последние 3 недели)

In [None]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [None]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


### 1.1 Random recommendation

In [None]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [None]:
%%time

items = data_train.item_id.unique()

result['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result.head(2)

CPU times: user 3.52 s, sys: 10.3 ms, total: 3.53 s
Wall time: 3.54 s


Unnamed: 0,user_id,actual,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1008536, 572651, 835294, 1196505, 1131176]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[16058571, 946094, 1009171, 6979655, 870412]"


### 1.2 Popularity-based recommendation

In [None]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [None]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = popularity_recommendation(data_train, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

CPU times: user 176 ms, sys: 55.9 ms, total: 232 ms
Wall time: 242 ms


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1008536, 572651, 835294, 1196505, 1131176]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[16058571, 946094, 1009171, 6979655, 870412]","[6534178, 6533889, 1029743, 6534166, 1082185]"


### 1.3 Weighted random recommender

Напишите код для случайных рекоммендаций, в которых вероятность рекоммендовать товар прямо пропорциональна логарифму продаж

- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

*Пример*  
item_1 - 5, item_2 - 7, item_3 - 4  # / sum  
item_1 - 5 / 16, item_2 - 7 / 16, item_3 - 4 / 16

In [None]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    
    items = np.array(items_weights['item_id'])
    recs = np.random.choice(items, size=n, replace=False, p=items_weights['weight'].tolist())
    

    return recs.tolist()

In [None]:
%%time

# your_code

items_weights = data.groupby('item_id')['sales_value'].sum().reset_index()
# берем где продажи > 0
popularity_positive = items_weights[items_weights['sales_value'] > 0]
# берем логариф продаж
items_weights.loc[items_weights['sales_value'] > 0, 'sales_sum_log'] = \
    np.log(popularity_positive['sales_value'])

# сделаем нормализацию 0-1
items_weights['sales_sum_log_norm'] = ((items_weights['sales_sum_log']-items_weights['sales_sum_log'].min())/\
                                           (items_weights['sales_sum_log'].max()-items_weights['sales_sum_log'].min()))

# сделаем переведем в вероятность 0-1
items_weights['weight'] = items_weights['sales_sum_log_norm'] / items_weights['sales_sum_log_norm'].sum()

# избавимся от NAN
items_weights.loc[np.isnan(items_weights['weight']), 'weight'] = 0

items_weights.drop('sales_value', axis=1, inplace=True)
items_weights.drop('sales_sum_log', axis=1, inplace=True)
items_weights.drop('sales_sum_log_norm', axis=1, inplace=True)


popular_recs = weighted_random_recommendation(items_weights, n=5)

popular_recs

CPU times: user 148 ms, sys: 75.7 ms, total: 224 ms
Wall time: 229 ms


[13512678, 857465, 1108657, 989935, 9526724]

### Выводы по бейзлайнам
- Фиксируют базовое качество;
- Бейзлайны могут быть фильтрами;
- Иногда бейзлайны лучше ML-модели

# 2. Детерминированные алгоритмы item-item

## 2.1 Item-Item Recommender / ItemKNN

![user_item_matrix.png](attachment:user_item_matrix.png)

То, что именно находится в матрице user-item нужно определять из бизнес-логики

Варианты для нашего датасета(не исчерпывающий список):
    - Факт покупки (0 / 1)
    - Кол-во покупок (count)
    - Сумма покупки, руб
    - ...
    
**Детерминированные алгоритмы**:
    - Предсказывают те числа, которые стоят в матрице

**ML-алгоритмы (большинство)**:
    - В качестве *таргетов* "под капотом" принимают 0 и 1 (в ячейке не 0 -> таргет 1)
    - А абсолютные значения воспринимают как *веса ошибок*
    
*P.S.* На самом деле есть много трюков, как можно заполнять матрицу user-item. Об этом мы поговорим на следующих вебинарах

**Как работает Item-Item Recommender**

![item_item_recommender.png](attachment:item_item_recommender.png)

*Шаг 1:* Ищем K ближайших юзеров к целевому юзеру  
*Шаг 2*: predict "скора" товара = среднему "скору" этого товара у его соседей  
*Шаг 3*: Сортируем товары по убыванию predict-ов и берем топ-k

----
**(!) Важно** 
- У item-item алгоритмов большая сложность predict ($O(I^2 log(I))$ или $O(I^3)$, в зависимости от реализации 
- Если в датасете много item_id, то item-item модели ОЧЕНЬ долго предсказывают. Со всеми товарами predict на тесте ~2 часа
- Давайте возьмем из ~90к товаров только 5k самых популярных 

*P.S.*  Брать топ-Х популярных и рекомендовать только из них - очень популярная стратегия.   
*P.P.S.*  В рекомендательных системах много таких трюков. Что-то подобное в курсе вы увидите еще не раз

In [None]:
# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [None]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [None]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [None]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
sparse_user_item

<2499x5001 sparse matrix of type '<class 'numpy.float64'>'
	with 667080 stored elements in Compressed Sparse Row format>

In [None]:
user_item_matrix.shape

(2499, 5001)

In [None]:
user_item_matrix.sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100

5.33770796861036

Для работы с мвтрицами заведём словари перевода идентификаторов в индексы матриц и обратно.

In [None]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [None]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(sparse_user_item,  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=sparse_user_item,   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 1.75 s, sys: 14.6 ms, total: 1.76 s
Wall time: 2.03 s


In [None]:
recs

(array([3408, 2148, 3947, 2307, 3587], dtype=int32),
 array([56269., 43806., 17219., 12981.,  7431.]))

In [None]:
[id_to_itemid[rec] for rec in recs[0]]

[1082185, 981760, 1127831, 995242, 1098066]

In [None]:
%%time

result['itemitem'] = result['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

CPU times: user 147 ms, sys: 330 µs, total: 147 ms
Wall time: 238 ms


In [None]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1003818, 10204680, 1013434, 1112139, 1123692]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[12330346, 852651, 13008044, 8181078, 1059976]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]"


Попробуем воспроизвести этот алгоритм вручную:

In [None]:
similarity = (sparse_user_item.T@sparse_user_item).toarray()
for ind, row in enumerate(similarity):
  row[ind] = 0
  best = row.argsort()[::-1][:5]
  mask = np.ones(row.shape, bool)
  mask[best] = False
  row[mask] = 0


similarity = csr_matrix(similarity)

In [None]:
# ii_res = (sparse_user_item[userid_to_id[2]]@similarity).toarrey().ravel()
# [ii_res[rec] for rec in ii_res.argsort()[::-1][:7]]

In [None]:
# [id_to_itemid[rec] for rec in ii_res.argsort()[::-1][:7]]

### 4.2 Косинусное сходство и CosineRecommender

![cosine_similarity.png](attachment:cosine_similarity.png)

In [None]:
%%time

model = CosineRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items = sparse_user_item,   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)



  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 2.06 s, sys: 0 ns, total: 2.06 s
Wall time: 1.48 s


In [None]:
[id_to_itemid[rec] for rec in recs[0]]

[1082185, 981760, 1127831, 1098066, 961554]

In [None]:
%%time

result['cosine'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items = [itemid_to_id[999999]], 
                                    recalculate_user=False)[0]])

CPU times: user 94.7 ms, sys: 0 ns, total: 94.7 ms
Wall time: 99.5 ms


In [None]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1003818, 10204680, 1013434, 1112139, 1123692]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[12330346, 852651, 13008044, 8181078, 1059976]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]"


### 4.3 TF-IDF взвешивание и TFIDFRecommender

![tf_idf.png](attachment:tf_idf.png)

Если 2 юзера оба купили очень популярный товар, то это еще не значит,что они похожи   
Если 2 юзера оба купили редкий товар, то они похожи

Занижаем вес популярных товаров при расчете расстояний между пользователями

In [None]:
%%time

model = TFIDFRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items = sparse_user_item,   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)



  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 2.07 s, sys: 0 ns, total: 2.07 s
Wall time: 1.43 s


In [None]:
[id_to_itemid[rec] for rec in recs[0]]

[1082185, 981760, 1127831, 1098066, 961554]

In [None]:
%%time

result['tfidf'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]])

CPU times: user 88.8 ms, sys: 0 ns, total: 88.8 ms
Wall time: 89.9 ms


In [None]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1003818, 10204680, 1013434, 1112139, 1123692]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[12330346, 852651, 13008044, 8181078, 1059976]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]"


### 4.4 Трюк, поменяем немного гиперпараметры

In [None]:
%%time

model = ItemItemRecommender(K=1, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items = sparse_user_item,   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 2.06 s, sys: 13.1 ms, total: 2.08 s
Wall time: 1.47 s


In [None]:
[id_to_itemid[rec] for rec in recs[0]]

[1082185, 1029743, 995785, 1004906, 1081177]

In [None]:
recs

(array([3408, 2757, 2316, 2434, 3397], dtype=int32),
 array([2233., 1317., 1170., 1098., 1048.]))

In [None]:
%%time

result['own_purchases'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]],
                                    recalculate_user=False)[0]])

CPU times: user 75.3 ms, sys: 0 ns, total: 75.3 ms
Wall time: 76 ms


### 4.5 Измерим качество по precision@5

In [None]:
result.head(10)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1003818, 10204680, 1013434, 1112139, 1123692]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[12330346, 852651, 13008044, 8181078, 1059976]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[822283, 1197431, 10456166, 10198378, 1090218]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[6608281, 897298, 863545, 15595911, 267498]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1112860, 15830936, 13190557, 13911394, 1055915]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
5,9,"[864335, 990865, 1029743, 9297474, 10457112, 8...","[1044668, 7441863, 1038011, 1865362, 12302114]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
6,13,"[6534178, 1104146, 829197, 840361, 862070, 884...","[10204557, 914128, 2268605, 965666, 999668]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
7,14,"[840601, 867293, 933067, 951590, 952408, 96569...","[1469767, 12301404, 9884417, 843730, 973311]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
8,15,"[910439, 1082185, 959076, 1023958, 1082310, 13...","[963903, 6545152, 15716378, 6534578, 13190356]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
9,16,"[1062973, 1082185, 13007710]","[322114, 923080, 5579587, 9796434, 983226]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"


In [None]:
result.to_csv('/content/predictions_basic.csv', index=False)

Можно ли улучшить бейзлайны, если считать их на топ-5000 товарах?

In [None]:
# your_code
def precision_at_k(recommended_list, bought_list, k=5):

  bought_list = np.array(bought_list)
  recommended_list = np.array(recommended_list)

  bought_list = bought_list               # здесь нет k
  recommended_list = recommended_list[:k]

  flags = np.isin(bought_list, recommended_list)
  precision = flags.sum() / len(recommended_list)

  return precision

In [None]:
result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

0.00019588638589618023

In [None]:
result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()

0.15523996082272282

In [None]:
result.apply(lambda row: precision_at_k(row['itemitem'], row['actual']), axis=1).mean()

0.14573947110675808

In [None]:
result.apply(lambda row: precision_at_k(row['cosine'], row['actual']), axis=1).mean()

0.13545543584720862

In [None]:
result.apply(lambda row: precision_at_k(row['tfidf'], row['actual']), axis=1).mean()

0.13545543584720862

In [None]:
result.apply(lambda row: precision_at_k(row['own_purchases'], row['actual']), axis=1).mean()

0.16229187071498533

##Задание 1. 

Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж

Можно сэмплировать товары случайно, но пропорционально какому-либо весу
Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [None]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code

    items = np.array(items_weights['item_id'])
    recs = np.random.choice(items, size=n, replace=False, p=items_weights['weight'].tolist())
    
    return recs.tolist()

In [None]:
%%time

# your_code

items_weights = data.groupby('item_id')['sales_value'].sum().reset_index()
# берем где продажи > 0
popularity_positive = items_weights[items_weights['sales_value'] > 0]
# берем логариф продаж
items_weights.loc[items_weights['sales_value'] > 0, 'sales_sum_log'] = \
    np.log(popularity_positive['sales_value'])

# сделаем нормализацию 0-1
items_weights['sales_sum_log_norm'] = ((items_weights['sales_sum_log']-items_weights['sales_sum_log'].min())/\
                                           (items_weights['sales_sum_log'].max()-items_weights['sales_sum_log'].min()))

# сделаем переведем в вероятность 0-1
items_weights['weight'] = items_weights['sales_sum_log_norm'] / items_weights['sales_sum_log_norm'].sum()

# избавимся от NAN
items_weights.loc[np.isnan(items_weights['weight']), 'weight'] = 0

items_weights.drop('sales_value', axis=1, inplace=True)
items_weights.drop('sales_sum_log', axis=1, inplace=True)
items_weights.drop('sales_sum_log_norm', axis=1, inplace=True)


popular_recs = weighted_random_recommendation(items_weights, n=5)

popular_recs

CPU times: user 213 ms, sys: 65.8 ms, total: 278 ms
Wall time: 307 ms


[258910, 6424642, 933864, 1062850, 2323272]

In [None]:
result = pd.read_csv('/content/predictions_basic.csv')
for col in result.iloc[:,1:]:
    result[col] = result.apply(lambda row: str_list_to_int_list(row[col]), axis=1)
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [None]:
result['weighted_random_recommendation'] = result['user_id'].apply\
                                    (lambda x: weighted_random_recommendation(items_weights, n=5))
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]","[90068, 12984698, 814108, 12187380, 13131505]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]","[1095847, 13911207, 509113, 9878726, 947318]"


##Задание 2.

Расчет метрик

Рассчитайте Precision@3 и Recall@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

Precision@3

In [None]:
# your_code
def precision_at_k(recommended_list, bought_list, k=3):

  bought_list = np.array(bought_list)
  recommended_list = np.array(recommended_list)

  bought_list = bought_list               # здесь нет k
  recommended_list = recommended_list[:k]

  flags = np.isin(bought_list, recommended_list)
  precision = flags.sum() / len(recommended_list)

  return precision

In [None]:
result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

0.000326477309826967

In [None]:
result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()

0.13777342474698007

In [None]:
result.apply(lambda row: precision_at_k(row['itemitem'], row['actual']), axis=1).mean()

0.1740124061377734

In [None]:
result.apply(lambda row: precision_at_k(row['cosine'], row['actual']), axis=1).mean()

0.1740124061377734

In [None]:
result.apply(lambda row: precision_at_k(row['tfidf'], row['actual']), axis=1).mean()

0.1740124061377734

In [None]:
result.apply(lambda row: precision_at_k(row['own_purchases'], row['actual']), axis=1).mean()

0.21906627489389485

Recall@5

In [None]:
def recall_at_k(recommended_list, bought_list, k=5):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
  
    flags = np.isin(bought_list, recommended_list)
    recall_at_k =  flags.sum() / len(bought_list)
    
    return recall_at_k

In [None]:
result.apply(lambda row: recall_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

2.147877038335309e-05

In [None]:
result.apply(lambda row: recall_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()

0.02499556380543445

In [None]:
result.apply(lambda row: recall_at_k(row['itemitem'], row['actual']), axis=1).mean()

0.016217944397612788

In [None]:
result.apply(lambda row: recall_at_k(row['cosine'], row['actual']), axis=1).mean()

0.014031051785152234

In [None]:
result.apply(lambda row: recall_at_k(row['tfidf'], row['actual']), axis=1).mean()

0.014031051785152234

In [None]:
result.apply(lambda row: recall_at_k(row['own_purchases'], row['actual']), axis=1).mean()

0.018309253102523456

In [None]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [None]:
for col in result.iloc[:,2:]:
    print(col +": ","{0:.4f}".format(result.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1).mean()))

random_recommendation:  0.0006
popular_recommendation:  0.1552
itemitem:  0.0336
cosine:  0.0353
tfidf:  0.0361
own_purchases:  0.1800
weighted_random_recommendation:  0.0009


In [None]:
def precision_at_k(recommended_list, bought_list, k=3):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [None]:
for col in result.iloc[:,2:]:
    print(col +": ","{0:.4f}".format(result.apply(lambda row: precision_at_k(row[col], row['actual'], k=3), axis=1).mean()))

random_recommendation:  0.0007
popular_recommendation:  0.1378
itemitem:  0.0335
cosine:  0.0353
tfidf:  0.0382
own_purchases:  0.1983
weighted_random_recommendation:  0.0007


In [None]:
def precision_at_k(recommended_list, bought_list, k=10):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [None]:
for col in result.iloc[:,2:]:
    print(col +": ","{0:.4f}".format(result.apply(lambda row: precision_at_k(row[col], row['actual'], k=10), axis=1).mean()))

random_recommendation:  0.0006
popular_recommendation:  0.1552
itemitem:  0.0336
cosine:  0.0353
tfidf:  0.0361
own_purchases:  0.1800
weighted_random_recommendation:  0.0009


##Задание 3*. 

Улучшение бейзлайнов и ItemItem

Попробуйте улучшить бейзлайны, считая их на топ-1000 товаров
Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей  K .

## result_1000

In [None]:
result_1000 = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_1000.columns=['user_id', 'actual']
result_1000.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [None]:
test_users = result_1000.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


In [None]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [None]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = popularity_recommendation(data_train, n=5)

result_1000['popular_recommendation'] = result_1000['user_id'].apply(lambda x: popular_recs)
result_1000.head(2)

CPU times: user 178 ms, sys: 32.3 ms, total: 211 ms
Wall time: 220 ms


Unnamed: 0,user_id,actual,popular_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [None]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [None]:
top_1000 = popularity.sort_values('n_sold', ascending=False).head(1000).item_id.tolist()

In [None]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-1000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_1000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,819255,819518,819765,...,12302069,12384365,12384775,12648296,12810393,12811532,12946027,13115493,13511722,13841744
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
sparse_user_item

<2499x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 297802 stored elements in Compressed Sparse Row format>

In [None]:
user_item_matrix.shape

(2499, 1001)

In [None]:
user_item_matrix.sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100

11.90494179689858

Для работы с мвтрицами заведём словари перевода идентификаторов в индексы матриц и обратно.

In [None]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [None]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(sparse_user_item,  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=sparse_user_item,   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

  0%|          | 0/1001 [00:00<?, ?it/s]

CPU times: user 264 ms, sys: 4.95 ms, total: 269 ms
Wall time: 233 ms


In [None]:
recs

(array([720, 468, 504, 817, 746], dtype=int32),
 array([41647., 33366., 11466., 11162.,  4982.]))

In [None]:
[id_to_itemid[rec] for rec in recs[0]]

[1082185, 981760, 995242, 1127831, 1098066]

In [None]:
%%time

result_1000['itemitem'] = result_1000['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

CPU times: user 74.3 ms, sys: 0 ns, total: 74.3 ms
Wall time: 77.9 ms


In [None]:
result_1000.head(5)

Unnamed: 0,user_id,actual,popular_recommendation,itemitem
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1127831, 1098066]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1127831, 1098066]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1127831, 1098066]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1127831, 1098066]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1127831, 1098066]"


## k = 3, 10

In [None]:
%%time

model = ItemItemRecommender(K=3, num_threads=4) # K - кол-во билжайших соседей

model.fit(sparse_user_item,  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=sparse_user_item,   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

  0%|          | 0/1001 [00:00<?, ?it/s]

CPU times: user 257 ms, sys: 3.65 ms, total: 261 ms
Wall time: 195 ms


In [None]:
recs

(array([720, 468, 504, 590,  66], dtype=int32),
 array([41647.,  2953.,  1402.,  1317.,  1284.]))

In [None]:
[id_to_itemid[rec] for rec in recs[0]]

[1082185, 981760, 995242, 1029743, 840361]

In [None]:
%%time

result_1000['itemitem'] = result_1000['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

CPU times: user 71.3 ms, sys: 0 ns, total: 71.3 ms
Wall time: 72.1 ms


In [None]:
result_1000.head(5)

Unnamed: 0,user_id,actual,popular_recommendation,itemitem
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 840361]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 840361]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 840361]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 840361]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 840361]"


In [None]:
%%time

model = ItemItemRecommender(K=10, num_threads=4) # K - кол-во билжайших соседей

model.fit(sparse_user_item,  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=sparse_user_item,   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

  0%|          | 0/1001 [00:00<?, ?it/s]

CPU times: user 258 ms, sys: 3.31 ms, total: 261 ms
Wall time: 197 ms


In [None]:
recs

(array([720, 468, 504,  66, 817], dtype=int32),
 array([41647., 34332., 28448., 27274., 24849.]))

In [None]:
[id_to_itemid[rec] for rec in recs[0]]

[1082185, 981760, 995242, 840361, 1127831]

In [None]:
%%time

result_1000['itemitem'] = result_1000['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

CPU times: user 79.7 ms, sys: 834 µs, total: 80.6 ms
Wall time: 81.5 ms


In [None]:
result_1000.head(5)

Unnamed: 0,user_id,actual,popular_recommendation,itemitem
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 840361, 1127831]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 840361, 1127831]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 840361, 1127831]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 840361, 1127831]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 840361, 1127831]"


In [None]:
for col in result.iloc[:,2:]:
    print(col +": ","{0:.4f}".format(result.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1).mean()))

random_recommendation:  0.0006
popular_recommendation:  0.1552
itemitem:  0.0336
cosine:  0.0353
tfidf:  0.0361
own_purchases:  0.1800
weighted_random_recommendation:  0.0009
