In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [123]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [124]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [125]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [126]:
pb = data_test.groupby('item_id')['sales_value'].sum().reset_index()
pb.columns=['item_id', 'sales']
r_sum = np.sum(pb['sales'])
pb['sales'] = pb['sales']/r_sum
pb.head(2)

Unnamed: 0,item_id,sales
0,29512,3e-06
1,30356,6e-06


In [127]:
np.sum(pb['sales'])

1.0000000000000002

In [128]:
result.head(5)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [129]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [130]:
def weighted_random_recommendation(pb, n=5):

    items = np.array(pb['item_id'])
    prob = np.array(pb['sales'])
    recs = np.random.choice(items, p = prob, size=n, replace=False)
    
    return recs.tolist()

In [133]:
%%time

items = data_train.item_id.unique()

result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(pb, n=5))

result.head(5)

CPU times: user 1.04 s, sys: 0 ns, total: 1.04 s
Wall time: 1.04 s


Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[883963, 1081177, 1065538, 1012816, 916122]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[13416351, 968992, 1117473, 1132204, 15452330]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[16223139, 936355, 982503, 7155012, 5995129]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[930118, 5568489, 12385916, 916122, 6534178]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[6534178, 847241, 1084203, 1025404, 851676]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [152]:
result = pd.read_csv('./predictions_basic.csv')
result.head(3)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem
0,1,[ 821867 834484 856942 865456 889248 ...,"[9655292, 1046873, 9884806, 1198103, 99401]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[7409670, 1817122, 962266, 1850638, 10312365]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]"
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[890184, 6533843, 867673, 1055708, 1151239]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]"


In [159]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042 entries, 0 to 2041
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   user_id                 2042 non-null   int64 
 1   actual                  2042 non-null   object
 2   random_recommendation   2042 non-null   object
 3   popular_recommendation  2042 non-null   object
 4   itemitem                2042 non-null   object
dtypes: int64(1), object(4)
memory usage: 79.9+ KB


random_recommendation 	

In [218]:
items = result[['user_id','random_recommendation', 'actual']]

In [228]:
def precision(items, rtype):

    items['flags'] = np.isin(items['actual'], items[rtype])
    items['precision'] = items.flags.sum() / len(items[rtype])
    return items


def precision_at_k(items, k=5):
    items['flags'] = np.isin(items['actual'][:k], items['popular_recommendation'][:k])
    
    items['precision'] = np.dot(items.flags, items.actual).sum() / np.sum(items.actual)
    
    items['precision'] = items.flags.sum() / len(items.popular_recommendation)
    
    return items

In [241]:
items = precision(items, rtype='random_recommendation')

In [242]:
items['precision'].unique()

array([0.])

In [243]:
items_pr = result[['user_id','popular_recommendation', 'actual']]

In [244]:
items_pr = precision(items_pr, rtype='popular_recommendation')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  items['flags'] = np.isin(items['actual'], items[rtype])


In [240]:
items_pr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042 entries, 0 to 2041
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 2042 non-null   int64  
 1   popular_recommendation  2042 non-null   object 
 2   actual                  2042 non-null   object 
 3   flags                   2042 non-null   bool   
 4   precision               2042 non-null   float64
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 65.9+ KB


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [None]:
# your_code