In [1]:
import pandas as pd
import numpy as np
from numpy import linalg
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

from sklearn.metrics.pairwise import cosine_similarity

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import mean_average_precision_at_k, AUC_at_k, ndcg_at_k

import re
import datetime

In [2]:
PATH_TO_RETAIL_TRAIN = '../урок 2/retail_train.csv'
data = pd.read_csv(PATH_TO_RETAIL_TRAIN)
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [4]:
def precision_at_k(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

### Задание 0. Товар 999999
На вебинаре мы использовали товар 999999 - что это за товар?  
Зачем он нужен?  
Используя этот товар мы смещаем качество рекомендаций.
В какую сторону?   
Можно ли удалить этот товар?   
Уберите этот товар и сравните с качеством на семинаре.

In [5]:
def top_k_items(dataset, feature, k):
    
    dataset = dataset.sort_values(feature, ascending=False).head(k).item_id.tolist()
    
    return dataset

In [6]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [7]:
top_5000 = top_k_items(popularity, 'n_sold', 5000)
df = data_train.copy()
df.loc[~df['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(df,
                                  index='user_id', 
                                  columns='item_id', 
                                  values='quantity', 
                                  aggfunc='count', 
                                  fill_value=0)

user_item_matrix[user_item_matrix > 0] = 1
user_item_matrix = user_item_matrix.astype(float)

In [8]:
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
user_ids = user_item_matrix.index.values
item_ids = user_item_matrix.columns.values

matrix_user_ids = np.arange(len(user_ids))
matrix_item_ids = np.arange(len(item_ids))

id_to_user_id = dict(zip(matrix_user_ids, user_ids))
id_to_item_id = dict(zip(matrix_item_ids, item_ids))

user_id_to_id = dict(zip(user_ids, matrix_user_ids))
item_id_to_id = dict(zip(item_ids, matrix_item_ids))

In [10]:
compare_df = df.groupby('user_id')['item_id'].unique().reset_index()
compare_df.columns = ['user_id', 'actual']
compare_df['actual'] = compare_df['actual'].apply(lambda x: list(x))

In [56]:
compare_df

Unnamed: 0,user_id,actual,recommendation_with_999999
0,1,"[999999, 840361, 845307, 852014, 856942, 91267...","[129724, 396935, 113949, 491426, 123522]"
1,2,"[854852, 930118, 1077555, 1098066, 999999, 556...","[129724, 396935, 113949, 491426, 421024]"
2,3,"[866211, 878996, 882830, 904360, 921345, 99999...","[129724, 396935, 113949, 421024, 123522]"
3,4,"[999999, 857849, 883932, 891423, 897125, 90103...","[129724, 396935, 113949, 421024, 123522]"
4,5,"[999999, 889509, 937626, 941797, 1004596, 1114...","[129724, 396935, 113949, 123522, 421024]"
...,...,...,...
2494,2496,"[840361, 999999, 871756, 886703, 899624, 91612...","[129724, 396935, 113949, 421024, 123522]"
2495,2497,"[999999, 1037840, 5569230, 8090537, 1022428, 5...","[129724, 396935, 113949, 421024, 491426]"
2496,2498,"[824555, 835576, 901776, 904023, 911215, 91749...","[129724, 396935, 113949, 421024, 491426]"
2497,2499,"[838186, 999999, 864143, 932949, 933835, 10676...","[129724, 396935, 113949, 421024, 123522]"


In [58]:
def recommendation(model_of_rec_sys, df, k, feature_name, exit_col_name, u_i_matrix, u_id_to_id, id_to_i_id):
    model = model_of_rec_sys(K=k)
    model.fit(csr_matrix(u_i_matrix).T.tocsr(), show_progress=True)
    

    df[exit_col_name] = df[feature_name].apply(lambda x: [id_to_i_id[rec[0]] for rec in model.recommend(userid=u_id_to_id[x], 
                                                                        user_items=csr_matrix(u_i_matrix).tocsr(), 
                                                                        N=5, 
                                                                        filter_already_liked_items=False, 
                                                                        filter_items=None, 
                                                                        recalculate_user=True)])

In [13]:
recommendation(ItemItemRecommender, compare_df, 5, 'user_id', 'recommendation_with_999999', user_item_matrix, user_id_to_id, id_to_item_id)

  0%|          | 0/5001 [00:00<?, ?it/s]

In [25]:
df_second = data_train.copy()
df_second = df_second.loc[df_second['item_id'].isin(top_5000)]

user_item_matrix_without_99999 = pd.pivot_table(df_second,
                                  index='user_id', 
                                  columns='item_id', 
                                  values='quantity', 
                                  aggfunc='count', 
                                  fill_value=0)

user_item_matrix_without_99999[user_item_matrix_without_99999 > 0] = 1
user_item_matrix_without_99999 = user_item_matrix_without_99999.astype(float)

In [26]:
user_ids_without_99999 = user_item_matrix_without_99999.index.values
item_ids_without_99999 = user_item_matrix_without_99999.columns.values

matrix_user_ids_without_99999 = np.arange(len(user_ids_without_99999))
matrix_item_ids_without_99999 = np.arange(len(item_ids_without_99999))

id_to_user_id_without_99999 = dict(zip(matrix_user_ids_without_99999, user_ids_without_99999))
id_to_item_id_without_99999 = dict(zip(matrix_item_ids_without_99999, item_ids_without_99999))

user_id_to_id_without_99999 = dict(zip(user_ids_without_99999, matrix_user_ids_without_99999))
item_id_to_id_without_99999 = dict(zip(item_ids_without_99999, matrix_item_ids_without_99999))

In [28]:
recommendation(ItemItemRecommender, compare_df, 5, 'user_id', 'recommendation_without_999999', user_item_matrix_without_99999, user_id_to_id_without_99999, id_to_item_id_without_99999)

  0%|          | 0/5000 [00:00<?, ?it/s]

KeyError: 62

In [None]:
compare_df['precition_with_999999'] = compare_df.apply(lambda x: precision_at_k(x['recommendation_with_999999'], x['actual']), axis=1)
compare_df['precition_with_999999'].sum()/compare_df.shape[0]

In [None]:
compare_df['precition_without_999999'] = compare_df.apply(lambda x: precision_at_k(x['recommendation_without_999999'], x['actual']), axis=1)
compare_df['precition_without_999999'].sum()/compare_df.shape[0]

*Выше мы видим что убрав не популярные товары, мы так же потеряли некоторых пользователей, которые покупали только непопулятрные товары. По-этому целесеобразно не терять пользователей и заменять товары не вошедшие в популярные на один общий item_id*

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. вес = log(sales_sum товара)
- Придумайте пример 3 весов, посчитайте weighted_random_recommendation для разных весов

In [59]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    
    recs = np.random.choice(items_weights['item_id'].values, p=items_weights['weight'].values, size=n, replace=False)
    
    return recs.tolist()

In [60]:
popular_items = data_train.groupby('item_id')['quantity'].sum().reset_index()

popular_items['quantity'] = popular_items['quantity'].apply(lambda x: np.log(1 + x))
sales_values_sum = popular_items['quantity'].sum()
popular_items['quantity'] = popular_items['quantity'].apply(lambda x: x/sales_values_sum)

popular_items = popular_items.rename(columns={'quantity': 'weight'}, inplace=False)

popular_items.sum()

item_id    4.340828e+11
weight     1.000000e+00
dtype: float64

In [61]:
weighted_random_recommendation(popular_items)

[1119769, 6904798, 108404, 5984458, 979452]

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма (с вебинара и weighted_random_recommendation) с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество? Почему?

In [62]:
PATH_TO_PREDS ='../урок 2/predictions.csv'
result = pd.read_csv(PATH_TO_PREDS)# закгрузка predict с семианара
result.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[12734433, 9371822, 1017539, 911036, 9493557]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[5995494, 13876585, 1088112, 1027049, 1788883]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1614902, 1039131, 963011, 991542, 918875]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[6919446, 1077409, 9527054, 2073510, 1098569]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]","[999999, 1082185, 1029743, 1127831, 995785]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1091981, 924475, 1151125, 5565356, 12484600]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]","[999999, 1082185, 1029743, 1098066, 6534178]"


#### Пребразуем строку с item_id в список

In [63]:
# result['column_name'].map(lambda x: x[1:-1].split(', ')).apply(lambda x: list(map(int, x)))[0]

In [64]:
# result.apply(lambda x: precision_at_k(x['popular_recommendation'], x['actual_l'],  5), axis=1)

In [65]:
result['weighted_random_recommendation'] = \
 result['user_id'].apply(lambda x: weighted_random_recommendation(popular_items, n=5))

In [66]:
for column in result.select_dtypes(include='object'):

    result[column] = result[column].apply(lambda x: re.findall(r'\d+', x) if isinstance(x, str) else x)

In [67]:
results_columns = result.select_dtypes(include='object').drop(columns=['actual']).columns.to_list()

In [69]:
for column in results_columns:
    result[f'{column}_predicts'] = result.apply(lambda x: precision_at_k(x[column], x['actual']), axis=1)

  mask |= (ar1 == a)


In [73]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation,random_recommendation_predicts,popular_recommendation_predicts,itemitem_predicts,cosine_predicts,tfidf_predicts,own_purchases_predicts,weighted_random_recommendation_predicts
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[12734433, 9371822, 1017539, 911036, 9493557]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]","[915041, 9297062, 871353, 13213121, 13002981]",0.0,0.2,0.4,0.2,0.2,0.4,0.0
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[5995494, 13876585, 1088112, 1027049, 1788883]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]","[1033846, 5592629, 7442338, 8069056, 822129]",0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Задание 3. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.
- Попробуйте стратегии ансамблирования изученных алгоритмов


#### Бейзлайны

In [74]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [75]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [76]:
result_ex_3 = data_train.groupby('user_id')['item_id'].unique().reset_index()
result_ex_3.columns = ['user_id', 'boughted']
result_ex_3['boughted'] = result_ex_3['boughted'].apply(lambda x: list(x))
result_ex_3.head()

Unnamed: 0,user_id,boughted
0,1,"[825123, 831447, 840361, 845307, 852014, 85498..."
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55..."
2,3,"[866211, 878996, 882830, 904360, 921345, 93194..."
3,4,"[836163, 857849, 877523, 878909, 883932, 89142..."
4,5,"[938983, 5980822, 1012352, 825538, 1002499, 69..."


In [79]:
%%time

result_ex_3['random_recommendation'] = result_ex_3['user_id'].apply(lambda x: random_recommendation(top_5000, n=5))

CPU times: user 1.35 s, sys: 12.9 ms, total: 1.37 s
Wall time: 1.37 s


In [80]:
data_train_top_5k = df.copy()

In [89]:
top_5k_df = data_train_top_5k.groupby('item_id').sum().reset_index()
top_5k_df = top_5k_df.loc[top_5k_df['item_id'].isin(top_5000)]
top_5k_df.head()

Unnamed: 0,item_id,user_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,202291,8972,128195649604,1066,35911,81.59,12649,-2.37,6125,155,0.0,0.0
1,397896,121367,3634768517712,39016,1214994,2862.41,411755,-41.26,161701,5642,0.0,0.0
2,420647,24815,523073269193,6875,168661,463.81,48541,-7.82,22765,991,0.0,0.0
3,480014,38817,1145894973275,13301,371107,863.41,197759,-18.51,50295,1925,0.0,0.0
4,545926,75,33215406569,422,20134,58.37,184,-0.6,1509,61,0.0,0.0


In [90]:
result_ex_3['popularity_recommendation'] = result_ex_3['user_id'].\
    apply(lambda x: popularity_recommendation(top_5k_df, n=5))
result_ex_3

Unnamed: 0,user_id,boughted,random_recommendation,popularity_recommendation
0,1,"[825123, 831447, 840361, 845307, 852014, 85498...","[869195, 5568197, 1022843, 927681, 837677]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55...","[1124029, 891752, 909396, 8203834, 1013572]","[6534178, 6533889, 1029743, 6534166, 1082185]"
2,3,"[866211, 878996, 882830, 904360, 921345, 93194...","[977166, 821787, 7025496, 835694, 1050310]","[6534178, 6533889, 1029743, 6534166, 1082185]"
3,4,"[836163, 857849, 877523, 878909, 883932, 89142...","[1110244, 1016800, 7433888, 1033846, 6602729]","[6534178, 6533889, 1029743, 6534166, 1082185]"
4,5,"[938983, 5980822, 1012352, 825538, 1002499, 69...","[850139, 1110843, 999581, 1097815, 1102185]","[6534178, 6533889, 1029743, 6534166, 1082185]"
...,...,...,...,...
2494,2496,"[840361, 852159, 871756, 886703, 899624, 91612...","[873902, 1051305, 932761, 6464208, 824311]","[6534178, 6533889, 1029743, 6534166, 1082185]"
2495,2497,"[838220, 1037840, 1052294, 5569230, 8090537, 1...","[1046551, 871611, 1134398, 983659, 839346]","[6534178, 6533889, 1029743, 6534166, 1082185]"
2496,2498,"[824555, 835576, 901776, 904023, 911215, 91749...","[1029504, 1063364, 1117393, 937614, 911017]","[6534178, 6533889, 1029743, 6534166, 1082185]"
2497,2499,"[838186, 853197, 864143, 883665, 932949, 93383...","[947200, 880377, 1093403, 984677, 995134]","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [91]:
df_top_5k_weighted = data_train_top_5k.groupby('item_id')['quantity'].sum().reset_index()

In [93]:
result_ex_3['weighted_random_reccomendation'] = result_ex_3['user_id'].apply(lambda x: weighted_random_recommendation(popular_items))

In [94]:
result_ex_3.head()

Unnamed: 0,user_id,boughted,random_recommendation,popularity_recommendation,weighted_random_reccomendation
0,1,"[825123, 831447, 840361, 845307, 852014, 85498...","[869195, 5568197, 1022843, 927681, 837677]","[6534178, 6533889, 1029743, 6534166, 1082185]","[954997, 905809, 12171180, 1090349, 2028029]"
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55...","[1124029, 891752, 909396, 8203834, 1013572]","[6534178, 6533889, 1029743, 6534166, 1082185]","[15479309, 972168, 886941, 958819, 833025]"
2,3,"[866211, 878996, 882830, 904360, 921345, 93194...","[977166, 821787, 7025496, 835694, 1050310]","[6534178, 6533889, 1029743, 6534166, 1082185]","[4767858, 64578, 1050543, 10121996, 15597019]"
3,4,"[836163, 857849, 877523, 878909, 883932, 89142...","[1110244, 1016800, 7433888, 1033846, 6602729]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1131171, 950289, 829278, 12430325, 8118618]"
4,5,"[938983, 5980822, 1012352, 825538, 1002499, 69...","[850139, 1110843, 999581, 1097815, 1102185]","[6534178, 6533889, 1029743, 6534166, 1082185]","[2584300, 1142721, 956893, 944526, 1013877]"


#### ItemItemRecommender@k

In [96]:
%%time
for i in range(1, 4):
    model = ItemItemRecommender(K=i)
    model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=True)
    
    result_ex_3[f'itemitem_{i}'] = result_ex_3['user_id'].apply(lambda x: [id_to_item_id[rec[0]] for rec in model.recommend(userid=user_id_to_id[x], 
                                                                user_items=csr_matrix(user_item_matrix).tocsr(), 
                                                                N=5, 
                                                                filter_already_liked_items=False, 
                                                                filter_items=None, 
                                                                recalculate_user=True)])

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: user 31min 38s, sys: 18.6 s, total: 31min 57s
Wall time: 32min 2s


In [98]:
for i in range(1, 4):
    result_ex_3[f'itemitem_{i}'] = result_ex_3.apply(lambda x: precision_at_k(x[f'itemitem_{i}'], x['boughted']), axis=1)

In [104]:
result_ex_3[['itemitem_1', 'itemitem_2', 'itemitem_3']].sum(axis=0)/result_ex_3.shape[0]

itemitem_1    0.037215
itemitem_2    0.042097
itemitem_3    0.040496
dtype: float64

### Задание 4. Улучшение детерминированных алгоритмов
На семинаре мы рассматривали 



Далее $U \equiv N_i(u) $

$$r_{u,i} =  \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)r_{v, i}$$
$$ S = \sum\limits_{v \in U} \operatorname{sim}(u,v)$$

Предлагается улучшить эту формулу и учесть средние предпочтения всех пользователей

$$r_{u,i} = \mu + \bar{r_u} + \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)(r_{v, i}-\bar{r_{v}} - \mu)$$

Какие смысл имееют $ \mu $ и $ \bar{r_u}$ ?

Реализуйте алгоритм, прогнозирующий рейтинги на основе данной формулы, на numpy (векторизованно!)

В качестве схожести возьмите CosineSimilarity.

Примените к user_item_matrix. В качестве рейтингов возьмите количество или стоимость купленного товара. 
Данный алгоритм предсказывает рейтинги. Как на основании предсказанных рейтингов предсказать факт покупки?

Предложите вариант.
Посчитайте accuracy@5 и сравните с алгоритмами, разобранными на вебинаре.

#### Скорее всего $\mu$ среднее по всем оценкам всех пользователей, а $\bar r_u$ это средняя оценка по всем товарам 

In [None]:
def cosine_similarity(u, v):
    
    return (u * v).sum() / (linalg.norm(u) * linalg.norm(v))

In [None]:
def similarity_matrix(df):
    matrix = pd.DataFrame(data=[], columns=df.index.values)
    
    for index, value in df.T.items():
        matrix[index] = df.apply(lambda x: cosine_similarity(value.values, x.values), axis=1)
        
    return matrix

In [None]:
def r_u_i(sim_matrix, r_u, r_u_i, mu):
    
    sim = sim_u_v(u, v) 
    
    r = mu + r_u + (sim * (r_u_i - r_u - mu)).sum()/sim.sum
    
    return r

In [None]:
%%time
similarity_matrix(user_item_matrix)

In [None]:
user_item_matrix.apply(lambda x: print(x), axis=0)

In [None]:
for i in user_item_matrix.items():
    print(i)