# **Задание**

Выполнить Weighted Random Recommendation, расчитать метрики и улучшить baseline.

# **Импорт данных и библиотек**

In [2]:
!pip install implicit



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k



In [4]:
data = pd.read_csv('/content/sample_data/retail_train.csv')
data.head(10)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364.0,-0.6,1631.0,1.0,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364.0,0.0,1631.0,1.0,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364.0,-0.3,1631.0,1.0,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364.0,0.0,1631.0,1.0,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364.0,-0.39,1631.0,1.0,0.0,0.0
5,2375,26984851516,1,826249,2,1.98,364.0,-0.6,1642.0,1.0,0.0,0.0
6,2375,26984851516,1,1043142,1,1.57,364.0,-0.68,1642.0,1.0,0.0,0.0
7,2375,26984851516,1,1085983,1,2.99,364.0,-0.4,1642.0,1.0,0.0,0.0
8,2375,26984851516,1,1102651,1,1.89,364.0,0.0,1642.0,1.0,0.0,0.0
9,2375,26984851516,1,6423775,1,2.0,364.0,-0.79,1642.0,1.0,0.0,0.0


In [5]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

# **Weighted Random Recommendation**

Создадим код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(10)

Unnamed: 0,user_id,actual
0,1,"[840361, 852662, 856942, 865456, 877391, 89309..."
1,2,"[897674, 860299, 914190, 978332, 1075368]"
2,3,"[827683, 828557, 868764, 878996, 883003, 88569..."
3,4,"[912669, 936470, 1070928, 1088145, 6391088, 67..."
4,5,"[829621, 1004385, 1010259, 1076889, 1112387, 5..."
5,6,"[8204101, 863447, 896369, 981760, 1041259, 108..."
6,7,"[826249, 828867, 847573, 862682, 866211, 87963..."
7,8,"[825994, 829722, 834826, 837579, 841179, 84214..."
8,9,"[826249, 851683, 896085, 907647, 910032, 10297..."
9,13,"[826249, 922307, 941734, 1069621, 6534178, 967..."


In [7]:
def weighted_random_recommendation(items, items_weights, n=5):
    """Случайные рекомендации

    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """

    items = np.array(items)
    items_weights = np.array(items_weights)
    recs = np.random.choice(items, n, replace=False, p = items_weights)

    return recs.tolist()

In [8]:
data_train['probability'] = 1 / len(data_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train['probability'] = 1 / len(data_train)


In [9]:
items = data_train.item_id.unique()
items_weights = data_train.groupby(['item_id']).sum(['probability'])

In [10]:
items_weights = items_weights['probability']

In [11]:
items_weights

item_id
25671       2.126530e-06
26081       7.088433e-07
26190       7.088433e-07
26355       7.088433e-07
26426       7.088433e-07
                ...     
14111400    7.088433e-07
15449423    7.088433e-07
15449950    7.088433e-07
15452693    7.088433e-07
15452874    7.088433e-07
Name: probability, Length: 69956, dtype: float64

In [12]:
result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items, items_weights, n=5))
result.head(10)

Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[840361, 852662, 856942, 865456, 877391, 89309...","[33247, 13039978, 1029909, 1068705, 9297364]"
1,2,"[897674, 860299, 914190, 978332, 1075368]","[13095714, 885070, 1100813, 1105917, 973679]"
2,3,"[827683, 828557, 868764, 878996, 883003, 88569...","[855165, 5574708, 5573583, 2059618, 1202772]"
3,4,"[912669, 936470, 1070928, 1088145, 6391088, 67...","[701054, 12351815, 944216, 8248987, 698804]"
4,5,"[829621, 1004385, 1010259, 1076889, 1112387, 5...","[1128687, 13007033, 5564901, 1110458, 975134]"
5,6,"[8204101, 863447, 896369, 981760, 1041259, 108...","[898363, 853038, 963815, 951246, 1456282]"
6,7,"[826249, 828867, 847573, 862682, 866211, 87963...","[1109714, 8118542, 933703, 9552971, 922307]"
7,8,"[825994, 829722, 834826, 837579, 841179, 84214...","[2673691, 9799043, 6424171, 8204027, 9220230]"
8,9,"[826249, 851683, 896085, 907647, 910032, 10297...","[1125892, 832298, 997040, 1099638, 631812]"
9,13,"[826249, 922307, 941734, 1069621, 6534178, 967...","[1002236, 10144031, 983339, 878467, 1067346]"


# **Расчет метрик**

Рассчитаем Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Оценим, какой алгоритм показывает лучшее качество.

In [13]:
result = pd.read_csv('/content/sample_data/predictions_basic.csv')
result.head(10)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[13416054, 936084, 7410040, 9527114, 377218]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1098066, 826249, 1106523, 923746, 1058997]","[1098066, 826249, 860776, 854852, 1068719]","[1098066, 826249, 860776, 1068719, 916122]","[999999, 1082185, 1029743, 6534178, 1127831]"
3,7,[ 840386 889774 898068 909714 929067 ...,"[5574336, 990072, 868548, 995880, 842226]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1098066, 840361, 883404, 916122]","[981760, 1098066, 883404, 1004906, 859075]","[981760, 883404, 1098066, 859075, 916122]","[999999, 1082185, 1029743, 1127831, 995785]"
4,8,[ 835098 872137 910439 924610 992977 ...,"[1277401, 94446, 3133282, 1925252, 855699]","[6534178, 6533889, 1029743, 6534166, 1082185]","[904360, 13115903, 13189726, 13190294, 15596515]","[904360, 5588666, 1096036, 979707, 1013321]","[904360, 1096036, 5588666, 979707, 1013321]","[999999, 1082185, 1029743, 1098066, 6534178]"
5,9,[ 864335 990865 1029743 9297474 10457112 ...,"[966310, 1114417, 896976, 15717035, 1054783]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 883404, 1133018, 1004906, 961554]","[981760, 883404, 961554, 1044078, 844179]","[981760, 883404, 1044078, 961554, 923746]","[999999, 1082185, 1029743, 1098066, 1127831]"
6,13,[ 6534178 1104146 829197 840361 862070 ...,"[911088, 13133787, 10311511, 59994, 15596516]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1098066, 883404, 904360, 995785]","[981760, 1098066, 1044078, 883404, 962568]","[981760, 1098066, 1044078, 883404, 995785]","[999999, 1082185, 1029743, 6534178, 1127831]"
7,14,[ 840601 867293 933067 951590 952408 ...,"[1038153, 1119575, 1018818, 6039624, 12487492]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 840361, 1004906, 1106523, 961554]","[1082185, 961554, 1004906, 844179, 840361]","[1082185, 961554, 840361, 844179, 1004906]","[999999, 1029743, 1098066, 6534178, 1127831]"
8,15,[ 910439 1082185 959076 1023958 1082310 ...,"[1014751, 10456468, 10254529, 1219801, 9829777]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1098066, 840361, 904360, 883404, 951590]","[1098066, 840361, 883404, 872729, 962568]","[1098066, 840361, 1041259, 872729, 883404]","[999999, 1082185, 1029743, 1127831, 995785]"
9,16,[ 1062973 1082185 13007710],"[1323659, 1127403, 1008237, 1887925, 1004173]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 995242, 1029743, 1127831]","[1082185, 981760, 995242, 1029743, 840361]","[981760, 1082185, 995242, 1029743, 9834988]","[999999, 866227, 12263788, 1035843, 1084551]"


Добавляем print в функцию и посмотрим, что передается в неё.

In [14]:
def precision_at_k(result_recomendation, actual_recomendation, k=5):
    # get top-k recommendations from result_recomendation
    top_k = result_recomendation[:k]
    print(top_k)
    # calculate the number of correct recommendations
    num_correct = len(set(top_k).intersection(actual_recomendation))
    print(num_correct)
    print(set(top_k))
    print(set(actual_recomendation))
    # calculate precision@k
    precision = num_correct / k
    return precision

In [15]:
# Запустим функцию на одной строке

precision_at_k(result.iloc[0]['random_recommendation'], result.iloc[0]['actual'])

[5586
4
{'6', '8', '[', '5'}
{'4', '6', ' ', '1', '3', '7', ']', '5', '8', '[', '9', '0', '\n', '2'}


0.8

In [16]:
result.iloc[0]['actual']

'[  821867   834484   856942   865456   889248   907957   914190   943316\n   951954   954486   958046   962568   969231   971585   979707   986947\n   990656   995242  1004906  1005186  1042083  1050310  1060819  1062002\n  1064441  1069733  1074612  1082185  1131115  1132771  6534544 13876341\n 15971874 17178953   883616   917704   931860   961554  1002032  1031190\n  8090541  8293439  9297615  9527329 15926712  1049998   861272   869465\n   877373   908213   933913   940947   945809   959316   978974  1031697\n  1041796  1048918  1081189  1101422  1115576  1122428  1132231  1132814\n  5577022  8091601  9296986  9677939 10356149 13417048 15741823 15830875]'

In [17]:
# Очистим 'actual' и посмотрим, что получается

result['actual'] = result['actual'].apply(lambda x: [int(el) for el in x.replace('[', '').replace(']', '').split()])

In [18]:
def precision_at_k(result_recomendation, actual_recomendation, k=5):
    # get top-k recommendations from result_recomendation
    top_k = result_recomendation[:k]
    # calculate the number of correct recommendations
    num_correct = len(set(top_k).intersection(actual_recomendation))
    # calculate precision@k
    precision = num_correct / k
    return precision

In [19]:
result.apply(lambda row: precision_at_k(eval(row['random_recommendation']), row['actual']), axis=1).mean()

0.0005876591576885408

Теперь корректно.

In [20]:
result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()

0.0

In [21]:
result.apply(lambda row: precision_at_k(row['itemitem'], row['actual']), axis=1).mean()

0.0

In [22]:
result.apply(lambda row: precision_at_k(row['cosine'], row['actual']), axis=1).mean()

0.0

In [23]:
result.apply(lambda row: precision_at_k(row['tfidf'], row['actual']), axis=1).mean()

0.0

In [24]:
result.apply(lambda row: precision_at_k(row['own_purchases'], row['actual']), axis=1).mean()

0.0

# **Улучшение бейзлайнов и ItemItem**

- Попробуем улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуем улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [25]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26190,1
3,26355,2
4,26426,1


In [26]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [27]:
# Создадим фиктивный item_id (если пользователь покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 6666

user_item_matrix = pd.pivot_table(data_train,
                                  index='user_id', columns='item_id',
                                  values='quantity',
                                  aggfunc='count',
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат sparse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(10)

item_id,6666,51716,202291,397896,420647,480014,818980,819063,819255,819304,...,13040302,13071902,13115493,13115548,13115937,13158064,13189726,13506200,13511722,13512965
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [None]:
%%time

model = ItemItemRecommender(K=10, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций
                        filter_already_liked_items=False,
                        filter_items=None,
                        recalculate_user=True)

  0%|          | 0/2497 [00:00<?, ?it/s]

In [None]:
result['itemitem'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in
                    model.recommend(userid=userid_to_id[x],
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5,
                                    filter_already_liked_items=False,
                                    filter_items=None,
                                    recalculate_user=True)])

In [None]:
result.head(10)

In [None]:
result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()