In [2]:
!pip install implicit



In [491]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k
from math import log
import random

In [493]:
pd.set_option('max_colwidth', 400)

In [494]:
data = pd.read_csv('retail_train.csv')
data

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2396799,1613,41655820646,663,16102849,1,2.00,3262,-1.15,1231,95,0.0,0.0
2396800,1001,41655829421,663,13217063,1,1.69,3131,0.00,2231,95,0.0,0.0
2396801,1001,41655829421,663,13217800,1,1.69,3131,0.00,2231,95,0.0,0.0
2396802,1167,41656790510,663,6410462,22451,43.98,3385,-0.65,1059,95,0.0,0.0


In [495]:
data['week_no'].nunique()

95

In [496]:
my_result = data.groupby('user_id')['item_id'].unique().reset_index()
my_result.columns=['user_id', 'actual']
my_result.head(5)

Unnamed: 0,user_id,actual
0,1,"[825123, 831447, 840361, 845307, 852014, 854982, 856942, 912676, 940947, 945805, 958046, 977545, 991024, 1006546, 1024128, 1043064, 1055201, 1055341, 1056509, 1056775, 1074612, 1082185, 1131115, 5566027, 5577022, 7167962, 7170493, 7410037, 7431134, 8069064, 852662, 997025, 1030547, 1049998, 1055831, 6773032, 8090541, 9297615, 9527290, 841266, 865178, 953561, 995242, 995904, 1033142, 1075074, 1..."
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 5567582, 5568489, 5569230, 9365106, 846833, 978318, 899824, 901606, 1075368, 1133018, 824005, 833715, 839753, 847573, 853904, 866950, 874972, 903674, 904236, 941183, 958252, 979746, 1003421, 1022843, 1053924, 1119879, 9416729, 9707240, 32124, 32916, 33441, 65636, 100794, 105289, 107358, 170879, 244960, 406740, 845078, 933835, 935317, 951590, 999389, ..."
2,3,"[866211, 878996, 882830, 904360, 921345, 931940, 937791, 951590, 964968, 1082185, 1089954, 1092026, 1114597, 1118235, 1121321, 1133018, 12132312, 916381, 933248, 1020581, 1106523, 9337369, 9337581, 822346, 826385, 833715, 835347, 864615, 868953, 869573, 877358, 932761, 945090, 964594, 965267, 974306, 989935, 996458, 1001788, 1003188, 1003421, 1018859, 1049708, 1061885, 1079319, 5592931, 603496..."
3,4,"[836163, 857849, 877523, 878909, 883932, 891423, 897125, 901032, 904973, 907631, 910109, 933246, 938566, 939907, 940766, 943030, 944534, 955018, 964968, 997089, 1001953, 1013868, 1029743, 1044078, 1050320, 1053754, 1056778, 1076056, 1086560, 1090711, 1096317, 1121367, 1135995, 5569230, 6391541, 7466859, 9677093, 10282046, 822178, 824546, 835530, 868075, 871741, 887003, 887393, 889212, 902172, ..."
4,5,"[938983, 5980822, 1012352, 825538, 1002499, 6904776, 870826, 889509, 911270, 927491, 937626, 941797, 961353, 1004596, 1008173, 1114597, 1126899, 1130951, 9487553, 835986, 866292, 894544, 901557, 925054, 946308, 952254, 962850, 992765, 1009837, 1022011, 1022053, 1050851, 1071939, 1119060, 1130581, 5995589, 9553284, 9878869, 1084591, 925607, 843084, 982386, 1117059, 5995609, 9420336, 842236, 891..."
...,...,...
2494,2496,"[840361, 852159, 871756, 886703, 899624, 916122, 948272, 953339, 956609, 998239, 999270, 1000753, 1020581, 1056509, 1077231, 1084590, 1095486, 1105467, 1138132, 6463742, 6533902, 6534177, 6602365, 859984, 866227, 890401, 921863, 1061416, 1098708, 9553047, 825343, 835922, 849589, 865178, 932529, 961554, 985480, 1006184, 1032023, 1037337, 1042907, 1043095, 1057836, 1083721, 1085803, 1110572, 112..."
2495,2497,"[838220, 1037840, 1052294, 5569230, 8090537, 1022428, 5569471, 5569845, 8177622, 849202, 906625, 908318, 5591083, 6904419, 820133, 843343, 871514, 1030577, 1108168, 1077659, 1101956, 961379, 1009977, 1004436, 1018457, 7147162, 900802, 951590, 954675, 1135834, 5585510, 1031864, 892531, 1065067, 1119089, 883404, 1081177, 844165, 1116068, 5582527, 948420, 1124729, 874972, 912914, 942525, 947292, ..."
2496,2498,"[824555, 835576, 901776, 904023, 911215, 917494, 935302, 951526, 972742, 1005186, 1031697, 1036432, 1070820, 9677846, 1044500, 1049832, 5576075, 9526411, 9527290, 963234, 984140, 894236, 908531, 945909, 828737, 907303, 960862, 985480, 1025457, 1052603, 1116663, 8090449, 9221006, 1106523, 860487, 119230, 7413235, 6704135, 870608, 1021709, 6773212, 12384332, 12384697, 1126899, 828867, 986912, 10..."
2497,2499,"[838186, 853197, 864143, 883665, 932949, 933835, 1067695, 1132771, 903230, 1044655, 821344, 822346, 826249, 831763, 835819, 846417, 846830, 851287, 851676, 856827, 857612, 859075, 861279, 866292, 880007, 880150, 884039, 885858, 886787, 893018, 897954, 898212, 899624, 901460, 909130, 913785, 919902, 924804, 931911, 937571, 944317, 944466, 947798, 952698, 953476, 961772, 966660, 967762, 971949, ..."


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [498]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity.columns=['item_id', 'sales_sum']

In [499]:
popularity['weight'] = popularity['sales_sum'].apply(lambda x: np.log(x + 1))

In [500]:
popularity.sort_values('sales_sum')

Unnamed: 0,item_id,sales_sum,weight
44237,1416192,0.00,0.000000
85393,15596891,0.00,0.000000
79271,13072908,0.00,0.000000
66107,9526347,0.00,0.000000
32977,1063554,0.00,0.000000
...,...,...,...
35054,1082185,27291.02,10.214350
56228,6534166,31298.96,10.351372
29195,1029743,37981.91,10.544892
56193,6533889,42645.75,10.660706


In [501]:
def weighted_random_recommendation(items_weights, n):

    selected_item = random.choices(items_weights['item_id'], items_weights['weight'], k=n)

    return selected_item

In [502]:
%%time

my_result['weighted_random'] = my_result['user_id'].apply(lambda x: weighted_random_recommendation(popularity, 5))

CPU times: user 29.3 s, sys: 88.1 ms, total: 29.4 s
Wall time: 29.7 s


In [503]:
my_result['weighted_random'][5]

[15972849, 1028088, 835973, 820936, 1039455]

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [560]:
result = pd.read_csv('predictions_basic.csv')
result.head(5)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 907957 914190 943316\n 951954 954486 958046 962568 969231 971585 979707 986947\n 990656 995242 1004906 1005186 1042083 1050310 1060819 1062002\n 1064441 1069733 1074612 1082185 1131115 1132771 6534544 13876341\n 15971874 17178953 883616 917704 931860 961554 1002032 1031190\n 8090541 8293439 929761...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 909638 913202 920626\n 958154 994891 1053690 1083328 1096727 6463658 7167218 7167249\n 9526563 9526886 13842214],"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"
2,6,[ 920308 926804 946489 1006718 1017061 1078346 1104227 1108624\n 1110392 1120741 13776852 17105539 825541 870315 873654 874149\n 956672 1071939 5580166 6979393 14106553 909479 7431990 6553035\n 12263667 13382461 1329768 820165 847270 849843 948650 997284\n 1123407 897811 14111230 822812 840361 845208 995242 1015296\n 1024306 1037863 105591...,"[13416054, 936084, 7410040, 9527114, 377218]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1098066, 826249, 1106523, 923746, 1058997]","[1098066, 826249, 860776, 854852, 1068719]","[1098066, 826249, 860776, 1068719, 916122]","[999999, 1082185, 1029743, 6534178, 1127831]"
3,7,[ 840386 889774 898068 909714 929067 953476 954543 976998\n 993838 994928 1003188 1056418 1064054 1064441 1073224 1082185\n 1106523 1108844 1110779 1122085 1122358 1126899 1135694 5571310\n 5590695 6034857 10121622 10255525 10285022 10285186 12731506 14111027\n 14111539 15452140 16809431 945909 966058 986912 1018769 1117035\n 850841 1085476 1269602...,"[5574336, 990072, 868548, 995880, 842226]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1098066, 840361, 883404, 916122]","[981760, 1098066, 883404, 1004906, 859075]","[981760, 883404, 1098066, 859075, 916122]","[999999, 1082185, 1029743, 1127831, 995785]"
4,8,[ 835098 872137 910439 924610 992977 1041259 5569230 5569471\n 9337369 9337581 13071586 15629920 819255 823356 837579 839243\n 840361 848319 858001 862070 868984 892618 905690 911812\n 913425 914190 919180 920421 937573 939275 945805 946484\n 953476 966103 969932 1012050 1012816 1043230 1048237 1070080\n 1085095 1115801 111657...,"[1277401, 94446, 3133282, 1925252, 855699]","[6534178, 6533889, 1029743, 6534166, 1082185]","[904360, 13115903, 13189726, 13190294, 15596515]","[904360, 5588666, 1096036, 979707, 1013321]","[904360, 1096036, 5588666, 979707, 1013321]","[999999, 1082185, 1029743, 1098066, 6534178]"
...,...,...,...,...,...,...,...,...
2037,2496,[6534178],"[932962, 845876, 1578206, 13159156, 175031]","[6534178, 6533889, 1029743, 6534166, 1082185]","[995242, 1127831, 1029743, 1004906, 866211]","[1127831, 1004906, 962568, 995242, 900698]","[1127831, 1004906, 995242, 995303, 962568]","[999999, 1082185, 1098066, 6534178, 995785]"
2038,2497,[ 1016709 9835695 1132298 16809501 845294 871756 873654 1023226\n 1068719 1078912 1115187 1127338 1134222 1135834 5569230 6961763\n 6979579 7025204 7441679 8091550 8177622 8291322 9363315 9670830\n 12731684 12949719 13212942 13511457 15592590 15831255 15972687 1103513\n 1118533 1126899 820361 884896 896938 938139 965719 995134\n 1057855 1077709 83251...,"[5726138, 15717067, 908287, 915356, 5558852]","[6534178, 6533889, 1029743, 6534166, 1082185]","[961554, 908531, 923746, 916122, 1133018]","[961554, 1096036, 923746, 1041259, 933835]","[961554, 1096036, 923746, 1041259, 5592931]","[999999, 1082185, 1029743, 1098066, 6534178]"
2039,2498,[15716530 834484 901776 914190 958382 972437 1039835 1070820\n 1134713 17105483 943316 1079248 903567 7025363 12731413 32439\n 73048 386608 404300 15722019 15722025 15722029 15780991 15781095\n 933389 10121837 850281 1022066 978879 1036347 13115589 836916\n 989824 1072843 1082185 6773190 14112071 16208725 998119],"[9836300, 12326063, 7414863, 1730240, 7441652]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1098066, 1127831, 995242, 883404]","[981760, 1098066, 1127831, 859075, 883404]","[981760, 1098066, 1127831, 859075, 995242]","[999999, 1082185, 1029743, 1068719, 1106523]"
2040,2499,[ 867188 877580 902396 914190 951590 958137 976998 1024629\n 1060872 1106091 1114050 1119993 1125271 5568729 5569230 5569471\n 5569845 5570048 5583288 6602711 9526159 9837383 10150194 12670635\n 13654811 13945244],"[849152, 952122, 906965, 16124540, 1008041]","[6534178, 6533889, 1029743, 6534166, 1082185]","[995242, 1127831, 840361, 1029743, 904360]","[1127831, 961554, 840361, 904360, 995785]","[961554, 1127831, 840361, 995242, 995785]","[999999, 1082185, 1098066, 6534178, 1004906]"


Вставлю свой взвешенный рекоммендер в общий список, чтобы посчитать метрику и для него

In [561]:
result.insert(7, "weighted_random", my_result['weighted_random'], True)

In [562]:
def precision(recommended_list, bought_list, k):

    recommended_list = recommended_list[:k]

    flags = np.isin(recommended_list, bought_list)

    precision = flags.sum() / len(recommended_list)

    return precision

In [563]:
result.iloc[[0]]

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,weighted_random,own_purchases
0,1,[ 821867 834484 856942 865456 889248 907957 914190 943316\n 951954 954486 958046 962568 969231 971585 979707 986947\n 990656 995242 1004906 1005186 1042083 1050310 1060819 1062002\n 1064441 1069733 1074612 1082185 1131115 1132771 6534544 13876341\n 15971874 17178953 883616 917704 931860 961554 1002032 1031190\n 8090541 8293439 929761...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[9527323, 826597, 980943, 1082990, 961980]","[999999, 1082185, 1029743, 995785, 1004906]"


Преобразуем столбец "actual" в лист intов:

In [564]:
result['actual'] = result['actual'].apply(lambda x: x.replace('[', '').replace(']', '').replace('\n', '')) # убираем лишние символы

for i in range(result.shape[0]): # убираем двойные пробелы
  while '  ' in result['actual'][i]:
    result['actual'][i] = result['actual'][i].replace('  ', ' ')

for i in range(result.shape[0]): # убираем первый пробел
  if result['actual'][i][0] == ' ':
    result['actual'][i] = result['actual'][i][1:]

result['actual'] = result['actual'].apply(lambda x: x.split(' ')) # разделяем str по пробелам в список

for i in range(result.shape[0]): # переделываем str в int
  for j in range(len(result['actual'][i])):
    result['actual'][i][j] = int(result['actual'][i][j])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['actual'][i] = result['actual'][i].replace('  ', ' ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['actual'][i] = result['actual'][i][1:]


Преобразуем все рекоммендеры в лист intов:

In [565]:
recommenders = ['random_recommendation', 'popular_recommendation',	'itemitem',	'cosine',	'tfidf', 'own_purchases']

In [566]:
for recommender in recommenders:
  result[recommender] = result[recommender].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').split(', ')) # убираем лишние символы

  for i in range(result.shape[0]): # переделываем str в int
    for j in range(len(result[recommender][i])):
      result[recommender][i][j] = int(result[recommender][i][j])

In [567]:
all_recommenders = result.columns.to_list()[2:8]

In [568]:
for i in range(len(all_recommenders)):

  print(f'{all_recommenders[i]}: {round(result.apply(lambda x: precision(x[i + 2], x[8], 5),1).mean(), 5)}');

random_recommendation: 0.0002
popular_recommendation: 0.38786
itemitem: 0.00059
cosine: 0.00049
tfidf: 0.00049
weighted_random: 0.00157


Лучшее качество показывает алгоритм popular_recommendation

### Задание 3*. Улучшение бейзлайнов

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров

In [515]:
top_5000 = popularity.sort_values('sales_sum', ascending=False)[:5000]

In [516]:
top_5000

Unnamed: 0,item_id,sales_sum,weight
56233,6534178,467993.62,13.056212
56193,6533889,42645.75,10.660706
29195,1029743,37981.91,10.544892
56228,6534166,31298.96,10.351372
35054,1082185,27291.02,10.214350
...,...,...,...
16333,914188,279.13,5.635254
59752,7410342,279.13,5.635254
29914,1036297,279.07,5.635040
33883,1071845,279.05,5.634968


### 1.1 Random recommendation

In [569]:
items = top_5000.item_id.unique()

In [541]:
def random_recommendation(items, n):

    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)

    return recs.tolist()

In [571]:
%%time

result['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, 5))

CPU times: user 342 ms, sys: 0 ns, total: 342 ms
Wall time: 461 ms


### 1.2 Popularity-based recommendation

In [572]:
def popularity_recommendation(items, n):

    popular = top_5000.groupby('item_id')['sales_sum'].sum().reset_index()
    popular.sort_values('sales_sum', ascending=False, inplace=True)

    recs = popular.head(n).item_id

    return recs.tolist()

In [573]:
%%time

popular_recs = popularity_recommendation(items, 5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)

CPU times: user 9.31 ms, sys: 0 ns, total: 9.31 ms
Wall time: 15 ms


### 1.3 Weighted random recommender

In [574]:
def weighted_random_recommendations(items_weights, n):

    selected_item = random.choices(items_weights['item_id'], items_weights['weight'], k=n)

    return selected_item

In [576]:
%%time

result['weighted_random'] = result['user_id'].apply(lambda x: weighted_random_recommendations(top_5000.reset_index(), 5))

CPU times: user 2.48 s, sys: 12.6 ms, total: 2.49 s
Wall time: 2.5 s


Presicion для всех моделей, random, popular и weighted random улучшены:

In [578]:
for i in range(len(all_recommenders)):

  print(f'{all_recommenders[i]}: {round(result.apply(lambda x: precision(x[i + 2], x[8], 5),1).mean(), 5)}');

random_recommendation: 0.00049
popular_recommendation: 0.38786
itemitem: 0.00059
cosine: 0.00049
tfidf: 0.00049
weighted_random: 0.00186
