In [1]:
import pandas as pd
import numpy as np
import re # пришлось использовать, т.к. при импорте датасета, сохранённого из тетрадки 2го вебинара, некооректно подтягивались значения
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [4]:
data = pd.read_csv('../../retail_train.csv')
data.head(5)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [5]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [6]:
data_train.describe()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
count,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0,2278490.0
mean,1271.764,32945260000.0,349.1402,2791955.0,100.6171,3.09511,2992.061,-0.5393603,1562.467,50.56328,-0.01646478,-0.002915685
std,726.9816,3964679000.0,167.6271,3673791.0,1153.002,4.196106,8693.638,1.23608,402.5741,23.94798,0.2179563,0.03995998
min,1.0,26984850000.0,1.0,25671.0,0.0,0.0,1.0,-130.02,0.0,1.0,-55.93,-7.7
25%,654.0,30035460000.0,208.0,916767.0,1.0,1.27,330.0,-0.69,1306.0,30.0,0.0,0.0
50%,1271.0,32149760000.0,351.0,1027068.0,1.0,2.0,370.0,-0.02,1615.0,51.0,0.0,0.0
75%,1914.0,34338250000.0,494.0,1131351.0,1.0,3.49,422.0,0.0,1846.0,71.0,0.0,0.0
max,2500.0,41297770000.0,635.0,17829230.0,89638.0,840.0,34280.0,3.99,2359.0,91.0,0.0,0.0


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [7]:
# создаём сводную таблицу с показателями валовых продаж в деньгах по каждому объекту (товару)
popularity = data_train.groupby('item_id')['sales_value'].sum().reset_index()
popularity.sort_values(by='sales_value', ascending=False, inplace=True)
popularity.rename(columns={'sales_value': 'gross_sales'}, inplace=True)

# подсчитываем сумму продаж ВСЕХ товаров
total_sum = popularity['gross_sales'].sum()

# посчитаем коэф-т по каждому товару
popularity['weight']=popularity['gross_sales']/total_sum
popularity.head(10)

Unnamed: 0,item_id,gross_sales,weight
55470,6534178,447799.94,0.063498
55430,6533889,40483.34,0.005741
28895,1029743,35764.66,0.005071
55465,6534166,30170.77,0.004278
34707,1082185,26029.96,0.003691
16343,916122,25861.84,0.003667
55421,6533765,24764.68,0.003512
37343,1106523,24293.86,0.003445
25064,995242,22937.19,0.003252
51478,5569230,20494.05,0.002906


In [8]:
# создадим вектор весов
item_weights = popularity['weight']

# проверим, что сумма весов по всем товарам = 1
item_weights.sum()

1.0

In [9]:
# создаём вектор объектов (товаров)
items_id = popularity['item_id']

In [10]:
# функция взвешенных k случайных рекомендаций. На вход: 1) вектор товаров (объектов) 2) вектор весов 3) k=членов
def weighted_random_recommendation(items_id, items_weights, n=5):   
    recs = np.random.choice(items_id, size=n, p=items_weights, replace=False)
    return recs.tolist()

In [11]:
recommendations = weighted_random_recommendation(items_id, item_weights, n=5)
print(f'5 случайных взвешенных рекомендаций:\n',recommendations)

5 случайных взвешенных рекомендаций:
 [914512, 5568789, 864893, 1028296, 1002771]


In [12]:
# проверим, что за веса у выбранных нами на предыдущем шаге item_id
popularity.loc[popularity['item_id'].isin (recommendations)]

Unnamed: 0,item_id,gross_sales,weight
25924,1002771,1730.07,0.000245
10695,864893,1313.11,0.000186
51399,5568789,352.79,5e-05
28732,1028296,114.54,1.6e-05
16174,914512,69.95,1e-05


In [13]:
# ресурсы про настройку взвешенной случайной выборки
# https://pynative.com/python-weighted-random-choices-with-probability/
# https://www.python-course.eu/weighted_choice_and_sample.php
# https://www.geeksforgeeks.org/how-to-get-weighted-random-choice-in-python/
# https://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choicea

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [14]:
def precision_at_k(recommended_list, bought_list, k=5):
        
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    #print(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list#[:k]
    
    flags = np.isin(bought_list, recommended_list)
    #print(flags)
    
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [16]:
# загрузка данных из файла
result = pd.read_pickle('predictions_basic.pickle')
result.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[14026177, 973880, 1089820, 949556, 852282]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[13073225, 1388846, 845538, 5996651, 222031]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[868389, 833260, 10121970, 868907, 9686413]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[10312288, 13943490, 45800, 858687, 10210707]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]","[999999, 1082185, 1029743, 1127831, 995785]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1104245, 948832, 984591, 884118, 8119018]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]","[999999, 1082185, 1029743, 1098066, 6534178]"


In [17]:
# сформируем список названий столбцов, в которых содержатся рекомендации
col_names = result.columns[2:]
col_names = col_names[:6]
col_names

Index(['random_recommendation', 'popular_recommendation', 'itemitem', 'cosine',
       'tfidf', 'own_purchases'],
      dtype='object')

In [18]:
# создаём новые столбцы и рассчитываем для каждого юзера метрику "precision at k"
for col_name in col_names:
    new_name = col_name + '_prec'
    print(col_name, new_name)
    result[new_name ]=None
    for index, row in result.iterrows():
        acts = row['actual']
        recs = row[col_name]
        prec = precision_at_k(recs, acts,5)
        result[new_name][index] = prec

random_recommendation random_recommendation_prec


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


popular_recommendation popular_recommendation_prec
itemitem itemitem_prec
cosine cosine_prec
tfidf tfidf_prec
own_purchases own_purchases_prec


In [19]:
result.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,random_recommendation_prec,popular_recommendation_prec,itemitem_prec,cosine_prec,tfidf_prec,own_purchases_prec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[14026177, 973880, 1089820, 949556, 852282]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0,0.2,0.4,0.2,0.2,0.4
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[13073225, 1388846, 845538, 5996651, 222031]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]",0,0.0,0.0,0.0,0.0,0.0
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[868389, 833260, 10121970, 868907, 9686413]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]",0,0.0,0.2,0.0,0.0,0.0
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[10312288, 13943490, 45800, 858687, 10210707]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]","[999999, 1082185, 1029743, 1127831, 995785]",0,0.2,0.2,0.4,0.4,0.2
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1104245, 948832, 984591, 884118, 8119018]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]","[999999, 1082185, 1029743, 1098066, 6534178]",0,0.4,0.2,0.2,0.2,0.4


In [20]:
# выводим среднюю точность по каждой методике
for col_name in col_names:
    prec_col_name = col_name+"_prec"
    print(prec_col_name, result[prec_col_name].mean())
#     print(f'метрика по {col_name}: {result[col_name].sum()}, среднее {result[col_name].mean}')

random_recommendation_prec 0.0013712047012732617
popular_recommendation_prec 0.15523996082272082
itemitem_prec 0.13692458374142857
cosine_prec 0.13290891283055686
tfidf_prec 0.1389813907933383
own_purchases_prec 0.17969311132876015


### лучшим алгоритмом оказался own_purchases и popular_recommendation

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.
- Попробуйте стратегии ансамблирования изученных алгоритмов

In [None]:
#### план действий:
1. Создадим датафрейм с топ-5000 (по продажам в деньгах) товаров