In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from metrics import precision_at_k, recall_at_k

In [5]:
data = pd.read_csv('data/transaction_data.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(10)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0
5,2375,26984851516,1,826249,2,1.98,364,-0.6,1642,1,0.0,0.0
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0
8,2375,26984851516,1,1102651,1,1.89,364,0.0,1642,1,0.0,0.0
9,2375,26984851516,1,6423775,1,2.0,364,-0.79,1642,1,0.0,0.0


In [7]:
item_features = pd.read_csv('data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65..."
1,3,"[823704, 834117, 840244, 913785, 917816, 93870..."


In [10]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [11]:
# Заведем фиктивный item_id

data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999_999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix)

user_item_matrix.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15926885,15926886,15926887,15926927,15927033,15927403,15927661,15927850,16809471,17105257
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [14]:
def get_recommendations(user, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)]
    return res

# Задание 4

In [17]:
factors_variants = [32, 64, 128]
regularization_variants = [0.05, 0.1]
iterations_variants = [10, 15, 30]

In [25]:
%%time

best_score = 0.0
best_factors = 0
best_regularization = 0
best_iterations = 0

for i in factors_variants:
    for j in regularization_variants:
        for k in iterations_variants:
            model = AlternatingLeastSquares(factors=i, 
                                            regularization=j,
                                            iterations=k,
                                            calculate_training_loss=True, 
                                            num_threads=10,
                                            use_gpu=False)

            print(f'Trying factors = {i}, ' +
                f'regularization = {j}, ' + \
                f'iterations = {k}')

            model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
                    show_progress=True)

            result['als'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))

            score = result.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()

            if score > best_score:
                best_score = score
                best_factors = i
                best_regulatization = j
                best_iterations = k

print(f'Best score = {round(best_score, 2)}, factors = {best_factors}, ' +
    f'regularization = {best_regulatization}, ' + \
    f'iterations = {best_iterations}')

Trying factors = 32, regularization = 0.05, iterations = 10


100%|██████████| 10/10 [00:11<00:00,  1.12s/it, loss=0.0556]


Trying factors = 32, regularization = 0.05, iterations = 15


100%|██████████| 15/15 [00:16<00:00,  1.12s/it, loss=0.055]


Trying factors = 32, regularization = 0.05, iterations = 30


100%|██████████| 30/30 [00:34<00:00,  1.13s/it, loss=0.0541]


Trying factors = 32, regularization = 0.1, iterations = 10


100%|██████████| 10/10 [00:11<00:00,  1.13s/it, loss=0.0556]


Trying factors = 32, regularization = 0.1, iterations = 15


100%|██████████| 15/15 [00:16<00:00,  1.12s/it, loss=0.0548]


Trying factors = 32, regularization = 0.1, iterations = 30


100%|██████████| 30/30 [00:33<00:00,  1.13s/it, loss=0.0541]


Trying factors = 64, regularization = 0.05, iterations = 10


100%|██████████| 10/10 [00:13<00:00,  1.32s/it, loss=0.0499]


Trying factors = 64, regularization = 0.05, iterations = 15


100%|██████████| 15/15 [00:19<00:00,  1.33s/it, loss=0.0491]


Trying factors = 64, regularization = 0.05, iterations = 30


100%|██████████| 30/30 [00:40<00:00,  1.33s/it, loss=0.0482]


Trying factors = 64, regularization = 0.1, iterations = 10


100%|██████████| 10/10 [00:13<00:00,  1.34s/it, loss=0.05]


Trying factors = 64, regularization = 0.1, iterations = 15


100%|██████████| 15/15 [00:20<00:00,  1.34s/it, loss=0.0491]


Trying factors = 64, regularization = 0.1, iterations = 30


100%|██████████| 30/30 [00:40<00:00,  1.33s/it, loss=0.0482]


Trying factors = 128, regularization = 0.05, iterations = 10


100%|██████████| 10/10 [00:15<00:00,  1.52s/it, loss=0.0422]


Trying factors = 128, regularization = 0.05, iterations = 15


100%|██████████| 15/15 [00:22<00:00,  1.52s/it, loss=0.0415]


Trying factors = 128, regularization = 0.05, iterations = 30


100%|██████████| 30/30 [00:45<00:00,  1.53s/it, loss=0.0406]


Trying factors = 128, regularization = 0.1, iterations = 10


100%|██████████| 10/10 [00:15<00:00,  1.52s/it, loss=0.0425]


Trying factors = 128, regularization = 0.1, iterations = 15


100%|██████████| 15/15 [00:22<00:00,  1.53s/it, loss=0.0415]


Trying factors = 128, regularization = 0.1, iterations = 30


100%|██████████| 30/30 [00:45<00:00,  1.52s/it, loss=0.0406]


Best score = 0.16, factors = 32, regularization = 0.05, iterations = 10
CPU times: user 33min 30s, sys: 9min 7s, total: 42min 38s
Wall time: 11min 18s


# Задание 1

__Вопрос:__ Будем отправлять одному юзеру много раз наши рекоммендации. Как добиться того, чтобы они хоть немного отличались?

__Ответ:__ Для этого надо перед каждыми массовыми рассылками заново обучать модель рекомендации на свежих данных.

__Вопрос:__ Нужно ли, чтобы в одной рассылке были *разные* товары? Как определить, что товары *разные*? Как добиться того, чтобы они были разными?

__Ответ:__ В одной рассылке должны быть разные товары, чтобы повысить вероятность заинтересованности. Чем более разные товары, тем больше затрагивается потенциальной области интересов пользователя. Разные товары, они из разных подкатегорий (например, шампуни, шампанское). Если товары из одной подкатегории, но от разных производителей, то такие товары должны считаться одинаковыми. Честно говоря, я пока не знаю, как добиться чтобы в рекомендации из 10 товаров попадались только уникальные подкатегории. Думаю, стоит порекомендовать 100 товаров и из них взять только 10 товаров с уникальными подкатегориями, хотя это костыль.