In [1]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/8.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/8.9 MB[0m [31m25.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/8.9 MB[0m [31m49.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m8.6/8.9 MB[0m [31m82.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m8.9/8.9 MB[0m [31m82.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [133]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys
import inspect
import shutil
from math import log

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

src_dir = 'src'
os.makedirs(src_dir, exist_ok=True)

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items

In [116]:
data = pd.read_csv('transaction_data.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [14]:
popularity = data_train.groupby('item_id')['user_id'].nunique().reset_index().sort_values('user_id')  / data_train['user_id'].nunique()

In [104]:
item_features = pd.read_csv('product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65..."
1,3,"[823704, 834117, 840244, 913785, 917816, 93870..."


###Очистка данных

In [117]:
def prefilter_items(data):

    n_items_before = data['item_id'].nunique()

    # Уберем самые популярные товары (их и так купят)
    popularity = data_train.groupby('item_id')['user_id'].nunique().reset_index() / data_train['user_id'].nunique() #вычисляем популярность товаров (сколько пользователей купило / общее кол-во пользователей)
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist() #берем список топ-товаров с популярностью > 0.5
    data = data[~data['item_id'].isin(top_popular)] # убираем из данных транзакции по топ-товарам

    n_items_after = data['item_id'].nunique()
    print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist() #берем список НЕтоп-товаров с популярностью < 0.01
    data = data[~data['item_id'].isin(top_notpopular)] # убираем из данных транзакции по НЕтоп-товарам

    n_items_after = data['item_id'].nunique()
    print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

    # Уберем товары, которые не продавались за последние 12 месяцев (52 недели)
    data = data[data['week_no'] > 50]

    n_items_after = data['item_id'].nunique()
    print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

    # Уберем не интересные для рекоммендаций категории (department)
    category = pd.DataFrame(item_features.groupby('department')['item_id'].count().sort_values(ascending=False)).reset_index()
    category.columns = ['department', 'count_items']
    category = category[category['count_items'] > 150]

    top_category_items = item_features[item_features['department'].isin(category)].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]

    n_items_after = data['item_id'].nunique()
    print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
    data['price'] = data['sales_value'] / data['quantity']
    data = data[data['price'] > 2]

    n_items_after = data['item_id'].nunique()
    print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

    # Уберем товары с нулем в quantity
    data = data[~(data['quantity'] == 0)]

    n_items_after = data['item_id'].nunique()
    print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

    # Уберем слишком дорогие товары
    data = data[data['price'] <= 50]

    n_items_after = data['item_id'].nunique()
    print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

    return data

In [118]:
data_train = prefilter_items(data_train)

Decreased # items from 90386 to 90386
Decreased # items from 90386 to 90386
Decreased # items from 90386 to 64770
Decreased # items from 90386 to 64770
Decreased # items from 90386 to 43608
Decreased # items from 90386 to 43586
Decreased # items from 90386 to 43526


###Сделаем матрицу взаимодействий

In [154]:
user_item_matrix = pd.pivot_table(data_train,
                                  index='user_id', columns='item_id',
                                  values='quantity',
                                  aggfunc='count',
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float)

sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,25671,26636,26889,27764,27812,27951,27978,28018,28041,28158,...,18106229,18107424,18118989,18119016,18119983,18120172,18120717,18120821,18121351,18183625
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [155]:
user_item_matrix.shape

(2468, 43526)

In [156]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [121]:
#user_item_matrix = bm25_weight(user_item_matrix.T).T  # Применяется к item-user матрице !

In [219]:
%%time

model = AlternatingLeastSquares(factors=20,
                                regularization=1000,
                                iterations=5,
                                calculate_training_loss=True,
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

  0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 2min 6s, sys: 3min 10s, total: 5min 16s
Wall time: 2min 54s


### Перенос всех метрик в модуль src.metrics.py

In [129]:
def hit_rate(recommended_list, bought_list):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    flags = np.isin(recommended_list, bought_list)
    hit_rate = int(flags.sum() > 0)

    return hit_rate


def hit_rate_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    flags = np.isin(recommended_list[:k], bought_list,)
    hit_rate = int(flags.sum() > 0)

    return hit_rate

def precision(recommended_list, bought_list):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)

    return precision


def precision_at_k(recommended_list, bought_list, k=5):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    bought_list = bought_list
    recommended_list = recommended_list[:k]

    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)


    return precision


def money_precision_at_k(recommended_list, bought_list, prices_recommended, k=5):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    prices_recommended = np.array(prices_recommended)

    bought_list = bought_list
    recommended_list = recommended_list[:k]
    prices_recommended = prices_recommended[:k]

    flags = np.isin(bought_list, recommended_list)
    precision = (flags*prices_recommended).sum() / prices_recommended.sum()

    return precision

def recall(recommended_list, bought_list):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    flags = np.isin(bought_list, recommended_list)

    recall = flags.sum() / len(bought_list)

    return recall


def recall_at_k(recommended_list, bought_list, k = 4):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    flags = np.isin(bought_list, recommended_list[:k])
    recall = flags.sum() / len(bought_list)
    return recall


def money_recall_at_k(recommended_list, bought_list, prices, k = 4):
    all_revenue = get_revenue = 0

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    flags = np.isin(bought_list, recommended_list[:k])

    for i in range(len(bought_list)):
      if flags[i]:
        get_revenue += int(prices[prices['item_id'] == bought_list[i]].price)

    for i in range(len(bought_list)):
      all_revenue += int(prices[prices['item_id'] == bought_list[i]].price)

    recall = get_revenue / all_revenue

    return recall

def ap_k(recommended_list, bought_list, k=5):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    flags = np.isin(recommended_list, bought_list)

    if sum(flags) == 0:
        return 0

    sum_ = 0
    for i in range(k):

        if flags[i]:
            p_k = precision_at_k(recommended_list, bought_list, k=i+1)
            sum_ += p_k

    result = sum_ / k

    return result

def mrr_at_k(recommended_list, bought_list, k = 4):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    rank = 0

    flags = np.isin(recommended_list[:k], bought_list)

    for i in range(len(recommended_list[:k])):
      if flags[i]:
        rank = 1 / (i + 1)
        break

    return rank

def ndcg_at_k(recommended_list, bought_list, k=4):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    DCG = 0
    IDCG = 0

    flags = np.isin(recommended_list[:k], bought_list)

    for i in range(len(flags)):
      if flags[i] == 1:
        if i == 0:
          DCG += 1
        else:
          DCG += 1 / log(i + 2)

    for i in range(len(bought_list)):
      if i == 0:
        IDCG += 1
      else:
        IDCG += 1 / log(i + 2)

    nDCG = DCG / IDCG

    return nDCG

In [131]:
functions_to_move = [
    hit_rate,
    hit_rate_at_k,
    precision,
    precision_at_k,
    money_precision_at_k,
    recall,
    recall_at_k,
    money_recall_at_k,
    ap_k,
    mrr_at_k,
    ndcg_at_k
]

with open(os.path.join(src_dir, 'metrics.py'), 'a', encoding='utf-8') as f:
    for function in functions_to_move:
        function_source = inspect.getsource(function)
        f.write(function_source)
        f.write('\n')

### Перенос функции prefilter_items в модуль src.utils.py

In [126]:
functions_to_move = [
    prefilter_items
]

with open(os.path.join(src_dir, 'utils.py'), 'a', encoding='utf-8') as f:
    for function in functions_to_move:
        function_source = inspect.getsource(function)
        f.write(function_source)
        f.write('\n')

In [140]:
class MainRecommender:

    def __init__(self, data, weighting=True):

        # your_code. Это не обязательная часть. Но если вам удобно что-либо посчитать тут - можно это сделать

        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def prepare_matrix(data):

        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',
                                          aggfunc='count',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)

        return user_item_matrix

    @staticmethod
    def prepare_dicts(user_item_matrix):

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors, regularization=0.001, iterations=5, num_threads=4):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                             regularization=regularization,
                                             iterations=iterations,
                                             num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        user_id = self.userid_to_id[user]
        similar_users = self.own_recommender.similar_users(user_id, N+1)

        similar_users = similar_users[1:]

        items = []
        for similar_user_id in similar_users:
            recs = self.own_recommender.recommend(similar_user_id, self.user_item_matrix.T.tocsr(), N=1)
            items.append(self.id_to_itemid[recs[0][0]])

        res = list(set(items))

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

        user_id = self.userid_to_id[user]

        top_items = self.user_item_matrix.loc[user_id].sort_values(ascending=False).head(N)

        similar_items = []
        for item_id, score in top_items.iteritems():
            recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)
            similar_items.extend([self.id_to_itemid[rec[0]] for rec in recs[1:]])

        res = list(set(similar_items))

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

In [150]:
# class_to_move = [
#     MainRecommender
# ]

# with open(os.path.join(src_dir, 'recommenders.py'), 'a', encoding='utf-8') as f:
#     for my_class in class_to_move:
#         class_source = inspect.getsource(my_class)
#         f.write(class_source)
#         f.write('\n')

Проверка, что все работает

In [151]:
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender