Цель: научиться использовать библиотеку Implicit для построения
рекомендательной системы, которая учитывает взаимодействия пользователей с
товарами, представленными в неявном виде.

In [1]:
import implicit
import fireducks.pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
from implicit.evaluation import (
    train_test_split,
    precision_at_k,
    ndcg_at_k,
    AUC_at_k
)

In [2]:
pd.read_csv('data/online_retail_II.csv')

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [3]:
df = pd.read_csv(
    'data/online_retail_II.csv',
    parse_dates=['InvoiceDate'],
    dtype={'StockCode': str, 'Description': str, 'Customer ID': str}
)

df['TotalPrice'] = df['Quantity'] * df['Price']

df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalPrice
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.0


1. Выделить признаки покупателей.\
В качестве таких признаков можно использовать уже данные признаки, а
также необходимо определить минимум еще два косвенных признака, на
основе анализа данных.

In [4]:
cust = df.groupby('Customer ID').agg(
    country = ('Country', 'first')
)


agg = df.groupby('Customer ID').agg(
    total_spent  = ('TotalPrice', 'sum'),
    n_transactions = ('Invoice', 'nunique'),
    unique_products = ('StockCode', 'nunique'),
    avg_qty_per_tx = ('Quantity', 'mean')
)

customer_features = cust.join(agg)
customer_features.head()

Unnamed: 0_level_0,country,total_spent,n_transactions,unique_products,avg_qty_per_tx
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12346.0,United Kingdom,-64.68,17,30,1.083333
12347.0,Iceland,5633.32,8,126,12.988142
12348.0,Finland,2019.4,5,25,53.215686
12349.0,Italy,4404.54,5,139,8.994444
12350.0,Norway,334.4,1,17,11.588235


2. Выделить признаки продуктов.\
В качестве такого признака можно определить рейтинг продукта на основе,
например, количества его покупок и/или частоту его продаж в
определенные дни недели.

In [5]:
prod = df.groupby('StockCode').agg(
    description = ('Description', 'first'),
    unit_price = ('Price', 'mean')
)

df['weekday'] = df['InvoiceDate'].dt.day_name()
weekday = df.pivot_table(
    index='StockCode',
    columns='weekday',
    values='Quantity',
    aggfunc='sum',
    fill_value=0
)

agg_prod = df.groupby('StockCode').agg(
    total_sold = ('Quantity', 'sum'),
    n_customers = ('Customer ID','nunique')
)

product_features = prod.join(agg_prod).join(weekday)
product_features.head()

Unnamed: 0_level_0,description,unit_price,total_sold,n_customers,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10002,INFLATABLE POLITICAL GLOBE,0.97915,7790,164,1757,1048,0,448,1928,1222,1387
10002R,ROBOT PENCIL SHARPNER,5.133333,4,0,2,1,0,0,0,0,1
10080,GROOVY CACTUS INFLATABLE,0.456129,597,23,31,88,0,3,68,370,37
10109,BENDY COLOUR PENCILS,0.21,0,1,0,0,0,0,0,0,0
10120,DOGGY RUBBER,0.23519,-8350,52,159,90,0,71,110,-8931,151


3. На основе выделенных признаков построить матрицу
взаимодействий:\
➢ Строки — покупатели, столбцы — товары, значения — оценки или
факты взаимодействий (например, покупки).\
➢ Матрица может быть разреженной, например, в формате CSR
(Compressed Sparse Row).

In [6]:
cust_ids = customer_features.index.tolist()
prod_ids = product_features.index.tolist()
cust_idx = {cid:i for i, cid in enumerate(cust_ids)}
prod_idx = {pid:i for i, pid in enumerate(prod_ids)}

inter = df.groupby(['Customer ID','StockCode'])['Quantity'].sum().reset_index()

rows = inter['Customer ID'].map(cust_idx)
cols = inter['StockCode'].map(prod_idx)
data = inter['Quantity']  

M = csr_matrix(
    (data, (rows, cols)),
    shape=(len(cust_ids), len(prod_ids))
)

user_item = M.copy()
user_item.data = np.ones_like(user_item.data)
user_item = user_item.tocsr() 

print('Матрица взаимодействий:', M.shape)
print('Ненулевых элементов:', M.nnz)

Матрица взаимодействий: (5942, 5305)
Ненулевых элементов: 483232


2. Обучение модели:\
o Используйте алгоритмы ALS (Alternate Least Squares) и BRP для
обучения модели.

In [7]:
M_bin = M.copy()
M_bin.data = np.ones_like(M_bin.data)

item_user = M_bin.T.tocsr()

als = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=0.1,
    iterations=20,
    calculate_training_loss=True
)

als.fit(user_item)

user_idx = 0
user_items = M_bin.tocsr()
recs_als = als.recommend(user_idx, user_items[user_idx], N=5)

recs_als

  0%|          | 0/20 [00:00<?, ?it/s]

(array([3406, 3401, 3392, 1466, 2577], dtype=int32),
 array([0.6008451 , 0.46363816, 0.3559905 , 0.3184213 , 0.27449685],
       dtype=float32))

In [8]:
bpr = implicit.bpr.BayesianPersonalizedRanking(
    factors=20,
    regularization=0.1,
    iterations=20
)

bpr.fit(user_item)

recs_bpr = bpr.recommend(user_idx, user_items[user_idx], N=5)
recs_bpr

  0%|          | 0/20 [00:00<?, ?it/s]

(array([5248, 4645, 2727, 5290, 1788], dtype=int32),
 array([0.32499725, 0.3002098 , 0.2868575 , 0.27449438, 0.24599528],
       dtype=float32))

3. Генерация рекомендаций:\
o Создайте список топ-N рекомендаций для нескольких
пользователей.\
o Для проверки выберите хотя бы 5 пользователей.

In [9]:
idx_to_prod = {i: pid for i, pid in enumerate(prod_ids)}

sample_user_idxs = [0, 25, 50, 75, 100]

for user_idx in sample_user_idxs:
    user_id = cust_ids[user_idx]
    print(f'\nРекомендации для пользователя {user_id} (idx={user_idx})')

    prod_indices_als, scores_als = als.recommend(
        user_idx, 
        user_items[user_idx], 
        N=3, 
        filter_already_liked_items=True
    )
    print('ALS топ-3:')
    for idx, score in zip(prod_indices_als, scores_als):
        prod_id = idx_to_prod.get(idx)
        desc = product_features.at[prod_id, 'description']
        print(f'  {prod_id} — {desc} (score={score:.3f})')

    prod_indices_bpr, scores_bpr = bpr.recommend(
        user_idx, 
        user_items[user_idx], 
        N=3, 
        filter_already_liked_items=True
    )
    print('BPR топ-3:')
    for idx, score in zip(prod_indices_bpr, scores_bpr):
        prod_id = idx_to_prod.get(idx)
        desc = product_features.at[prod_id, 'description']
        print(f'  {prod_id} — {desc} (score={score:.3f})')


Рекомендации для пользователя 12346.0 (idx=0)
ALS топ-3:
  48194 — DOORMAT HEARTS (score=0.601)
  48184 — DOOR MAT ENGLISH ROSE  (score=0.464)
  48129 — DOOR MAT TOPIARY (score=0.356)
BPR топ-3:
  C2 — CARRIAGE (score=0.325)
  85123A — WHITE HANGING HEART T-LIGHT HOLDER (score=0.300)
  23444 — Next Day Carriage (score=0.287)

Рекомендации для пользователя 12371.0 (idx=25)
ALS топ-3:
  23439 — HAND WARMER RED LOVE HEART (score=0.432)
  22699 — ROSES REGENCY TEACUP AND SAUCER  (score=0.368)
  21488 — RED WHITE SCARF  HOT WATER BOTTLE (score=0.364)
BPR топ-3:
  C2 — CARRIAGE (score=0.007)
  85123A — WHITE HANGING HEART T-LIGHT HOLDER (score=0.006)
  23444 — Next Day Carriage (score=0.006)

Рекомендации для пользователя 12396.0 (idx=50)
ALS топ-3:
  21080 — SET/20 RED SPOTTY PAPER NAPKINS  (score=0.448)
  84997B — RED 3 PIECE MINI DOTS CUTLERY SET (score=0.404)
  21094 — SET/6 RED SPOTTY PAPER PLATES (score=0.401)
BPR топ-3:
  C2 — CARRIAGE (score=0.142)
  85123A — WHITE HANGING HEART T-L

4. Оценка модели:\
o Оцените производительность модели, используя метрики NDCG,
precision@k и AUC для разных моделей.\
o Проверьте результаты как на обучающем, так и на тестовом наборах
данных

In [10]:
train_user_item, test_user_item = train_test_split(
    user_item, train_percentage=0.8
)

train_item_user = train_user_item.T.tocsr()

als_eval = implicit.als.AlternatingLeastSquares(
    factors=100,
    iterations=100
)
als_eval.fit(train_user_item)  

bpr_eval = implicit.bpr.BayesianPersonalizedRanking(
    factors=100,
    iterations=100
)

bpr_eval.fit(train_user_item)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [39]:
for model, name in [(als_eval, 'ALS'), (bpr_eval, 'BPR')]:
    p_tr = precision_at_k(model, train_user_item, train_user_item, K=5)
    nd_tr = ndcg_at_k(model, train_user_item, train_user_item, K=5)
    auc_tr = AUC_at_k(model, train_user_item, train_user_item, K=5305)

    p_te = precision_at_k(model, train_user_item, test_user_item,  K=5)
    nd_te = ndcg_at_k(model, train_user_item, test_user_item,  K=5)
    auc_te = AUC_at_k(model, train_user_item, test_user_item,  K=5305)

    print(f'{name}')
    print(f'train: precision@5 = {p_tr:.4f}, NDCG@5 = {nd_tr:.4f}, AUC@5305 = {auc_tr:.4f}')
    print(f'test: precision@5 = {p_te:.4f}, NDCG@5 = {nd_te:.4f}, AUC@5305 = {auc_te:.4f}')

  0%|          | 0/5904 [00:00<?, ?it/s]

  0%|          | 0/5904 [00:00<?, ?it/s]

  0%|          | 0/5904 [00:00<?, ?it/s]

  0%|          | 0/5555 [00:00<?, ?it/s]

  0%|          | 0/5555 [00:00<?, ?it/s]

  0%|          | 0/5555 [00:00<?, ?it/s]

ALS
train: precision@5 = 0.0000, NDCG@5 = 0.0000, AUC@5305 = 0.0000
test: precision@5 = 0.2615, NDCG@5 = 0.2613, AUC@5305 = 0.8450


  0%|          | 0/5904 [00:00<?, ?it/s]

  0%|          | 0/5904 [00:00<?, ?it/s]

  0%|          | 0/5904 [00:00<?, ?it/s]

  0%|          | 0/5555 [00:00<?, ?it/s]

  0%|          | 0/5555 [00:00<?, ?it/s]

  0%|          | 0/5555 [00:00<?, ?it/s]

BPR
train: precision@5 = 0.0000, NDCG@5 = 0.0000, AUC@5305 = 0.0000
test: precision@5 = 0.1572, NDCG@5 = 0.1577, AUC@5305 = 0.8649
