# Вебинар 5. Ранжирование и Гибридные рекоммендательные системы

# 2. LightFM

In [1]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m163.8/316.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=808329 sha256=deb2b912a789b4f2d5482b62289beab3b7b5314da3e76c5e0cd145d367ddadb3
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [2]:
!pip install implicit==0.6.2

Collecting implicit==0.6.2
  Downloading implicit-0.6.2-cp310-cp310-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [6]:
from utilis import prefilter_items, prefilter_items_v2

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

In [24]:
data = pd.read_csv('retail_train.csv')

item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')


# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]


data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [25]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [26]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


## 1. Filter items

In [27]:
n_items_before_train = data_train['item_id'].nunique()
n_items_before_test = data_test['item_id'].nunique()
data_train = prefilter_items(data_train, take_n=5000)
data_test= prefilter_items(data_test, take_n=5000)

n_items_after_train = data_train['item_id'].nunique()
n_items_after_test = data_test['item_id'].nunique()
print('Decreased # items from {} to {} for train'.format(n_items_before_train, n_items_after_train))
print('Decreased # items from {} to {} for test'.format(n_items_before_test, n_items_after_test))

Decreased # items from 50645 to 5001 for train
Decreased # items from 21810 to 5001 for test


## 2. Prepare data set

In [160]:
user_item_matrix = pd.pivot_table(data_train,
                                  index='user_id', columns='item_id',
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count',
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit
#user_item_matrix[user_item_matrix > 0] = 1 # важен сам факт наличия покупки а не количество покупок

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(2)

item_id,202291,397896,420647,480014,818980,818981,819063,819255,819302,819304,...,12384365,12384657,12384775,12385340,12425418,12428017,12428436,12524245,12524690,12811532
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


In [161]:
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]
test_user_item_matrix = pd.pivot_table(data_test,
                                  index='user_id', columns='item_id',
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count',
                                  fill_value=0
                                 )
#test_user_item_matrix[test_user_item_matrix > 0] = 1
test_user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit
test_user_item_matrix.head(2)

item_id,202291,397896,420647,480014,818980,818981,819063,819255,819302,819304,...,12384365,12384657,12384775,12385340,12425418,12428017,12428436,12524245,12524690,12811532
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


**Словари связывающие реальные данные с порядковыми номерами np матрицы**

In [162]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

## 3. Prepare user and item features

In [163]:
user_feat = pd.DataFrame(user_item_matrix.index)
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat.set_index('user_id', inplace=True)

item_feat = pd.DataFrame(user_item_matrix.columns)
item_feat = item_feat.merge(item_features, on='item_id', how='left')
item_feat.set_index('item_id', inplace=True)

user_feat.head(2)

Unnamed: 0_level_0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,65+,A,35-49K,Homeowner,2 Adults No Kids,2.0,None/Unknown
2,,,,,,,


In [164]:
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

In [165]:
user_feat_lightfm.head(2)

Unnamed: 0_level_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,marital_status_code_A,marital_status_code_B,marital_status_code_U,income_desc_100-124K,...,hh_comp_desc_Unknown,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Ручной решетчатый поиск

In [108]:
%%time
loss_list = ['bpr', 'warp']
components_list = [40, 60, 100]
metrics_dict = {}
for l in loss_list:
  for c in components_list:
      model = LightFM(no_components=c,
                    loss=l,
                    learning_rate=0.01,
                    item_alpha=0.4,
                    user_alpha=0.1,
                    random_state=42,
                    k=5,
                    n=15,
                    max_sampled=100)

      model.fit(sparse_user_item,  # user-item matrix из 0 и 1
              sample_weight=coo_matrix(user_item_matrix),
              user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
              item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
              epochs=15,
              num_threads=4,
              verbose=False)

      test_precision = precision_at_k(model, csr_matrix(test_user_item_matrix).tocsr(),
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()

      metrics_dict[l,c] = test_precision
print(metrics_dict)



{('bpr', 40): 0.0044871797, ('bpr', 60): 0.12443912, ('bpr', 100): 0.005048077, ('warp', 40): 0.24591348, ('warp', 60): 0.20208335, ('warp', 100): 0.19831732}
CPU times: user 9min 22s, sys: 641 ms, total: 9min 23s
Wall time: 6min 14s


In [109]:
print(sorted(zip(metrics_dict.values(),metrics_dict.keys()),reverse = True)[0])

(0.24591348, ('warp', 40))


In [166]:
model = LightFM(no_components=40,
                    loss='warp',
                    learning_rate=0.01,
                    item_alpha=0.4,
                    user_alpha=0.1,
                    random_state=42,
                    k=5,
                    n=15,
                    max_sampled=100)


In [178]:
model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
              sample_weight=coo_matrix(user_item_matrix),
              user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
              item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
              epochs=15,
              num_threads=4,
              verbose=True)

Epoch: 100%|██████████| 15/15 [00:28<00:00,  1.88s/it]


<lightfm.lightfm.LightFM at 0x786034b1bca0>

In [142]:
user_emb = model.get_user_representations(features=csr_matrix(user_feat_lightfm.values).tocsr())

In [143]:
user_emb, user_emb[0].shape, user_emb[1].shape

((array([-0.315238  ,  0.        ,  0.        , ..., -0.32957774,
         -0.24568236,  0.        ], dtype=float32),
  array([[-1.1861866e-05,  2.8043104e-08, -1.0252613e-08, ...,
           1.1703054e-09, -4.0857401e-07, -2.1954282e-08],
         [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
           0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
         [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
           0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
         ...,
         [-1.3060608e-05,  3.0310442e-08, -1.0811830e-08, ...,
           1.2957356e-09, -4.5754382e-07, -2.4312836e-08],
         [-7.7045473e-07,  6.2416333e-10,  3.0542027e-10, ...,
          -1.0749357e-10,  3.1721658e-08,  1.0876396e-09],
         [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
           0.0000000e+00,  0.0000000e+00,  0.0000000e+00]], dtype=float32)),
 (2496,),
 (2496, 40))

In [144]:
item_emb = model.get_item_representations(features=csr_matrix(item_feat_lightfm.values).tocsr())

In [145]:
item_emb, item_emb[0].shape, item_emb[1].shape

((array([-3.0758238e-06,  3.8018410e-05, -3.0758238e-06, ...,
          2.0893502e-05,  1.0128184e-05, -1.3325303e-05], dtype=float32),
  array([[ 2.3089711e-07, -5.1236260e-10,  1.7356631e-10, ...,
          -1.6733238e-11,  7.6090032e-09,  4.2742504e-10],
         [ 2.2264797e-07, -4.8886872e-10,  1.6517118e-10, ...,
          -1.5879792e-11,  7.4326181e-09,  4.0222475e-10],
         [ 2.3089711e-07, -5.1236260e-10,  1.7356631e-10, ...,
          -1.6733238e-11,  7.6090032e-09,  4.2742504e-10],
         ...,
         [ 1.4686863e-07, -3.5726636e-10,  1.0657293e-10, ...,
          -1.0707986e-11,  4.8391007e-09,  2.8082553e-10],
         [ 3.0179990e-07, -5.6501903e-10,  1.7133021e-10, ...,
          -1.7700234e-11,  7.4327584e-09,  4.3794135e-10],
         [ 1.0190236e-07, -1.3126007e-10,  6.9719480e-11, ...,
          -1.2078865e-11,  1.5502727e-09,  1.1379999e-10]], dtype=float32)),
 (5001,),
 (5001, 40))

In [197]:
items_trst = np.array(range(5001))
predictions = model.predict(user_ids=2495, item_ids=items_trst,
                            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                            num_threads=4)
predictions

array([-0.01544058,  0.2394703 , -0.01544058, ..., -0.08251069,
       -0.08058992,  0.06210327], dtype=float32)

In [194]:
items_trst = np.array(range(5001))
predictions = model.predict(user_ids=900, item_ids=items_trst,
                            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                            num_threads=4)
predictions
predictions.argsort()[::-1][:5]
[id_to_itemid[ind] for ind in predictions.argsort()[::-1][:5]]

[1082185, 1055646, 854852, 979707, 1033142]

In [193]:
items_trst = np.array(range(5001))
predictions = model.predict(user_ids=0, item_ids=items_trst,
                            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                            num_threads=4)
predictions
predictions.argsort()[::-1][:5]
[id_to_itemid[ind] for ind in predictions.argsort()[::-1][:5]]

[1082185, 1055646, 854852, 981760, 840361]

In [180]:
predictions.argsort()[::-1][:5]
[id_to_itemid[ind] for ind in predictions.argsort()[::-1][:5]]

[1082185, 1055646, 854852, 979707, 1033142]

In [181]:
predictions = model.predict(user_ids=2495, item_ids=items_trst,
                            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                            num_threads=4)
predictions

array([-0.01544058,  0.2394703 , -0.01544058, ..., -0.08251069,
       -0.08058992,  0.06210327], dtype=float32)

In [136]:
%%time
items_trst = np.array([el for el in id_to_itemid.keys()])
users = [int(el) for el in userids]

result = {}
for user in range(5):
  predictions = model.predict(user_ids=user, item_ids=items_trst,
                            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                            num_threads=4)
  result[user] = [id_to_itemid[ind] for ind in predictions.argsort()[::-1][:5]]

CPU times: user 434 ms, sys: 2.02 ms, total: 436 ms
Wall time: 437 ms


In [137]:
result

{0: [1082185, 981760, 840361, 1055646, 854852],
 1: [1082185, 981760, 840361, 1055646, 854852],
 2: [1082185, 981760, 840361, 1055646, 854852],
 3: [1082185, 981760, 840361, 1055646, 854852],
 4: [1082185, 981760, 840361, 1055646, 854852]}

In [None]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[856942, 986947, 1082185, 859191, 861272, 9343..."
1,2,"[847241, 904236, 911974, 940947, 957013, 10012..."
