In [1]:
from scipy.spatial.distance import cdist
import numpy as np

import numpy as np
import scipy
import pandas as pd

from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from sklearn.model_selection import train_test_split
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

  from .autonotebook import tqdm as notebook_tqdm


#### User-based фильтрация

In [3]:
demo_data = np.array([[5, 4, 4, 3, 5], [3, 4, 2, 5, 3], [2, 1, 2, 2, 5], [2, 0, 3, 1, 3],])
misha = np.array([[0, 4, 4, 4, 5]])
cdist(demo_data, misha, metric='cosine')

array([[0.15342203],
       [0.12999789],
       [0.14560384],
       [0.24345225]])

In [4]:
(0.13/0.27)*5

2.4074074074074074

In [5]:
(0.14/0.27)*5

2.5925925925925926

#### Матричная факторизация и факторизационные машины

In [4]:
ratings = pd.read_csv("data/ml-100k/u.data", sep="\t", header=None)
ratings.columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings.sort_values('timestamp', inplace=True)
ratings['score'] = (ratings['rating'] > 2).apply(int)

In [5]:
train, test = train_test_split(ratings, test_size=0.2, shuffle=False)

In [6]:
train_pivot = pd.pivot_table(
    train,
    index="user_id", 
    columns="item_id", 
    values="score"
)
test_pivot = pd.pivot_table(
    test,
    index="user_id", 
    columns="item_id", 
    values="score"
)

print(train_pivot.shape)
print(test_pivot.shape)

(751, 1616)
(301, 1448)


In [7]:
shell = pd.pivot_table(
    ratings, 
    index="user_id", 
    columns="item_id", 
    values="score", 
    aggfunc=lambda x: 0
)
shell.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,0.0,,,,,,,,,0.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.0,0.0,,,,,,,,,...,,,,,,,,,,


In [8]:
train_pivot = shell + train_pivot
test_pivot = shell + test_pivot

train_pivot = (train_pivot + 1).fillna(0)
test_pivot = (test_pivot + 1).fillna(0)
print(train_pivot.shape)
print(test_pivot.shape)
## (943, 1682)
## (943, 1682)

train_pivot.head()

(943, 1682)
(943, 1682)


item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,2.0,2.0,2.0,0.0,2.0,2.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
train_pivot_sparse = scipy.sparse.csr_matrix(train_pivot.values)
test_pivot_sparse = scipy.sparse.csr_matrix(test_pivot.values)

In [17]:
model = AlternatingLeastSquares(factors=10, random_state=42)
model.fit(train_pivot_sparse)

100%|██████████| 15/15 [00:00<00:00, 86.34it/s]


In [18]:
unique_items = np.array(train_pivot.columns)
user_id = 14
recomendations_ids, scores = model.recommend(user_id, train_pivot_sparse[user_id])
recomendations = unique_items[recomendations_ids]
print('Recomendations ids: {}'.format(recomendations_ids))
print('Recomendations for user {}: {}'.format(user_id, recomendations))

Recomendations ids: [293 116 244 275 287  99 283 312 596 150]
Recomendations for user 14: [294 117 245 276 288 100 284 313 597 151]


In [None]:
map_at10 = mean_average_precision_at_k(model, train_pivot_sparse, test_pivot_sparse, K=10)
print('Mean Average Precision at 10: {:.3f}'.format(map_at10))

In [13]:
model = LightFM(no_components=10, loss='logistic', random_state=42)
model.fit(train_pivot_sparse, epochs=30)

<lightfm.lightfm.LightFM at 0x1b089a4f2d0>

Чтобы получить сами рекомендации, необходимо умножить эти веса на -1 и отсортировать их индексы по возрастанию веса. Нам нужны будут только индексы 10 фильмов с наибольшим по модулю весом. Обратившись по полученным индексам к списку идентификаторов фильмов, мы получим рекомендации для конкретного пользователя:

In [14]:
item_ids = np.arange(0, train_pivot_sparse.shape[1])
list_pred = model.predict(user_id, item_ids)
recomendations_ids = np.argsort(-list_pred)[:10]
recomendations = unique_items[recomendations_ids]
print('Recomendations for user {}: {}'.format(user_id, recomendations))

Recomendations for user 14: [ 50 294 258 100 181 288 286   1 300 121]


In [15]:
map_at10 = precision_at_k(model, test_pivot_sparse, k=10).mean()
print('Mean Average Precision at 10: {:.2f}'.format(map_at10))

Mean Average Precision at 10: 0.32
