In [2]:
# !pip install lightfm

Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py): started
  Building wheel for lightfm (setup.py): finished with status 'done'
  Created wheel for lightfm: filename=lightfm-1.16-cp38-cp38-win_amd64.whl size=430460 sha256=dba121c6845fcf6977f3136083c90c01ab731c5f6899f207d4cc30ab6e5bd3c0
  Stored in directory: c:\users\andrey\appdata\local\pip\cache\wheels\ec\bb\51\9c487d021c1373b691d13cadca0b65b6852627b1f3f43550fa
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16


In [7]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
import pickle 
    
# Load the raw tables from the local data/ directory.
ratings = pd.read_csv('data/ratings.csv')  # used below through its user_id, book_id and rating columns
books = pd.read_csv('data/books.csv')  # book metadata; holds both book_id and goodreads_book_id
tags = pd.read_csv('data/tags.csv')  # full tag list; `tags` is re-read from tags_cleaned.csv further down
book_tags = pd.read_csv('data/book_tags.csv')  # book-to-tag links keyed by goodreads_book_id and tag_id

# Presumably the tag list after the manual clean-up described in the text below.
tags_cleaned = pd.read_csv('data/tags_cleaned.csv')

Если мы откроем файл с тегами tags.csv, то увидим, что там очень много неинформативных тегов:

В нашем случае теги очень сильно влияют на качество модели. Что я сделал:

Оставил около 500 наиболее популярных тегов.
Вручную сгруппировал оставшиеся теги.

In [45]:
#tags_cleaned.tag_name.unique()

In [2]:
# Lookup table: Goodreads book id -> the dataset's own book_id
mapper = {
    goodreads_id: book_id
    for goodreads_id, book_id in zip(books.goodreads_book_id, books.book_id)
}

Теперь применим этот словарь, чтобы добавить id книги в dataframe book_tags.

In [3]:
# Restrict the book/tag links to the tags listed in tags_cleaned.csv.
tags = pd.read_csv('data/tags_cleaned.csv')
# .copy() makes the filtered rows an independent frame, so adding a column
# below neither triggers SettingWithCopyWarning nor risks writing to a view.
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)].copy()
# Translate Goodreads ids into the dataset's book_id; an id missing from
# `mapper` raises KeyError instead of being silently dropped.
book_tags['id'] = book_tags.goodreads_book_id.apply(lambda x: mapper[x])

In [4]:
# Display the filtered book/tag links together with the new `id` column
book_tags

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27
5,1,11743,9954,27
6,1,14017,7169,27
10,1,27199,3857,27
...,...,...,...,...
999877,33288638,9886,10,8892
999879,33288638,3358,10,8892
999880,33288638,1679,10,8892
999889,33288638,1659,9,8892


Чтобы работать с моделями в библиотеке LightFM, нам нужно создать разреженные матрицы. Мы будем хранить данные в формате COO (координатный формат представления данных). Вместо хранения всех значений, включая нулевые, мы будем хранить только ненулевые. В COO данные представлены в виде троек (строка, столбец, значение).

In [5]:
# User x book matrix in COO form; the stored values are the ratings
ratings_coo = sparse.coo_matrix(
    (ratings.rating, (ratings.user_id, ratings.book_id))
)
# Book x tag indicator matrix: a 1 for every (book id, tag id) row of book_tags
feature_ratings = sparse.coo_matrix(
    (np.ones(len(book_tags), dtype=int), (book_tags.id, book_tags.tag_id))
)

In [6]:
# Display the sparse matrix summary (shape, dtype, number of stored elements)
ratings_coo

<53425x10001 sparse matrix of type '<class 'numpy.int64'>'
	with 5976479 stored elements in COOrdinate format>

In [17]:
# Helper constants used when training and evaluating the model.

# number of threads of our processor (passed as num_threads)
NUM_THREADS = 8 

# size of the latent vectors (passed to LightFM as no_components)
NUM_COMPONENTS = 30 

# number of training epochs
NUM_EPOCHS = 10 

In [18]:
# Split the interactions into a training part and a 20% test part.
# A fixed seed keeps the split identical between runs: with random_state=None
# every run reshuffles, so the reported scores cannot be reproduced and a model
# restored from disk may end up evaluated on interactions it was trained on.
train, test = random_train_test_split(
    ratings_coo,
    test_percentage=0.2,
    random_state=np.random.RandomState(42),
)

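На этапе создания модели мы используем библиотеку LightFM, чтобы сделать матричное разложение наших рейтингов книг и получить два набора векторов: для пользователей и для книг. В отличие от ALS, LightFM обучается стохастическим градиентным спуском; ниже используется функция потерь WARP.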

In [29]:
# Build the model: WARP loss, NUM_COMPONENTS-dimensional latent vectors
model = LightFM(no_components=NUM_COMPONENTS, loss='warp', learning_rate=0.05)

# Fit on the training interactions, describing each book by its tag features
model = model.fit(
    train,
    item_features=feature_ratings,
    epochs=NUM_EPOCHS,
    num_threads=NUM_THREADS,
)

https://arxiv.org/pdf/1507.08439.pdf

Задание 7.1
2/2 points (graded)
Соотнесите описание метрики с её названием. *Рекомендуемый товар — это товар, который рекомендует система пользователю. Релевантный товар — это товар, который пользователю действительно понравился и в обучающей/тестовой выборке у него высокий рейтинг.

precision at k = 
(число рекомендуемых товаров, которые релевантны, при условии, что мы рекомендуем ровно k вещей) / (число рекомендованных товаров)


recall at k =  
(число рекомендуемых товаров, которые релевантны, при условии, что мы рекомендуем ровно k вещей) / (число релевантных товаров)

In [12]:
# Load a previously saved model from disk.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
filename = 'model0.sav'
# The context manager closes the file handle even if unpickling fails;
# the original pickle.load(open(...)) left it open.
with open(filename, 'rb') as file:
    model = pickle.load(file)

In [13]:
# save the model to disk
# filename = 'model0.sav'
# pickle.dump(model, open(filename, 'wb'))

In [19]:
# Evaluate the model on the test interactions using the top 10 recommendations.
prec_score = precision_at_k(
    model,
    test,
    k=10,
    item_features=feature_ratings,
    num_threads=NUM_THREADS,
).mean()

# Store the result under its own name: assigning it to `recall_at_k` would
# replace the imported function, and re-running this cell would then fail.
recall_score = recall_at_k(
    model,
    test,
    k=10,
    item_features=feature_ratings,
    num_threads=NUM_THREADS,
).mean()

print(recall_score, prec_score)

0.03829674356016953 0.08309001


In [None]:
# Presumably the scores of an earlier run (recall@10, precision@10): 0.03736046416037497 0.08130851

In [20]:
# Extract the item biases and embeddings computed from the tag features

item_biases, item_embeddings = model.get_item_representations(features=feature_ratings)

In [21]:
# https://github.com/nmslib/nmslib
# !pip install nmslib

Collecting nmslib
  Downloading nmslib-2.1.1-cp38-cp38-win_amd64.whl (661 kB)
Collecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
Installing collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [22]:
import nmslib
 
# Create the search graph (HNSW method, cosine-similarity space)
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
 
# Add the book embeddings to the graph, then build the index
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

In [23]:
# Helper for querying the nearest-neighbour graph
def nearest_books_nms(book_id, index, n=10, embeddings=None):
    """Return the ``n`` nearest neighbours of a book in a similarity index.

    Parameters
    ----------
    book_id : int
        Row of the embedding matrix to use as the query vector.
    index : object
        Search index exposing ``knnQuery(vector, k=...)``.
    n : int, default 10
        Number of neighbours to request.
    embeddings : indexable, optional
        Embedding matrix the query vector is taken from. Defaults to the
        notebook-level ``item_embeddings``, which keeps existing calls working
        while letting the function be used without that global.

    Returns
    -------
    Whatever ``index.knnQuery`` returns; the notebook reads element ``[0]``
    of it as the ids of the neighbouring books.
    """
    if embeddings is None:
        # Fall back to the global the original implementation always used.
        embeddings = item_embeddings
    return index.knnQuery(embeddings[book_id], k=n)

9. Проверяем модель

Давайте попробуем написать рекомендации к какой-нибудь книге. Например, к роману «1984» Джорджа Оруэлла.

In [24]:
# Literal substring search in the title; rows without a title never match
books[books.original_title.str.contains('1984', regex=False, na=False)].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...


Видим, что у книги id — 846.

Ищем похожие книги.

In [26]:
# Element [0] of the query result is used as the ids of the nearest books
nbm = nearest_books_nms(846, nms_idx)[0]

# Show the similar books. The isin mask returns the rows in the order of
# `books`, not ranked by similarity.
books.loc[books.book_id.isin(nbm)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
13,14,7613,7613,2207778,896,452284244,9780452000000.0,George Orwell,1945.0,Animal Farm: A Fairy Story,...,1881700,1982987,35472,66854,135147,433432,698642,648912,https://images.gr-assets.com/books/1424037542m...,https://images.gr-assets.com/books/1424037542s...
47,48,4381,4381,1272463,507,307347974,9780307000000.0,Ray Bradbury,1953.0,Fahrenheit 451,...,570498,1176240,30694,28366,64289,238242,426292,419051,https://images.gr-assets.com/books/1351643740m...,https://images.gr-assets.com/books/1351643740s...
54,55,5129,5129,3204877,515,60929871,9780061000000.0,Aldous Huxley,1932.0,Brave New World,...,1022601,1079135,20095,26367,60328,219895,389379,383166,https://images.gr-assets.com/books/1487389574m...,https://images.gr-assets.com/books/1487389574s...
78,79,1381,1381,3356006,1703,143039954,9780143000000.0,"Homer, Robert Fagles, E.V. Rieu, Frédéric Mugl...",-720.0,Ὀδύσσεια,...,670326,710757,8101,29703,65629,183082,224120,208223,https://images.gr-assets.com/books/1390173285m...,https://images.gr-assets.com/books/1390173285s...
340,341,1371,1371,3293141,1726,140275363,9780140000000.0,"Homer, Robert Fagles, Frédéric Mugler, Bernard...",-750.0,Ἰλιάς,...,241088,273565,4763,7701,20845,68844,89384,86791,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
374,375,1852,1852,3252320,1384,439227143,9780439000000.0,Jack London,1903.0,The Call of the Wild,...,223932,248795,6770,6366,16636,62853,90382,72558,https://images.gr-assets.com/books/1452291694m...,https://images.gr-assets.com/books/1452291694s...
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...
976,977,15645,15645,2377563,856,812970063,9780813000000.0,"Dante Alighieri, Anthony M. Esolen",1320.0,Inferno,...,87511,109200,3576,1930,6267,23308,37680,40015,https://images.gr-assets.com/books/1333579470m...,https://images.gr-assets.com/books/1333579470s...
2141,2142,1375,1375,1474309,255,147712556,9780148000000.0,"Homer, Robert Fagles, Bernard Knox",-762.0,Ἰλιάς ; Ὀδύσσεια,...,47825,51098,537,916,2608,10439,17404,19731,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
8139,8140,5481,5481,16335101,80,60898526,9780061000000.0,Aldous Huxley,1958.0,Brave New World Revisited,...,11073,12286,714,231,691,2765,4567,4032,https://images.gr-assets.com/books/1410136964m...,https://images.gr-assets.com/books/1410136964s...


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


208     The Silence of the Lambs
430                   Red Dragon
767               Shutter Island
981                 The Alienist
1484            The Black Dahlia
1801                    Hannibal
3405              The Dante Club
4421             Hannibal Rising
5312               A Simple Plan
6495       The Angel of Darkness
Name: original_title, dtype: object

In [37]:
# Save the model to disk.
# Dump into the handle opened by `with`: the original wrote to a second,
# never-closed open(filename, 'wb'), which left 'model0.pickle' empty.
with open('model0.pickle', 'wb') as file:
    pickle.dump(model, file, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
# Save the item embeddings to disk; they are read back from this file further down
with open('item_embeddings.pickle', 'wb') as file:
    pickle.dump(item_embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)

# start here

literature
https://www.youtube.com/watch?v=9gBC9R-msAk

    https://github.com/nmslib/nmslib
    
        https://aws.amazon.com/ru/about-aws/whats-new/2020/03/build-k-nearest-neighbor-similarity-search-engine-with-amazon-elasticsearch-service
        
            https://www.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/
            
                http://building-babylon.net/2016/01/26/metadata-embeddings-for-user-and-item-cold-start-recommendations/
                
                    https://github.com/nmslib/nmslib
                    
                        https://arxiv.org/pdf/1908.08328.pdf
                        

In [42]:
# Load the model from disk.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open('model0.pickle', 'rb') as file:
    model = pickle.load(file)

In [41]:
# Load the item embeddings (not the model) from disk.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open('item_embeddings.pickle', 'rb') as file:
    item_embeddings = pickle.load(file)

In [43]:
import nmslib
 
# Create the search graph (HNSW method, cosine-similarity space)
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
 
# Add the reloaded book embeddings to the graph, then build the index
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

In [44]:
# Literal substring search in the title; rows without a title never match
books[books.original_title.str.contains('The Silence of the Lambs', regex=False, na=False)].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


In [45]:
# Element [0] of the query result is used as the ids of the nearest books
nbm = nearest_books_nms(209, nms_idx)[0]

# Show the titles of the similar books. The isin mask returns them in the
# order of `books`, not ranked by similarity.
books.loc[books.book_id.isin(nbm), 'original_title']

208     The Silence of the Lambs
430                   Red Dragon
767               Shutter Island
981                 The Alienist
1484            The Black Dahlia
1801                    Hannibal
3405              The Dante Club
4421             Hannibal Rising
5312               A Simple Plan
6495       The Angel of Darkness
Name: original_title, dtype: object