# <center> Разбор кейса ML-инженера </center>

## Обучим и протестируем модель

In [1]:
#!pip install lightfm

In [2]:
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
import pickle

In [3]:
# Read the four zipped CSV tables from data/ in one pass; the files are
# opened in the order listed and bound to the matching names.
ratings, books, tags, book_tags = map(
    pd.read_csv,
    (
        'data/ratings.csv.zip',
        'data/books.csv.zip',
        'data/tags.csv.zip',
        'data/book_tags.csv.zip',
    ),
)

In [4]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [5]:
# Lookup table from goodreads_book_id to book_id, built from the books table.
mapper = {
    goodreads_id: internal_id
    for goodreads_id, internal_id in zip(books.goodreads_book_id, books.book_id)
}

In [6]:
# Swap the raw tag table for the cleaned one; the tag table loaded earlier
# is not used before this point.
tags = pd.read_csv('data/tags_cleaned.csv.zip')

# Keep only the book-tag links whose tag is present in the cleaned table.
# .copy() makes the filtered frame independent of the original, so adding a
# column below no longer triggers pandas' SettingWithCopyWarning.
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)].copy()

# Add the books.book_id that corresponds to each goodreads_book_id.
# An explicit lookup (rather than .map(mapper)) keeps the fail-fast KeyError
# for ids missing from `mapper` instead of silently producing NaN.
book_tags['id'] = book_tags.goodreads_book_id.map(
    lambda goodreads_id: mapper[goodreads_id]
)

In [7]:
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27
5,1,11743,9954,27
6,1,14017,7169,27
10,1,27199,3857,27


In [8]:
# Interaction matrix: rows are user_id, columns are book_id, values are the
# ratings. No shape is passed, so it is inferred from the largest indices.
ratings_coo = sparse.coo_matrix(
    (ratings.rating, (ratings.user_id, ratings.book_id))
)

# Item-feature matrix: rows are book_tags.id, columns are tag_id, and every
# listed (book, tag) pair contributes the value 1.
feature_ratings = sparse.coo_matrix(
    ([1] * len(book_tags), (book_tags.id, book_tags.tag_id))
)

Объявим вспомогательные константы для обучения модели:

In [9]:
# Number of CPU threads (passed as num_threads in the training/evaluation cells)
NUM_THREADS = 4
# Number of components in each embedding vector (passed as no_components)
NUM_COMPONENTS = 60
# Number of training epochs
NUM_EPOCHS = 10 
# Seed for the random number generator
RANDOM_STATE = 42

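На этапе создания модели мы используем библиотеку LightFM, чтобы сделать матричное разложение (факторизацию) наших рейтингов книг и получить два набора векторов. Обратите внимание: LightFM обучается стохастическим градиентным спуском (здесь — с функцией потерь WARP), а не методом ALS. 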

In [10]:
# %%time
# #Разбиваем наш датасет на обучающую и тестовую выборки
# train, test = random_train_test_split(ratings_coo, test_percentage=0.2, random_state=RANDOM_STATE)

# #Создаём модель
# model = LightFM(
#                 learning_rate=0.05, #темп (скорость) обучения
#                 loss='warp', #loss-функция
#                 no_components=NUM_COMPONENTS,#размерность вектора признаков
#                 random_state=RANDOM_STATE #генератор случайных чисел
#                 )

# #Обучаем модель
# model = model.fit(
#                   train, #обучающая выборка
#                   epochs=NUM_EPOCHS, #количество эпох обучения
#                   num_threads=NUM_THREADS, #количество потоков процессора
#                   item_features=feature_ratings #признаки товаров (рейтинги книг)
#                   )

Протестируем модель

In [11]:
# %%time
# #Тестируем нашу модель
# precision_score = precision_at_k(
#                                  model, #модель
#                                  test, #тестовая выборка
#                                  num_threads=NUM_THREADS, #количество потоков процессора
#                                  k=10, #количество предложений
#                                  item_features=feature_ratings #признаки товаров
#                                  ).mean() #усредняем результаты
 
# recall_score = recall_at_k(
#                            model, #модель
#                            test, #тестовая выборка
#                            num_threads=NUM_THREADS, #количество потоков процессора
#                            k=10, #количество предложений
#                            item_features=feature_ratings #признаки товаров
#                            ).mean() #усредняем результаты

# print(recall_score, precision_score)

Сохраним модель

In [12]:
# with open('model.pkl', 'wb') as file:
#     pickle.dump(model, file, protocol=pickle.HIGHEST_PROTOCOL)

## Добавим эмбеддинги к модели и посмотрим, что получилось

In [13]:
# Load the previously trained model from model.pkl (the cells that train and
# pickle it are commented out above, so the file must already exist).
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load a model.pkl that you produced yourself or otherwise trust.
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

In [14]:
# Extract the item biases and item embeddings from the model, using the same
# item-feature matrix that is passed as item_features in the training cell
item_biases, item_embeddings = model.get_item_representations(features=feature_ratings)

# Sanity check of the shapes: one bias and one embedding row per item
print(item_biases.shape, item_embeddings.shape)

(10001,) (10001, 60)


In [15]:
item_biases

array([0.        , 4.840424  , 4.7281427 , ..., 0.4853786 , 1.1874219 ,
       0.01927405], dtype=float32)

In [16]:
item_embeddings

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.3588437 , -0.39306548, -0.4481972 , ...,  1.3005738 ,
         0.78044146, -0.06962232],
       [-1.7334272 , -0.21814057, -0.9686587 , ...,  1.3776436 ,
         0.2857782 ,  0.06638955],
       ...,
       [-0.81708235, -0.25210136,  0.19621253, ...,  1.3499578 ,
         0.35253292, -0.06085279],
       [-0.59409636, -0.02994887, -1.0355613 , ...,  1.6425117 ,
         0.21685681,  0.03149126],
       [-0.4253026 ,  0.0124193 , -0.18086599, ...,  0.98617804,
         0.6333476 , -0.02541547]], dtype=float32)

In [17]:
#!pip install nmslib

In [18]:
import nmslib

In [19]:
# Initialise the search index: HNSW method with the 'cosinesimil' space
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
 
# Add all item embeddings to the index (one data point per embedding row),
# then build it; print_progress=True prints the progress bar shown below
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************


In [20]:
# Helper that looks up the nearest neighbours of a book in the search index
def nearest_books_nms(book_id, index, n=10, embeddings=None):
    """Return the ``n`` nearest neighbours of a book according to ``index``.

    Args:
        book_id: Position of the query book's vector in ``embeddings``.
        index: Object exposing ``knnQuery(vector, k=...)``; in this notebook
            the nmslib index built from ``item_embeddings``.
        n: Number of neighbours to request from the index.
        embeddings: Indexable collection of item vectors. Defaults to the
            notebook-level ``item_embeddings``, so existing calls that pass
            only ``book_id``, ``index`` and optionally ``n`` behave exactly
            as before.

    Returns:
        Whatever ``index.knnQuery`` returns. For the nmslib index used in
        this notebook that is an ``(ids, distances)`` pair of arrays; in the
        outputs shown here the query book itself comes first with distance
        0, so ``n=10`` yields nine other books.
    """
    if embeddings is None:
        # Backward-compatible default: fall back to the notebook global
        # instead of forcing every caller to pass the matrix explicitly.
        embeddings = item_embeddings
    return index.knnQuery(embeddings[book_id], k=n)

Найдем id книги 1984

In [21]:
# Show the books whose title contains the substring "1984" (case-insensitive).
# The .str accessor replaces the per-row lambda, na=False treats a missing
# title as "no match" instead of raising, and a single .loc call selects rows
# and columns together instead of chaining two indexing operations.
books.loc[
    books['title'].str.lower().str.contains('1984', regex=False, na=False),
    ['book_id', 'title'],
]

Unnamed: 0,book_id,title
12,13,1984
845,846,Animal Farm / 1984
9795,9796,A Kiss for Little Bear (An I Can Read Book) by...


In [22]:
# Show the books whose title contains the substring "animal" (case-insensitive).
# The .str accessor replaces the per-row lambda, na=False treats a missing
# title as "no match" instead of raising, and a single .loc call selects rows
# and columns together instead of chaining two indexing operations.
books.loc[
    books['title'].str.lower().str.contains('animal', regex=False, na=False),
    ['book_id', 'title'],
]

Unnamed: 0,book_id,title
13,14,Animal Farm
845,846,Animal Farm / 1984
1020,1021,"Animal, Vegetable, Miracle: A Year of Food Life"
1751,1752,Animal Dreams
2289,2290,Eating Animals
4637,4638,"My Family and Other Animals (Corfu Trilogy, #1)"
5397,5398,"Fables, Vol. 2: Animal Farm"
5417,5418,"The Social Animal: The Hidden Sources of Love,..."
5764,5765,The Third Chimpanzee: The Evolution and Future...
6701,6702,Animalia


In [23]:
# Title of the book with book_id 846, selected with one .loc call
books.loc[books['book_id'] == 846, 'title']

845    Animal Farm / 1984
Name: title, dtype: object

In [24]:
# Title of the book with book_id 6702, selected with one .loc call
books.loc[books['book_id'] == 6702, 'title']

6701    Animalia
Name: title, dtype: object

Теперь найдем все похожие книги и посмотрим на них

In [25]:
# Query the index for the nearest neighbours of both books and print the
# raw results, one per line
for query_book_id in (846, 6702):
    print(nearest_books_nms(query_book_id, nms_idx))

(array([ 846,   55,   14,  809,   48,   13,  903,  173,  271, 8140],
      dtype=int32), array([0.        , 0.03338528, 0.04415029, 0.05182219, 0.05777216,
       0.0650335 , 0.0756126 , 0.07580829, 0.0850451 , 0.09076774],
      dtype=float32))
(array([6702,  102, 3698,   50,  278,  888, 6724, 3047, 5071, 3274],
      dtype=int32), array([0.        , 0.05087888, 0.05495286, 0.05724448, 0.06035101,
       0.06153738, 0.06178409, 0.06611544, 0.06732404, 0.06790745],
      dtype=float32))


In [26]:
# Keep only the ids of the recommended books: element 0 of the returned pair
# (element 1 holds the distances, as the printed output above shows)
nbm_1 = nearest_books_nms(846, nms_idx)[0]
nbm_1

array([ 846,   55,   14,  809,   48,   13,  903,  173,  271, 8140],
      dtype=int32)

In [27]:
# Ids of the books recommended for book 6702 (element 0 of the returned pair)
nbm_2 = nearest_books_nms(6702, nms_idx)[0]
nbm_2

array([6702,  102, 3698,   50,  278,  888, 6724, 3047, 5071, 3274],
      dtype=int32)

In [28]:
# Authors and titles of the books recommended for book 846.
# Rows come back in the order of `books`, not in similarity order.
books.loc[books.book_id.isin(nbm_1), ['authors', 'title']]

Unnamed: 0,authors,title
12,"George Orwell, Erich Fromm, Celâl Üster",1984
13,George Orwell,Animal Farm
47,Ray Bradbury,Fahrenheit 451
54,Aldous Huxley,Brave New World
172,Anthony Burgess,A Clockwork Orange
270,Daniel Keyes,Flowers for Algernon
808,"Aldous Huxley, Christopher Hitchens",Brave New World / Brave New World Revisited
845,"George Orwell, Christopher Hitchens",Animal Farm / 1984
902,Ayn Rand,Anthem
8139,Aldous Huxley,Brave New World Revisited


In [29]:
# Authors and titles of the books recommended for book 6702.
# Rows come back in the order of `books`, not in similarity order.
books.loc[books.book_id.isin(nbm_2), ['authors', 'title']]

Unnamed: 0,authors,title
49,Shel Silverstein,Where the Sidewalk Ends
101,Maurice Sendak,Where the Wild Things Are
277,Shel Silverstein,A Light in the Attic
887,Crockett Johnson,Harold and the Purple Crayon
3046,Dr. Seuss,Fox in Socks
3273,Dr. Seuss,Hop On Pop
3697,Blanche Fisher Wright,The Real Mother Goose
5070,"Julia Donaldson, Axel Scheffler",The Gruffalo
6701,Graeme Base,Animalia
6723,Eric Carle,The Mixed-Up Chameleon


Сохраним эмбеддинги

In [30]:
# Persist the item embedding matrix to item_embeddings.pkl using the highest
# pickle protocol available in the running interpreter
with open('item_embeddings.pkl', 'wb') as file:
    pickle.dump(item_embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)

In [31]:
!python --version

Python 3.8.20


In [32]:
# scipy==1.10.1
# lightfm==1.17
# nmslib==2.1.1