In [1]:
from pathlib import Path

import nmslib
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

In [2]:
# Number of CPU threads passed to LightFM training and evaluation
# (originally set to match the author's processor).
NUM_THREADS = 8 

# Dimensionality of the latent vectors (LightFM `no_components`).
NUM_COMPONENTS = 30 

# Number of training epochs.
NUM_EPOCHS = 10 

In [3]:
# Load the raw tables from ./data, resolved against the current working directory.
ratings = pd.read_csv(Path.cwd().joinpath('data', 'ratings.csv'))
books = pd.read_csv(Path.cwd().joinpath('data', 'books.csv'))
# NOTE(review): `tags` is read again from tags_cleaned.csv in a later cell,
# so this raw load looks redundant -- confirm before removing.
tags = pd.read_csv(Path.cwd().joinpath('data', 'tags.csv'))
book_tags = pd.read_csv(Path.cwd().joinpath('data', 'book_tags.csv'))

In [4]:
# Lookup table: goodreads_book_id -> book_id (both columns of books.csv).
mapper = books.set_index('goodreads_book_id')['book_id'].to_dict()

In [5]:
# Replace the raw tag table with the cleaned one and keep only book/tag links
# whose tag is still present in it.
tags = pd.read_csv(Path.cwd() / 'data' / 'tags_cleaned.csv')

# .copy() detaches the filtered rows from the original frame, so adding a column
# below is an unambiguous write (no SettingWithCopyWarning, no view/copy doubt).
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)].copy()

# Translate Goodreads ids to book_id. A plain dict lookup is used on purpose:
# an id missing from `mapper` raises KeyError instead of silently becoming NaN.
book_tags['id'] = book_tags.goodreads_book_id.map(mapper.__getitem__)

In [6]:
# Ids are used directly as row/column indices, so every dimension is max id + 1.
# The shapes are stated explicitly: letting coo_matrix infer them from the largest
# index present would make the item dimension of the two matrices disagree whenever
# the highest book_id has ratings but no remaining tags (or the other way round).
n_users = int(ratings.user_id.max()) + 1
n_items = int(max(ratings.book_id.max(), book_tags.id.max())) + 1
n_tags = int(book_tags.tag_id.max()) + 1

# user x item matrix holding the rating values.
ratings_coo = sparse.coo_matrix(
    (ratings.rating, (ratings.user_id, ratings.book_id)),
    shape=(n_users, n_items),
)

# item x tag indicator matrix (1 where the book carries the tag).
# float32 is the dtype LightFM documents for feature matrices.
feature_ratings = sparse.coo_matrix(
    (np.ones(len(book_tags), dtype=np.float32), (book_tags.id, book_tags.tag_id)),
    shape=(n_items, n_tags),
)

In [7]:
# Sanity check: rows are indexed by book id, columns by tag id.
feature_ratings.shape

(10001, 33269)

In [9]:
# Hybrid matrix-factorisation model trained with the WARP ranking loss.
# random_state is fixed so that parameter initialisation and sampling start from
# the same seed on every run.
# NOTE(review): multi-threaded training may still not be bit-for-bit repeatable.
model = LightFM(
    learning_rate=0.05,
    loss='warp',
    no_components=NUM_COMPONENTS,
    random_state=42,
)

In [10]:
# Hold out 20% of the interactions for evaluation. The seed is fixed so the split,
# and therefore the reported metrics, can be reproduced after a kernel restart.
train, test = random_train_test_split(
    ratings_coo,
    test_percentage=0.2,
    random_state=42,
)

In [11]:
# Train on the held-in interactions, describing each item by its tag indicators.
model = model.fit(
    train,
    item_features=feature_ratings,
    epochs=NUM_EPOCHS,
    num_threads=NUM_THREADS,
)

In [12]:
# Precision@10 averaged over the users that have test interactions.
# NOTE(review): train_interactions is not passed, so items a user already rated in
# `train` are not excluded from the ranking -- confirm this is intended.
prec_score = precision_at_k(
    model,
    test,
    item_features=feature_ratings,
    k=10,
    num_threads=NUM_THREADS,
).mean()

In [13]:
# Recall@10 averaged over the users that have test interactions.
# The result gets its own name: binding it to `recall_at_k` would overwrite the
# imported function, so re-running this cell would fail with
# "'numpy.float...' object is not callable".
recall_score = recall_at_k(
    model,
    test,
    num_threads=NUM_THREADS,
    k=10,
    item_features=feature_ratings,
).mean()

print(recall_score, prec_score)

0.03690568130684941 0.08045786


In [14]:
# Per-item biases and latent vectors, computed from the tag feature matrix.
item_representations = model.get_item_representations(features=feature_ratings)
item_biases, item_embeddings = item_representations

In [15]:
# Create the HNSW graph index used for nearest-neighbour search (cosine similarity).
# `nmslib` is imported in the notebook's import cell at the top.
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')

# Add every item embedding to the index, then build it.
# No explicit ids are passed; presumably NMSLIB numbers the points 0..n-1 in row
# order, so a returned id is the row index into item_embeddings -- TODO confirm.
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

In [16]:
# Helper for querying the nearest-neighbour graph.
def nearest_books_nms(book_id, index, n=10, embeddings=None):
    """Return the ``n`` nearest neighbours of a book from a k-NN index.

    Parameters
    ----------
    book_id : int
        Row index of the query book in the embedding matrix.
    index : object
        Index exposing ``knnQuery(vector, k=...)`` (e.g. the NMSLIB index
        built in this notebook).
    n : int, default 10
        Number of neighbours to request.
    embeddings : indexable, optional
        Matrix the query vector is taken from. Defaults to the notebook-level
        ``item_embeddings``, which keeps existing three-argument calls working.

    Returns
    -------
    Whatever ``index.knnQuery`` returns, unchanged.
    """
    if embeddings is None:
        # Fall back to the embeddings computed earlier in the notebook.
        embeddings = item_embeddings
    return index.knnQuery(embeddings[book_id], k=n)

In [17]:
# Find the book_id of "1984"; rows with a missing original_title never match.
books.loc[books['original_title'].str.contains('1984', regex=False, na=False)].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...


In [18]:
# 846 is the book_id of "Animal Farm & 1984" found by the title lookup above.
# Element [0] of the query result is kept (presumably the neighbour ids -- confirm).
nbm = nearest_books_nms(book_id=846, index=nms_idx)[0]

In [20]:
# Find the book_id of "The Silence of the Lambs"; missing titles never match.
books.loc[
    books['original_title'].str.contains('Silence of the Lambs', regex=False, na=False)
].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


In [21]:
# 209 is the book_id of "The Silence of the Lambs" found by the title lookup above.
# Element [0] of the query result is kept (presumably the neighbour ids -- confirm).
nbm2 = nearest_books_nms(book_id=209, index=nms_idx)[0]

In [22]:
# Show the metadata of the neighbouring books. Rows come back in `books` order,
# not in the order the neighbours were returned by the index.
books.loc[books['book_id'].isin(nbm2)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...
430,431,28877,28877,925503,191,525945563,9780526000000.0,Thomas Harris,1981.0,Red Dragon,...,194013,205433,3309,3012,7790,43235,80662,70734,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
767,768,21686,21686,1234227,134,038073186X,9780381000000.0,Dennis Lehane,2003.0,Shutter Island,...,113718,124032,6990,1636,4727,22089,49875,45705,https://images.gr-assets.com/books/1329269081m...,https://images.gr-assets.com/books/1329269081s...
981,982,40024,40024,2266643,70,812976142,9780813000000.0,Caleb Carr,1994.0,The Alienist,...,96981,100908,4026,1798,4571,18715,37572,38252,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
1484,1485,21704,21704,434,93,446698873,9780447000000.0,James Ellroy,1987.0,The Black Dahlia,...,61412,65404,1944,1999,4999,17641,23340,17425,https://images.gr-assets.com/books/1387048173m...,https://images.gr-assets.com/books/1387048173s...
1801,1802,32418,32418,2992500,132,99297701,9780099000000.0,Thomas Harris,1999.0,Hannibal,...,57569,63555,2098,2166,5811,17220,20844,17514,https://images.gr-assets.com/books/1327356556m...,https://images.gr-assets.com/books/1327356556s...
4421,4422,32416,32416,46673,94,385339410,9780385000000.0,Thomas Harris,2006.0,Hannibal Rising,...,22767,25973,1317,1468,3733,8087,7174,5511,https://images.gr-assets.com/books/1394208690m...,https://images.gr-assets.com/books/1394208690s...
5312,5313,21727,21727,593515,46,307279952,9780307000000.0,Scott B. Smith,1993.0,A Simple Plan,...,18628,19650,986,478,1086,4239,7690,6157,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
6258,6259,22026,22026,812174,94,345441702,9780345000000.0,Mario Puzo,1984.0,The Sicilian,...,13800,15057,483,114,705,3708,5956,4574,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
