<h3><b>Импорт используемых библиотек</b></h3>

In [1]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from scipy import sparse

<h3><b>Загрузка датасета</b></h3>
датасет можно скачать: https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset

In [29]:
book_df = pd.read_csv('data/Books.csv')
ratings_df = pd.read_csv('data/Ratings.csv')
user_df = pd.read_csv('data/Users.csv')
user_ratings_df = ratings_df.merge(user_df, left_on = 'User-ID', right_on = 'User-ID')

  exec(code_obj, self.user_global_ns, self.user_ns)


<h3><b>Чистка данных</b></h3>
Т.к. книги, имеющие мало оценок и пассивные пользователи, не ставящие оценки практически не будут особо влиять
на рекомендации, то ставляем только книги с 10-ю оценками и пользователей, которые дали хотя бы 10 оценок.
Также таким  образом облегчается нагрузка при постройки разреженной матрицы.

In [3]:
book_counts = user_ratings_df['ISBN'].value_counts()
popular_books = list(book_counts[book_counts >= 10].index)

ratings_count = user_ratings_df['User-ID'].value_counts()
users_with_many_ratings = list(ratings_count[ratings_count >= 10].index)

user_ratings_df = user_ratings_df[(user_ratings_df['ISBN'].isin(popular_books)) & 
                             user_ratings_df['User-ID'].isin(users_with_many_ratings)]
user_ratings_df.sort_values('User-ID', ascending=True)

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age
9563,8,0002005018,5,"timmins, ontario, canada",
9565,8,0374157065,0,"timmins, ontario, canada",
9567,8,0399135782,0,"timmins, ontario, canada",
9683,99,0786868716,0,"franktown, colorado, usa",42.0
9681,99,067976397X,0,"franktown, colorado, usa",42.0
...,...,...,...,...,...
9536,278851,0440486599,5,"dallas, texas, usa",33.0
9537,278851,0553211439,0,"dallas, texas, usa",33.0
9538,278851,0553277375,0,"dallas, texas, usa",33.0
9545,278851,0894803700,5,"dallas, texas, usa",33.0


In [4]:
book_user_rating = book_df.merge(user_ratings_df, left_on = 'ISBN',right_on = 'ISBN')
book_user_rating = book_user_rating[['ISBN', 'Book-Title', 'Book-Author', 'User-ID', 'Book-Rating']]
book_user_rating.reset_index(drop=True, inplace = True)
book_user_rating.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,User-ID,Book-Rating
0,2005018,Clara Callan,Richard Bruce Wright,8,5
1,2005018,Clara Callan,Richard Bruce Wright,11400,0
2,2005018,Clara Callan,Richard Bruce Wright,11676,8
3,2005018,Clara Callan,Richard Bruce Wright,41385,0
4,2005018,Clara Callan,Richard Bruce Wright,67544,8


Подготавливаем датафрейм для создания разреженной матрицы

In [5]:
d = {}
for i, j in enumerate(book_user_rating.ISBN.unique()):
    d[j] = i
book_user_rating['book_unique_id'] = book_user_rating['ISBN'].map(d)
book_user_rating.reset_index(drop=True, inplace=True)
book_user_rating.head()


Unnamed: 0,ISBN,Book-Title,Book-Author,User-ID,Book-Rating,book_unique_id
0,2005018,Clara Callan,Richard Bruce Wright,8,5,0
1,2005018,Clara Callan,Richard Bruce Wright,11400,0,0
2,2005018,Clara Callan,Richard Bruce Wright,11676,8,0
3,2005018,Clara Callan,Richard Bruce Wright,41385,0,0
4,2005018,Clara Callan,Richard Bruce Wright,67544,8,0


Строим разреженную матрицу

In [6]:
df_pivot = book_user_rating.pivot(index='book_unique_id',
                                  columns='User-ID',
                                  values='Book-Rating').fillna(0)

In [7]:
df_pivot.head()

User-ID,8,99,160,242,243,254,383,384,388,408,...,278535,278536,278554,278563,278582,278633,278637,278771,278843,278851
book_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
df_pivot.shape


(17479, 12617)

Строится вариант разреженной матрицы: Сжатое хранение строкой (CSR — Compressed Sparse Row) 

In [9]:
df_sparse = sparse.csr_matrix(df_pivot)
df_pivot.reset_index(inplace=True)

Рекоммендательная система будет работать на коллаборативной фильтрации по книге
т.е. основываясь на оценке книги, которую дал пользователь будем искать пользователей, давших такую же оценку,
и рекоммендовать книги которые высоко оценили они.
Для этого используем метод ближайших соседей, метрикой будет косинусное расстояние.

In [10]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(df_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [12]:
def get_book_recommendation(book, n_recommend: int):
    '''На вход даётся df с одной книгой для неё находим ближайших соседей,
       и ранжируем по косинусному расстоянию
       на выход возвращаем df содержащий n рекоммендаций'''
    book_unique_id = book['book_unique_id']
    distances, indices = knn.kneighbors(df_sparse[book_unique_id], n_neighbors=n_recommend)
    rec_book_indices = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())),
                                key=lambda x: x[1])[:0:-1]
    recommendations = []
    for val in rec_book_indices:
        book_idx = book_user_rating.iloc[val[0]]['ISBN']
        idx = book_df[book_df['ISBN'] == book_idx].index
        recommendations.append({'ISBN':book_df.iloc[idx]['ISBN'].values[0],
                                'Book-Title':book_df.iloc[idx]['Book-Title'].values[0],
                                'Book-Author':book_df.iloc[idx]['Book-Author'].values[0],
                                'Year-Of-Publication':book_df.iloc[idx]['Year-Of-Publication'].values[0],
                                'Publisher':book_df.iloc[idx]['Publisher'].values[0],
                                'Distance':val[1]})
    df = pd.DataFrame(recommendations, index=range(1, n_recommend))
    return df

In [25]:
def recommend(user_id: int, n_recommend=5):
    '''На вход подаётся id пользователя и желаемое число рекоммендаций'''
    user_df = book_user_rating[book_user_rating['User-ID'] == user_id] #Находим все книги, которые оценивал пользователь
    max_rating = user_df['Book-Rating'].max() 
    max_rated_book = user_df[user_df['Book-Rating'] == max_rating] #На основе самой высокооценённой книги 
                                                                   #будем выдавать рекоммендации
    recommendations = get_book_recommendation(max_rated_book, n_recommend+1)

    return recommendations

In [27]:
recommend(8, n_recommend = 10)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Distance
1,0060987103,Wicked: The Life and Times of the Wicked Witch...,Gregory Maguire,1996,Regan Books,0.552086
2,0425184226,The Sum of All Fears,Tom Clancy,2002,Berkley Publishing Group,0.552086
3,155874262X,Chicken Soup for the Soul (Chicken Soup for th...,Jack Canfield,1993,Health Communications,0.552086
4,038572179X,Atonement : A Novel,IAN MCEWAN,2003,Anchor,0.552086
5,0385416342,The Firm,John Grisham,1991,Doubleday Books,0.552086
6,3257233051,Veronika Deschliesst Zu Sterben / Vernika Deci...,Paolo Coelho,2002,Distribooks,0.552086
7,0142001740,The Secret Life of Bees,Sue Monk Kidd,2003,Penguin Books,0.552086
8,0060188731,Bel Canto,Ann Patchett,2001,HarperCollins Publishers,0.552086
9,0060173289,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,0.552086
10,0451526341,Animal Farm,George Orwell,2004,Signet,0.33551


В данном примере невозможно оценить результат работы рекоммендательной системы, однако в реальной жизни можно было бы проверить эффективность системы с помощью A/B тестов
<p>Основные минусы такого подхода:
<ul>
    <li>Масштабируемость: чем больше пользователей и книг в базе тем больше времени будет требоваться для рекоммендаций
    <li>Проблема холодного старта: подбор рекоммендаций для пользователей или книг с нулём оценок довольно проблематично. В данном случае резонно использовать рекоммендации популярных или новых книг.