In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
import random
from surprise import dump
from surprise import SVD
from surprise import Dataset, Reader
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df_books = pd.read_csv('/.spyder-py3/files/input/books.csv')
df_books = df_books.drop_duplicates(subset = ['original_title'])
df_books = df_books.dropna(subset = ['original_title'])

In [3]:
df_ratings = pd.read_csv('/.spyder-py3/files/input/ratings.csv')
df_ratings = df_ratings.drop_duplicates()
df_ratings = df_ratings.dropna()

In [4]:
df_books = df_books[df_books.book_id.isin(df_ratings.book_id)]
df_ratings = df_ratings[df_ratings.book_id.isin(df_books.book_id)]

In [5]:
df = df_ratings.copy()

In [6]:
df = df.sample(frac = 1, random_state = 42)

threshold = int(.9 * len(df))
train_ratings = df[:threshold]
test_ratings = df[threshold:]                  

reader = Reader(rating_scale = (1, 5))
data = Dataset.load_from_df(train_ratings[['user_id','book_id','rating']], reader)

In [7]:
train_set = data.build_full_trainset()
#algo = SVD(n_epochs=10, verbose = True, random_state=42)
#algo.fit(train_set)


from surprise import dump
#dump.dump('/.spyder-py3/files/input/' + 'svd',algo = algo)
_, algo  = dump.load('/.spyder-py3/files/input/'  + 'svd')

In [8]:
id = 27


print('Similar books to :', df_books[df_books.book_id == id]['original_title'].values[0])


iid = train_set.to_inner_iid(id)

dtypes = np.dtype([
          ('original_title', str),
          ('authors', str),
          ('sim', float),
          ])

simbooks = pd.DataFrame(np.empty(0, dtype=dtypes))

book_embedding = algo.qi
current_book = book_embedding[iid, :].reshape(1, -1)
total_books = book_embedding.shape[0]
print()
for i in range(total_books):
    sim = cosine_similarity(current_book, book_embedding[i,:].reshape(1, -1))
    rawid = train_set.to_raw_iid(i)
    book_title = df_books[df_books.book_id == rawid]['original_title'].values[0]
    authors = df_books[df_books.book_id == rawid]['authors'].values[0]

    simbooks = simbooks.append({'original_title' : book_title, 'authors' : authors, 
                      'sim' : sim }, ignore_index = True)
    
simbooks = simbooks.sort_values('sim', ascending = False)

var = simbooks.head(100)
simbooks.head(20)

Similar books to : Harry Potter and the Half-Blood Prince



Unnamed: 0,original_title,authors,sim
103,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré",[[1.0000000000000002]]
190,Harry Potter and the Deathly Hallows,"J.K. Rowling, Mary GrandPré",[[0.9623025816285091]]
115,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré",[[0.9565844028706822]]
315,Harry Potter and the Goblet of Fire,"J.K. Rowling, Mary GrandPré",[[0.9559845454865237]]
61,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré, Rufus Beck",[[0.9427376235331739]]
263,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré",[[0.929397234585021]]
231,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré",[[0.8725985243564407]]
2939,Complete Harry Potter Boxed Set,J.K. Rowling,[[0.6762299023804533]]
269,The Hunger Games,Suzanne Collins,[[0.4414095034124614]]
313,Catching Fire,Suzanne Collins,[[0.3701416492164164]]


In [9]:
df_genre = pd.read_csv('/.spyder-py3/files/input/genres.csv')

df_genre['genre'] = df_genre['genre'].str.split(',')
df_genre.head()

book_genre_df = df_genre[['original_title', 'genre']]
book_genre_df = book_genre_df.explode('genre').reset_index(drop=True)
book_genre_df.head(8)

merger = pd.merge(var, book_genre_df,on='original_title')

book_cross_table = pd.crosstab(merger['original_title'], merger['genre'])

jaccard_distances = pdist(book_cross_table.values, metric='jaccard')

jaccard_similarity_array = 1 - squareform(jaccard_distances)

jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array, index=book_cross_table.index, columns=book_cross_table.index)

jaccard_similarity_series = jaccard_similarity_df.loc[df_books[df_books.book_id == id]['original_title'].values[0]]

ordered_similarities = jaccard_similarity_series.sort_values(ascending=False)

print('Similar books to :', df_books[df_books.book_id == id]['original_title'].values[0])
ordered_similarities.head(20)

Similar books to : Harry Potter and the Half-Blood Prince


original_title
Harry Potter and the Chamber of Secrets         1.0
Harry Potter and the Deathly Hallows            1.0
Harry Potter Collection (Harry Potter, #1-6)    1.0
Harry Potter Boxed Set Books 1-4                1.0
Harry Potter and the Half-Blood Prince          1.0
Harry Potter and the Order of the Phoenix       1.0
Harry Potter and the Philosopher's Stone        1.0
Harry Potter and the Prisoner of Azkaban        1.0
Howl's Moving Castle                            1.0
Eragon                                          1.0
The Sea of Monsters                             1.0
Eldest                                          1.0
The Tale of the Body Thief                      1.0
The Tales of Beedle the Bard                    1.0
The Titan's Curse                               1.0
Brisingr                                        1.0
The Last Olympian                               1.0
The Battle of the Labyrinth                     1.0
Ancestors of Avalon                             1