In [14]:
import implicit
import pandas as pd
from scipy.sparse import csr_matrix
import random
import numpy as np

ratings_df = pd.read_csv(
    "data/Ratings.csv",
)  # ~1.15 mio samples
books_df = pd.read_csv("data/Books.csv")  # ~271 k samples

isbn_to_index = {isbn: idx for idx, isbn in enumerate(books_df["ISBN"].unique())} # covers all isbns from books
user_to_index = {user: idx for idx, user in enumerate(ratings_df["user_id"].unique())} # covers all users from ratings
index_to_isbn = {i: isbn for isbn, i in isbn_to_index.items()} # contains 

# create indices of books in df
ratings_df["isbn_idx"] = ratings_df["isbn"].map(isbn_to_index) 
books_df["isbn_idx"] = books_df["ISBN"].map(isbn_to_index)

# create indices of users in df
ratings_df["user_idx"] = ratings_df["user_id"].map(user_to_index) 

"""Summary of Data
- Books.csv contains all books rated in Ratings.csv, format: ISBN, Book-Title, Book-Author, ...
- Ratings.csv contains all ratings, format: user_id, isbn, rating
- Users.csv contains information about the users linked to ratings, format: User-ID, Location, Age
"""

  books_df = pd.read_csv("data/Books.csv")  # ~271 k samples


'Summary of Data\n- Books.csv contains all books rated in Ratings.csv, format: ISBN, Book-Title, Book-Author, ...\n- Ratings.csv contains all ratings, format: user_id, isbn, rating\n- Users.csv contains information about the users linked to ratings, format: User-ID, Location, Age\n'

In [15]:
# create sparse user-item matrix
row_indices = ratings_df["user_idx"].values
col_indices = ratings_df["isbn_idx"].values
data = ratings_df["rating"].values

# compressed sparse row matrix, as desired by `implicit`
sparse_user_item_matrix = csr_matrix(
    (data, (row_indices, col_indices)), shape=(len(user_to_index), len(isbn_to_index))
)

  self.coords = tuple(np.array(idx, copy=copy, dtype=idx_dtype)


In [16]:
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(sparse_user_item_matrix)
"Training complete"

100%|██████████| 15/15 [00:24<00:00,  1.63s/it]


'Training complete'

In [17]:
def get_similar_books(isbn, n=5):
    isbn_idx = isbn_to_index[isbn]
    similar_items, similarities = model.similar_items(isbn_idx, n)  # returns ([*idxs], [*scores])
    similar_books = [
        books_df[books_df["isbn_idx"] == idx] for idx in similar_items
    ]
    return similar_books, similarities

def compute_book_similarity(isbn1, isbn2):
    isbn_idx_1 = isbn_to_index[isbn1]
    isbn_idx_2 = isbn_to_index[isbn2]

    embedding_1 = model.item_factors[isbn_idx_1]
    embedding_2 = model.item_factors[isbn_idx_2]

    similarity = np.dot(embedding_1, embedding_2) / (np.linalg.norm(embedding_1) * np.linalg.norm(embedding_2))
    return float(similarity)

In [18]:
# pass a random book from Books.csv and return similar books
isbn = random.choice(list(index_to_isbn.values()))

similar_books, similarities = get_similar_books(isbn)

book = books_df[books_df["ISBN"] == isbn]
book_title = book["Book-Title"].values[0]
similar_books_formatted = [
    (similar_book["Book-Title"].values[0], float(sim))
    for similar_book, sim in zip(similar_books, similarities)
]

book_title, similar_books_formatted

('Cupid Connection  (By Request) (Harlequin by Request)',
 [('Cupid Connection  (By Request) (Harlequin by Request)',
   1.0000001192092896),
  ('Grand reportage', 0.9999961256980896),
  ('Funny Cats', 0.9999956488609314),
  ('Virus', 0.9999955296516418),
  ('A Scandalous Engagement (Thorndike Harlequin I Romance)',
   0.9999952912330627)])

In [19]:
# return books similar to Harry Potter 1 by J. K. Rowling
hp_1_isbn = "0590353403"

recommendations = get_similar_books(hp_1_isbn)

hp_1 = books_df[books_df["ISBN"] == hp_1_isbn]
hp_1_title = hp_1["Book-Title"].values[0]
similar_books = [
    (book["Book-Title"].values[0], float(sim)) for book, sim in zip(*recommendations)
]
hp_1_title, similar_books

("Harry Potter and the Sorcerer's Stone (Book 1)",
 [("Harry Potter and the Sorcerer's Stone (Book 1)", 1.0),
  ('Sugar Ray Leonard', 0.9683141112327576),
  ('Harry Potter and the Chamber of Secrets (Book 2)', 0.9617767333984375),
  ('Treasures from the Royal Tombs of Ur', 0.9547505378723145),
  ('Thomas the Tank Engine: The Complete Collection (Railway Series)',
   0.952839732170105)])

In [20]:
# return books similar to The Power Of Now by Eckhart Tolle
pon_isbn = "1577311523"

recommendations = get_similar_books(pon_isbn)

pon = books_df[books_df["ISBN"] == pon_isbn]
pon_title = pon["Book-Title"].values[0]
similar_books = [
    (book["Book-Title"].values[0], float(sim)) for book, sim in zip(*recommendations)
]
pon_title, similar_books

('The Power of Now: A Guide to Spiritual Enlightenment',
 [('The Power of Now: A Guide to Spiritual Enlightenment', 1.0),
  ('Empowerment Through Reiki', 0.8555602431297302),
  ('Astral Travel for Beginners (For Beginners)', 0.8470998406410217),
  ('Elementary Tarot', 0.8458982110023499),
  ('Everyday Grace: Having Hope, Finding Forgiveness, and Making Miracles',
   0.8441833853721619)])

In [21]:
# compare Dune 1 to Dune 2 (expecting high similarity)
dune_1_isbn = "0425080021"
dune_2_isbn = "0441172695"

similarity = compute_book_similarity(dune_1_isbn, dune_2_isbn)
print(similarity)

# compare Dune 1 to LOTR 1 (expecting moderate similarity)
lotr_1_isbn = "0345339703"
similarity = compute_book_similarity(dune_1_isbn, lotr_1_isbn)
print(similarity)

# compare Dune 1 to The Power Of Now (expecting low similarity)
pon_isbn = "1577311523"
similarity = compute_book_similarity(dune_1_isbn, pon_isbn)
print(similarity)

0.5740084648132324
0.31848710775375366
0.018669774755835533


In [22]:
# create new .csv with isbn, title, author, embedding
item_factors = model.item_factors
isbn_to_embedding = {
    isbn: item_factors[isbn_to_index[isbn]]
    for isbn in isbn_to_index.keys()
}
latent_factors_df = books_df[["ISBN", "Book-Title", "Book-Author"]]
latent_factors_df["embedding"] = latent_factors_df["ISBN"].map(isbn_to_embedding)

# save books_with_embeddings.csv
latent_factors_df.to_csv("data/books_with_embeddings.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latent_factors_df["embedding"] = latent_factors_df["ISBN"].map(isbn_to_embedding)
