In [90]:
import implicit
import pandas as pd
from scipy.sparse import csr_matrix
import random
import numpy as np

ratings_df = pd.read_csv(
    "data/Ratings.csv",
)  # ~1.15 mio samples
books_df = pd.read_csv("data/Books.csv")  # ~271 k samples

isbn_to_index = {isbn: idx for idx, isbn in enumerate(books_df["ISBN"].unique())}
user_to_index = {user: idx for idx, user in enumerate(ratings_df["user_id"].unique())}
index_to_isbn = {i: isbn for isbn, i in isbn_to_index.items()}

ratings_df["isbn_idx"] = ratings_df["isbn"].map(isbn_to_index)
ratings_df["user_idx"] = ratings_df["user_id"].map(user_to_index)

"""Summary of Data
- Books.csv contains book data points (id is ISBN)
- Ratings.csv contains ISBN, user, rating
- Users.csv contains user id, location, age
"""

  books_df = pd.read_csv("data/Books.csv")  # ~271 k samples


'Summary of Data\n- Books.csv contains book data points (id is ISBN)\n- Ratings.csv contains ISBN, user, rating\n- Users.csv contains user id, location, age\n'

In [91]:
# create sparse user-item matrix
row_indices = ratings_df["user_idx"].values
col_indices = ratings_df["isbn_idx"].values
data = ratings_df["rating"].values

sparse_user_item_matrix = csr_matrix(
    (data, (row_indices, col_indices)), shape=(len(user_to_index), len(isbn_to_index))
)

  self.coords = tuple(np.array(idx, copy=copy, dtype=idx_dtype)


In [92]:
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(sparse_user_item_matrix)
"Training complete"

100%|██████████| 15/15 [00:24<00:00,  1.63s/it]


'Training complete'

In [97]:
def get_similar_books(isbn, n=5):
    item_index = isbn_to_index[isbn]
    similar_items = model.similar_items(item_index, n)  # returns ([*idxs], [*scores])
    similar_books = [
        books_df[books_df["ISBN"] == index_to_isbn[idx]] for idx in similar_items[0]
    ]
    similarities = similar_items[1]
    return similar_books, similarities


def print_similar_books(isbn, n=5):
    recommendations = get_similar_books(isbn)

    book = books_df[books_df["ISBN"] == isbn]
    book_title = book["Book-Title"].values[0]
    similar_books = [
        (book["Book-Title"].values[0], float(sim))
        for book, sim in zip(*recommendations)
    ]

    print(book_title, similar_books)


def compute_book_similarity(isbn1, isbn2):
    index1 = isbn_to_index[isbn1]
    index2 = isbn_to_index[isbn2]

    vec1 = model.item_factors[index1]
    vec2 = model.item_factors[index2]

    similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return float(similarity)

In [101]:
# pass a random book and return similar books
isbn = random.choice(list(index_to_isbn.values()))

recommendations = get_similar_books(isbn)

book = books_df[books_df["ISBN"] == isbn]
book_title = book["Book-Title"].values[0]
similar_books = [
    (book["Book-Title"].values[0], float(sim))
    for book, sim in zip(*recommendations)
]

book_title, similar_books

('Oration on the Dignity of Man',
 [('Oration on the Dignity of Man', 1.0000001192092896),
  ('The Communist Manifesto', 0.9999790191650391),
  ('The Man Who Mistook His Wife For A Hat : And Other Clinical Tales',
   0.9317840933799744),
  ('Reading in the Dark : A Novel', 0.7559765577316284),
  ('The Gnostic Gospels', 0.704039990901947)])

In [102]:
# return books similar to Harry Potter 1
hp_1_isbn = "0590353403"

recommendations = get_similar_books(hp_1_isbn)

hp_1 = books_df[books_df["ISBN"] == hp_1_isbn]
hp_1_title = hp_1["Book-Title"].values[0]
similar_books = [
    (book["Book-Title"].values[0], float(sim)) for book, sim in zip(*recommendations)
]
hp_1_title, similar_books

("Harry Potter and the Sorcerer's Stone (Book 1)",
 [("Harry Potter and the Sorcerer's Stone (Book 1)", 0.9999998807907104),
  ('Harry Potter and the Chamber of Secrets (Book 2)', 0.973667323589325),
  ('Sugar Ray Leonard', 0.966667890548706),
  ('Thomas the Tank Engine: The Complete Collection (Railway Series)',
   0.9560786485671997),
  ('Treasures from the Royal Tombs of Ur', 0.9468263983726501)])

In [103]:
pon_isbn = "1577311523"

recommendations = get_similar_books(pon_isbn)

pon = books_df[books_df["ISBN"] == pon_isbn]
pon_title = pon["Book-Title"].values[0]
similar_books = [
    (book["Book-Title"].values[0], float(sim)) for book, sim in zip(*recommendations)
]
pon_title, similar_books

('The Power of Now: A Guide to Spiritual Enlightenment',
 [('The Power of Now: A Guide to Spiritual Enlightenment', 0.9999999403953552),
  ('The Rose Window and Other Verse from New Poems', 0.8080865144729614),
  ('Kundun: A Biography of the Family of the Dalai Lama', 0.8080853223800659),
  ('Karma and Reincarnation: Transcending Your Past, Transforming Your Future (Pocket Guides to Practical Spirituality Series)',
   0.7722973227500916),
  ('Empowerment Through Reiki', 0.7714309096336365)])

In [104]:
# compare Dune 1 to Dune 2
dune_1_isbn = "0425080021"
dune_2_isbn = "0441172695"

similarity = compute_book_similarity(dune_1_isbn, dune_2_isbn)
print(similarity)

# compare LOTR 2 to Dune 1
lotr_2_isbn = "0345339711"
similarity = compute_book_similarity(dune_1_isbn, lotr_2_isbn)
print(similarity)

0.6051822900772095
0.4695863127708435


In [105]:
# create new .csv with isbn, title, author, embedding
item_factors = model.item_factors
isbn_to_embedding = {
    isbn: item_factors[isbn_to_index[isbn]]
    for isbn in isbn_to_index.keys()
    if isbn_to_index[isbn] < len(item_factors)
}
latent_factors_df = books_df[["ISBN", "Book-Title", "Book-Author"]]
latent_factors_df["embedding"] = latent_factors_df["ISBN"].map(isbn_to_embedding)

# save to CSV
latent_factors_df.to_csv("data/books_with_embeddings.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latent_factors_df["embedding"] = latent_factors_df["ISBN"].map(isbn_to_embedding)
