In [20]:
import implicit
import pandas as pd
from scipy.sparse import csr_matrix
import random
import numpy as np

data_dir = "archive"

ratings_df = pd.read_csv(
    f"{data_dir}/Books_rating.csv",
)  # ~1.15 mio samples
books_df = pd.read_csv(f"{data_dir}/books_data.csv")  # ~271 k samples

title_to_index = {title: idx for idx, title in enumerate(books_df["Title"].unique())} # covers all isbns from books
user_to_index = {user: idx for idx, user in enumerate(ratings_df["User_id"].unique())} # covers all users from ratings

# create indices of books in df
ratings_df["title_idx"] = ratings_df["Title"].map(title_to_index) 
books_df["title_idx"] = books_df["Title"].map(title_to_index)

# create indices of users in df
ratings_df["user_idx"] = ratings_df["User_id"].map(user_to_index) 

"""Summary of Kaggle Data
- Books.csv contains all books rated in Ratings.csv, format: ISBN, Book-Title, Book-Author, ...
- Ratings.csv contains all ratings, format: user_id, isbn, rating
- Users.csv contains information about the users linked to ratings, format: User-ID, Location, Age
"""

"""Summary of Amazon Data
- Books from books_data.csv and Books_rating are connected via title
"""

'Summary of Amazon Data\n- Books from books_data.csv and Books_rating are connected via title\n'

In [21]:
# create sparse user-item matrix
row_indices = ratings_df["user_idx"].values
col_indices = ratings_df["title_idx"].values
data = ratings_df["review/score"].values

# compressed sparse row matrix, as desired by `implicit`
sparse_user_item_matrix = csr_matrix(
    (data, (row_indices, col_indices)), shape=(len(user_to_index), len(title_to_index))
)

In [22]:
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(sparse_user_item_matrix)
"Training complete"

100%|██████████| 15/15 [01:34<00:00,  6.30s/it]


'Training complete'

In [23]:
def get_book_recommendations(title, n=5):
    title_idx = title_to_index[title]
    similar_items, scores = model.similar_items(title_idx, n)  # returns ([*idxs], [*scores])
    recommendations = [
        books_df[books_df["title_idx"] == idx] for idx in similar_items
    ]
    return recommendations, scores

def compute_recommendation_score(title1, title2):
    title_idx_1 = title_to_index[title1]
    title_idx_2 = title_to_index[title2]

    embedding_1 = model.item_factors[title_idx_1]
    embedding_2 = model.item_factors[title_idx_2]

    similarity = float(np.dot(embedding_1, embedding_2) / (np.linalg.norm(embedding_1) * np.linalg.norm(embedding_2)))
    return similarity

In [26]:
# pass a random book from Books.csv and return similar books
book_title = random.choice(list(title_to_index.keys()))

recommendations, scores = get_book_recommendations(book_title)

book_title = books_df[books_df["Title"] == book_title]
book_title = book_title["Title"].values[0]
similar_books_formatted = [
    (similar_book["Title"].values[0], float(sim))
    for similar_book, sim in zip(recommendations, scores)
]

book_title, similar_books_formatted

('Retraced',
 [('Retraced', 0.9999998807907104),
  ('The Looking Heart: Poetic Expressions from Within', 0.9999772310256958),
  ('Reborn', 0.9999348521232605),
  ('The Language of Saxophones : Selected Poems of Kamau Daood',
   0.9998949766159058),
  ('A Brownstone in Brooklyn', 0.9998934268951416)])

In [27]:
# return books similar to Harry Potter 1 by J. K. Rowling
book_title = "Harry Potter and The Sorcerer's Stone"

recommendations = get_book_recommendations(book_title)

hp_1 = books_df[books_df["Title"] == book_title]
hp_1_title = hp_1["Title"].values[0]
recommendations = [
    (book["Title"].values[0], float(sim)) for book, sim in zip(*recommendations)
]
hp_1_title, recommendations

("Harry Potter and The Sorcerer's Stone",
 [("Harry Potter and The Sorcerer's Stone", 1.0),
  ('God and Production in a Guatemalan Town (Texas Pan American Series)',
   0.9941427111625671),
  ('50 Hikes in Central Florida: Hikes, Walks, and Backpacks in the Heart of the Peninsula',
   0.9939441680908203),
  ('The Usborne History of the Twentieth Century (History of the Modern World)',
   0.9936173558235168),
  ("Killer Clown of King's County (Bone Chillers)", 0.9936156272888184)])

In [28]:
# return books similar to The Power Of Now by Eckhart Tolle
book_title = "NEW EARTH"

recommendations = get_book_recommendations(book_title)

pon = books_df[books_df["Title"] == book_title]
pon_title = pon["Title"].values[0]
recommendations = [
    (book["Title"].values[0], float(sim)) for book, sim in zip(*recommendations)
]
pon_title, recommendations

('NEW EARTH',
 [('NEW EARTH', 1.0000001192092896),
  ('The book of woodcraft', 0.9984115362167358),
  ('The book of woodcraft,: With 500 drawings,', 0.9984114766120911),
  ('Larousse Pocket Student Dictionary French-English/ English-French (French Edition)',
   0.9980927109718323),
  ('Poker Secrets From Poker Champs DVD', 0.9980213642120361)])

In [33]:
# compare HP 1 to HP 2 (expecting high similarity)
hp_1 = "Harry Potter and The Sorcerer's Stone"
hp_2 = "Harry Potter and the Chamber of Secrets"

similarity = compute_recommendation_score(hp_1, hp_2)
print(similarity)

# compare HP 1 to LOTR (expecting moderate similarity)
lotr = "The Fellowship of the Ring"
similarity = compute_recommendation_score(hp_1, lotr)
print(similarity)

# compare HP 1 to New Earth (expecting low similarity)
new_earth = "NEW EARTH"
similarity = compute_recommendation_score(hp_1, new_earth)
print(similarity)

# compare How to Win Friends and Influence People to Seven Habits of Highly Effective People (expecting high similarity)
htwf = 'How to Win Friends & Influence People (Cardinal Editions, C 303)'
seven_habits = "The 7 Habits of Highly Effective People (50 card deck)"
similarity = compute_recommendation_score(htwf, seven_habits)
print(similarity)

0.9364229440689087
0.14312927424907684
0.04937087371945381
0.18868790566921234


In [22]:
# create new .csv with isbn, title, author, embedding
item_factors = model.item_factors
isbn_to_embedding = {
    isbn: item_factors[title_to_index[isbn]]
    for isbn in title_to_index.keys()
}
latent_factors_df = books_df[["Title", "authors"]]
latent_factors_df["embedding"] = latent_factors_df["Title"].map(isbn_to_embedding)

# save books_with_embeddings.csv
latent_factors_df.to_csv(f"{data_dir}/books_with_embeddings.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latent_factors_df["embedding"] = latent_factors_df["ISBN"].map(isbn_to_embedding)
