In [1]:
import implicit
import pandas as pd
from scipy.sparse import csr_matrix
import random
import numpy as np

data_dir = "archive"

ratings_df = pd.read_csv(
    f"{data_dir}/Books_rating.csv",
)
books_df = pd.read_csv(f"{data_dir}/books_data.csv")

title_to_index = {
    title: idx for idx, title in enumerate(books_df["Title"].unique())
}  # covers all isbns from books
user_to_index = {
    user: idx for idx, user in enumerate(ratings_df["User_id"].unique())
}  # covers all users from ratings

# create indices of books in df
ratings_df["title_idx"] = ratings_df["Title"].map(title_to_index)
books_df["title_idx"] = books_df["Title"].map(title_to_index)

# create indices of users in df
ratings_df["user_idx"] = ratings_df["User_id"].map(user_to_index)

"""Summary of Kaggle Data
- Books.csv contains all books rated in Ratings.csv, format: ISBN, Book-Title, Book-Author, ...
- Ratings.csv contains all ratings, format: user_id, isbn, rating
- Users.csv contains information about the users linked to ratings, format: User-ID, Location, Age
"""

"""Summary of Amazon Data
- Books from books_data.csv and Books_rating are connected via title
"""

  from .autonotebook import tqdm as notebook_tqdm


'Summary of Amazon Data\n- Books from books_data.csv and Books_rating are connected via title\n'

In [2]:
# create sparse user-item matrix
row_indices = ratings_df["user_idx"].values
col_indices = ratings_df["title_idx"].values
data = ratings_df["review/score"].values

# compressed sparse row matrix, as desired by `implicit`
sparse_user_item_matrix = csr_matrix(
    (data, (row_indices, col_indices)), shape=(len(user_to_index), len(title_to_index))
)

In [3]:
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(sparse_user_item_matrix)
"Training complete"

  check_blas_config()
100%|██████████| 15/15 [01:13<00:00,  4.88s/it]


'Training complete'

In [7]:
def get_book_recommendations(title, n=10):
    title_idx = title_to_index[title]
    similar_items, scores = model.similar_items(
        title_idx, n
    )  # returns ([*idxs], [*scores])
    recommendations = [books_df[books_df["title_idx"] == idx] for idx in similar_items]
    return recommendations, scores


def compute_recommendation_score(title1, title2):
    title_idx_1 = title_to_index[title1]
    title_idx_2 = title_to_index[title2]

    embedding_1 = model.item_factors[title_idx_1]
    embedding_2 = model.item_factors[title_idx_2]

    similarity = float(
        np.dot(embedding_1, embedding_2)
        / (np.linalg.norm(embedding_1) * np.linalg.norm(embedding_2))
    )
    return similarity

In [10]:
# pass a random book from Books.csv and return similar books
book_title = random.choice(list(title_to_index.keys()))

recommendations, scores = get_book_recommendations(book_title)

book_title = books_df[books_df["Title"] == book_title]
book_title = book_title["Title"].values[0]
similar_books_formatted = [
    (similar_book["Title"].values[0], float(sim))
    for similar_book, sim in zip(recommendations, scores)
]

book_title, similar_books_formatted

('Change Agents Guide to Innovation in Education',
 [('Change Agents Guide to Innovation in Education', 1.0000001192092896),
  ('How Leo Learned to Be King', 0.9387609362602234),
  ('Good City Form', 0.9358565211296082),
  ('BLADUD OF BATH: THE BRITISH KING WHO TRIED TO FLY: EXTRACTS FROM OLD CHRONICLES AND HISTORIES RELATING TO BLADUD, THE NINTH KING OF ENGLAND, TOGETHER WITH SEVERAL PORTRAITS.',
   0.935764729976654),
  ('A guide to Arkansas horse trails', 0.9332634210586548),
  ('Reflections on the Jesus Prayer', 0.9322771430015564),
  ('Yamaha Xt350 & Tt350 1985-2000 (Clymer Motorcycle Repair)',
   0.9314846396446228),
  ('Released to Reign', 0.9310274720191956),
  ("Imagining Philadelphia: Travelers' Views of the City from 1800 to the Present",
   0.9304955005645752),
  ('Pink Floyd: Bricks in the Wall', 0.929036557674408)])

In [11]:
# return books similar to Harry Potter 1 by J. K. Rowling
book_title = "Harry Potter and The Sorcerer's Stone"

recommendations = get_book_recommendations(book_title)

hp_1 = books_df[books_df["Title"] == book_title]
hp_1_title = hp_1["Title"].values[0]
recommendations = [
    (book["Title"].values[0], float(sim)) for book, sim in zip(*recommendations)
]
hp_1_title, recommendations

("Harry Potter and The Sorcerer's Stone",
 [("Harry Potter and The Sorcerer's Stone", 1.0),
  ('God and Production in a Guatemalan Town (Texas Pan American Series)',
   0.9932220578193665),
  ("Killer Clown of King's County (Bone Chillers)", 0.9927816987037659),
  ('Dido and Aeneas', 0.9927751421928406),
  ('Letterman Wit: His Life and Humor', 0.9927705526351929),
  ('The Mystic Coast: A Photographic Portrait', 0.9927697777748108),
  ('The Wayside Motor Inn: A play in two acts', 0.9927555918693542),
  ('Consider the Lilies of the Field: a Novel', 0.9927428960800171),
  ('Getaways For Gourmets In The Northeast', 0.9927042722702026),
  ('Speak to the earth;: Wanderings and reflections among elephants and mountains,',
   0.9926993250846863)])

In [12]:
# return books similar to New Earth by Eckhart Tolle
book_title = "The Hobbit"

recommendations = get_book_recommendations(book_title)

pon = books_df[books_df["Title"] == book_title]
pon_title = pon["Title"].values[0]
recommendations = [
    (book["Title"].values[0], float(sim)) for book, sim in zip(*recommendations)
]
pon_title, recommendations

('The Hobbit',
 [('The Hobbit', 0.9999998807907104),
  ('The Hobbit There and Back Again', 0.9997938871383667),
  ('The Hobbitt, or there and back again; illustrated by the author.',
   0.9997845888137817),
  ('The Hobbit; Or, There and Back Again', 0.9997667074203491),
  ('The Hobbit or There and Back Again', 0.9997621178627014),
  ('Pagan Tarot Kit (English and Spanish Edition)', 0.9846198558807373),
  ("Santa Claus's partner", 0.9838884472846985),
  ('The Phone Book: Telephone Skills for Business Success Student Text',
   0.9838793873786926),
  ('Starting And Running A B And B (2Ed)', 0.98387211561203),
  ('Duel of Wits', 0.983867347240448)])

In [13]:
# compare HP 1 to HP 2 (expecting high similarity)
hp_1 = "Harry Potter and The Sorcerer's Stone"
hp_2 = "Harry Potter and the Chamber of Secrets"

similarity = compute_recommendation_score(hp_1, hp_2)
print(similarity)

# compare HP 1 to LOTR (expecting moderate similarity)
lotr = "The Fellowship of the Ring"
similarity = compute_recommendation_score(hp_1, lotr)
print(similarity)

# compare HP 1 to New Earth (expecting low similarity)
new_earth = "NEW EARTH"
similarity = compute_recommendation_score(hp_1, new_earth)
print(similarity)

# compare How to Win Friends and Influence People to Seven Habits of Highly Effective People (expecting high similarity)
htwf = "How to Win Friends & Influence People (Cardinal Editions, C 303)"
seven_habits = "The 7 Habits of Highly Effective People (50 card deck)"
similarity = compute_recommendation_score(htwf, seven_habits)
print(similarity)

0.9560022354125977
0.14164608716964722
0.05507197231054306
0.3168710470199585


In [48]:
# create new .csv with isbn, title, author, embedding
item_factors = model.item_factors
isbn_to_embedding = {
    isbn: item_factors[title_to_index[isbn]] for isbn in title_to_index.keys()
}
latent_factors_df = books_df[["Title", "authors"]]
latent_factors_df["embedding"] = latent_factors_df["Title"].map(isbn_to_embedding)

# save books_with_embeddings.csv
latent_factors_df.to_csv(f"{data_dir}/books_with_embeddings.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latent_factors_df["embedding"] = latent_factors_df["Title"].map(isbn_to_embedding)
