In [26]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import AlgoBase
from collections import defaultdict
from sentence_transformers import util

In [None]:
# LOAD THE REAL BOOK DATA
# The catalogue is read from a CSV file located next to the notebook.
df_books = pd.read_csv("Books_df.csv")

# Later cells key everything on 'book_id'. When the CSV does not provide that
# column, synthesise one as the first column using the row positions 0..n-1.
if 'book_id' not in df_books:
    df_books.insert(loc=0, column='book_id', value=range(len(df_books)))

def create_metadata_text(row):
    """Render one book record as a single pipe-separated metadata string.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'Title', 'Author', 'Main Genre' and
        'Sub Genre' (for example a DataFrame row).

    Returns
    -------
    str
        "Title: ... | Author: ... | Main Genre: ... | Sub Genre: ..."
    """
    fields = ("Title", "Author", "Main Genre", "Sub Genre")
    # Each field is rendered as "<label>: <value>"; the label doubles as the key.
    return " | ".join(f"{label}: {row[label]}" for label in fields)
# Build one metadata string per book (row-wise apply) and store it in a new
# 'text_for_embedding' column of df_books.
df_books['text_for_embedding'] = df_books.apply(create_metadata_text, axis=1)

In [28]:
# CREATE EMBEDDINGS FOR EACH BOOK
# All metadata strings are encoded in one batch; the i-th entry of `texts`
# (and therefore the i-th row of df_books, in positional order) produces the
# i-th row of `embeddings_tensor`.
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = df_books["text_for_embedding"].tolist()
embeddings_tensor = model.encode(texts, convert_to_tensor=True)

# Map book_id -> embedding by POSITION rather than by DataFrame index label.
# Index labels only coincide with positions for a default RangeIndex; after any
# filtering, sorting or set_index on df_books a label-based lookup would pair
# books with the wrong embedding or raise an IndexError.
book_embeddings = {
    b_id: embeddings_tensor[position]
    for position, b_id in enumerate(df_books["book_id"].tolist())
}

In [None]:
# SYNTHETICALLY GENERATE RATINGS FOR `num_users` USERS
num_users = 20
prob_rate = 0.4  # probability that a given user rates a given book

np.random.seed(42)  # fixed seed so every run draws the same ratings

all_book_ids = df_books['book_id'].tolist()

# Visit every (user, book) pair, users outermost. For each pair one uniform
# draw decides whether the pair is kept; only kept pairs consume a second draw
# for the rating itself (integer in 1..5, upper bound of randint is exclusive).
# The draw order matters for reproducibility, so the filter must come first.
ratings_data = [
    (user_id, book_id, np.random.randint(1, 6))
    for user_id in range(num_users)
    for book_id in all_book_ids
    if np.random.rand() < prob_rate
]

df_ratings = pd.DataFrame(ratings_data, columns=["user_id", "book_id", "rating"])
print(f"Synthetic Ratings: {df_ratings.shape[0]} total ratings from {num_users} users.")

Synthetic Ratings: 63291 total ratings from 20 users.


In [30]:
# CREATE A SURPRISE DATASET
# Surprise expects the columns in (user, item, rating) order and needs to be
# told the rating scale, here 1 to 5.
rating_columns = ["user_id", "book_id", "rating"]
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[rating_columns], reader)

# Hold out 20% of the ratings for evaluation; the split is seeded.
trainset, testset = train_test_split(data, random_state=42, test_size=0.2)

In [31]:
# DEFINE A CUSTOM EMBEDDING-BASED ALGO
class EmbeddingBased(AlgoBase):
    """Content-based rating predictor driven by precomputed book embeddings.

    The rating of user ``u`` for a target book is estimated as the
    similarity-weighted mean of the ratings ``u`` gave in the trainset,
    using at most ``k`` rated books: those whose embeddings have the highest
    strictly positive cosine similarity to the target book's embedding.

    Parameters
    ----------
    book_embeddings : mapping
        Raw book id -> embedding vector, in a form accepted by ``util.cos_sim``.
    k : int, default 10
        Maximum number of most-similar rated books used per estimate.
    """

    def __init__(self, book_embeddings, k=10):
        super().__init__()
        self.book_embeddings = book_embeddings
        self.k = k

    def fit(self, trainset):
        """Register ``trainset`` with the base class and return ``self``.

        Nothing is learned here: the embeddings are supplied at construction
        time and only the user's ratings stored in the trainset are needed.
        """
        super().fit(trainset)
        return self

    def estimate(self, u, i):
        """Estimate the rating of inner user id ``u`` for inner item id ``i``.

        Returns the trainset's global mean whenever no similarity-based
        estimate can be formed: the user or item id cannot be mapped back to a
        raw id, the target book has no embedding, or the user has no rated
        book with an embedding and a positive similarity to the target.
        """
        global_mean = self.trainset.global_mean

        # Ids the trainset cannot map back to raw ids raise ValueError; treat
        # them as cold-start cases. The raw user id itself is not needed.
        try:
            self.trainset.to_raw_uid(u)
            book_id = self.trainset.to_raw_iid(i)
        except ValueError:
            return global_mean

        # If no embedding for this book => cold start fallback
        if book_id not in self.book_embeddings:
            return global_mean
        target_emb = self.book_embeddings[book_id]

        # Collect (rating, similarity) for every book this user rated.
        # NOTE: one cos_sim call per rated book, so the cost of a single
        # estimate grows linearly with the number of ratings the user has.
        neighbours = []
        for j_inner, rating_j in self.trainset.ur[u]:
            j_raw = self.trainset.to_raw_iid(j_inner)
            if j_raw not in self.book_embeddings:
                continue
            # cos_sim yields a 1x1 result for a pair of vectors.
            sim_val = float(
                util.cos_sim(target_emb, self.book_embeddings[j_raw])[0][0]
            )
            # Only positive similarities may act as weights. A negative weight
            # makes the "weighted mean" non-convex and can drive the
            # denominator to zero or below, producing estimates far outside
            # the rating scale or with the wrong sign.
            if sim_val > 0:
                neighbours.append((rating_j, sim_val))

        if not neighbours:
            return global_mean

        # Keep the k most similar rated books.
        neighbours.sort(key=lambda pair: pair[1], reverse=True)
        top_k = neighbours[: self.k]

        denominator = sum(s for _, s in top_k)
        # Can only be non-positive when top_k is empty (e.g. k == 0).
        if denominator <= 0:
            return global_mean
        return sum(r * s for r, s in top_k) / denominator

In [32]:
# Fit the embedding-based recommender on the training split (fit returns the
# algorithm itself), predict every held-out rating, and report the RMSE.
algo_llm = EmbeddingBased(book_embeddings, k=10).fit(trainset)
predictions_llm = algo_llm.test(testset)
rmse_llm = accuracy.rmse(predictions_llm, verbose=True)

RMSE: 1.4743


In [33]:
def precision_recall(predictions, k=10, threshold=3.5):
    """Compute mean Precision@k and Recall@k over all users.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each entry unpacks as (uid, iid, true_rating, estimated_rating, details).
    k : int, default 10
        Number of top-ranked items (by estimated rating) considered per user.
    threshold : float, default 3.5
        A true rating at or above this value marks the item as relevant.

    Returns
    -------
    (mean_precision, mean_recall)
        Per-user precision is (relevant items in the top k) / k, or 1 when
        k <= 0. Per-user recall is (relevant items in the top k) / (all
        relevant items of that user), or 1 when the user has none.
    """
    # Group (item, estimate, truth) triples by user.
    per_user = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        per_user[uid].append((iid, est, true_r))

    precisions, recalls = [], []
    for entries in per_user.values():
        # Rank this user's items from highest to lowest estimated rating.
        ranked = sorted(entries, key=lambda entry: entry[1], reverse=True)

        # Relevance flag per ranked item, judged on the TRUE rating only.
        is_relevant = [true_r >= threshold for _, _, true_r in ranked]
        n_rel = sum(is_relevant)
        n_rec_k = sum(is_relevant[:k])

        precisions.append(n_rec_k / k if k > 0 else 1)
        recalls.append(n_rec_k / n_rel if n_rel != 0 else 1)

    return np.mean(precisions), np.mean(recalls)

In [34]:
# Ranking quality of the embedding-based predictions: top-10 items per user,
# with a true rating of 3.5 or more counted as relevant.
p_llm, r_llm = precision_recall(predictions_llm, threshold=3.5, k=10)
print(f"\nPrecision@10: {p_llm:.4f}, Recall@10: {r_llm:.4f}")


Precision@10: 0.4050, Recall@10: 0.0162
