In [197]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

In [198]:
df = pd.read_csv("../data/cleaned_book_ratings_plus.csv")
books=df.drop_duplicates(subset='isbn')
print("rows,cols:", df.shape)

rows,cols: (98605, 16)


In [199]:
books.duplicated('title').sum()

1024

In [200]:
df["user_id"] = df["user_id"].astype(str)
df["isbn"] = df["isbn"].astype(str)

In [201]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98605 entries, 0 to 98604
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      98605 non-null  int64  
 1   user_id         98605 non-null  object 
 2   isbn            98605 non-null  object 
 3   book_rating     98605 non-null  int64  
 4   location        98605 non-null  object 
 5   user_age        98605 non-null  float64
 6   title           98605 non-null  object 
 7   author          98605 non-null  object 
 8   year            98605 non-null  float64
 9   publisher       98605 non-null  object 
 10  img_url         98605 non-null  object 
 11  num_of_rating   98605 non-null  int64  
 12  fav_author      98605 non-null  object 
 13  fav_publisher   98605 non-null  object 
 14  avg_book_rate   98605 non-null  float64
 15  weighted_score  98605 non-null  float64
dtypes: float64(4), int64(3), object(9)
memory usage: 12.0+ MB


# Popularity filtering

In [202]:
top_rated_books = df.sort_values("weighted_score", ascending=False)
top_rated_books=top_rated_books.drop_duplicates(subset=['isbn'])

In [203]:
def get_top_n_books(n):
    """
    Returns the top n books from top_rated_books DataFrame
    with columns: title, num_of_rating, avg_book_rate, weighted_score.
    """
    return top_rated_books[
        ["title", "num_of_rating", "avg_book_rate", "weighted_score"]
    ].head(n)

In [204]:
get_top_n_books(10)

Unnamed: 0,title,num_of_rating,avg_book_rate,weighted_score
63324,Harry Potter and the Goblet of Fire (Book 4),171,9.463918,8.936166
26932,Harry Potter and the Prisoner of Azkaban (Book 3),192,9.300971,8.863494
42903,"The Two Towers (The Lord of the Rings, Part 2)",90,9.75,8.839573
91888,Harry Potter and the Chamber of Secrets (Book 2),221,9.166667,8.808137
35755,"The Two Towers (The Lord of the Rings, Part 2)",90,9.6875,8.807613
14299,Harry Potter and the Sorcerer's Stone (Book 1),117,9.333333,8.720517
30408,Harry Potter and the Goblet of Fire (Book 4),171,9.112676,8.702461
92986,Harry Potter and the Chamber of Secrets (Book 2),221,9.010204,8.695505
41984,Harry Potter and the Prisoner of Azkaban (Book 3),192,9.057471,8.695321
32521,A Time to Kill,125,9.25,8.694383


# Data Split

Split ratings into training set and test set so that:

- Each user has at least num_of_rates_in_test ratings placed in the test set.

- The rest of the ratings stay in the train set.

This ensures every user appears in both train and test, which is crucial for evaluating recommendation systems.

In [205]:
num_users_one_rate = df.groupby('user_id').size().eq(2).sum()
print("Number of users with only one rating:", num_users_one_rate)

Number of users with only one rating: 0


In [206]:
def train_test_split_one_in_test(df,num_of_rates_in_test=1):
    grouped=df.groupby('user_id')
    train_list=[]
    test_list=[]
    for uid,g in grouped:
        if len(g) <= num_of_rates_in_test:
            test_list.append(g.index.values)
            continue

        test_idx = g.sample(n=num_of_rates_in_test,random_state=42).index
        train_idx = g.index.difference(test_idx)
        test_list.extend(list(test_idx))
        train_list.extend(list(train_idx))

    train_df=df.loc[train_list]
    test_df=df.loc[test_list]
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

In [207]:
train_df, test_df=train_test_split_one_in_test(df)
print("train:", train_df.shape, "test:", test_df.shape)

train: (90402, 16) test: (8203, 16)


# Item Based Collaborative Filtering

In [208]:
user_item_matrix = train_df.pivot_table(
    index="user_id", columns="isbn", values="book_rating", fill_value=0
)

In [209]:
user_item_matrix

isbn,0002251760,0002550563,0003300277,000648302X,0006485200,0006551971,0006742939,0007110928,0007154615,000716226X,...,8495359537,8495501090,8495501198,8495501465,849550152X,8495618605,950491036X,9505156642,9505156944,9871138148
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## based on Correlation

- Compute **cosine similarity** between items (columns of the user–item matrix).  
- For a target user:
  1. Look at books the user rated highly.  
  2. Find similar books (using cosine similarity).  
  3. Aggregate across multiple liked books.  
  4. Recommend the most similar unseen books.  

In [210]:
def recommend_books(book_isbn,min_ratings):
    book_ratings = user_item_matrix[book_isbn]
    similar_books = user_item_matrix.corrwith(book_ratings)
    corr_book = pd.DataFrame(similar_books, columns=["pearsonR"])
    corr_book.dropna(inplace=True)
    
    meta = books.set_index("isbn")[["num_of_rating",'title']]
    corr_book = corr_book.join(meta)

    top_books=corr_book[corr_book['num_of_rating']>=min_ratings]
    top_books = top_books.sort_values("pearsonR", ascending=False).head(10)

    return top_books
recommended_books = recommend_books('059035342X', 100)
display(recommended_books)

Unnamed: 0_level_0,pearsonR,num_of_rating,title
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
059035342X,1.0,179,Harry Potter and the Sorcerer's Stone (Harry P...
0439064872,0.321557,221,Harry Potter and the Chamber of Secrets (Book 2)
0439136369,0.273287,192,Harry Potter and the Prisoner of Azkaban (Book 3)
0439139597,0.17189,171,Harry Potter and the Goblet of Fire (Book 4)
043935806X,0.157028,151,Harry Potter and the Order of the Phoenix (Boo...
0439139600,0.152477,171,Harry Potter and the Goblet of Fire (Book 4)
0439136350,0.149267,192,Harry Potter and the Prisoner of Azkaban (Book 3)
0439064864,0.102244,221,Harry Potter and the Chamber of Secrets (Book 2)
0618260269,0.078183,118,The Fellowship of the Ring (The Lord of the Ri...
0345339703,0.070397,118,The Fellowship of the Ring (The Lord of the Ri...


## based on Cosine similarity


In [211]:

sparse_user_item = csr_matrix(user_item_matrix.values)

In [212]:
sparse_user_item

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 90402 stored elements and shape (8203, 10135)>

In [213]:
# item-user matrix (transpose so rows = items, cols = users)
item_user_matrix = sparse_user_item.T  # shape: n_items × n_users

# cosine similarity between items
item_sim_matrix = cosine_similarity(item_user_matrix, dense_output=False)

# map ISBNs to column indices
isbn_list = user_item_matrix.columns.tolist()
isbn_to_idx = {isbn: i for i, isbn in enumerate(isbn_list)}

In [214]:
def recommend_books_cosine(book_isbn, min_ratings=50, top_n=10):
    if book_isbn not in isbn_to_idx:
        return pd.DataFrame()  # return empty DataFrame instead of None

    idx = isbn_to_idx[book_isbn]
    # similarity scores for this book vs all others
    sim_scores = item_sim_matrix[idx].toarray().ravel()

    # build DataFrame of scores (use "similarity" instead of "cosine_sim")
    corr_book = pd.DataFrame({"isbn": isbn_list, "similarity": sim_scores})

    # drop self (the book itself will have sim=1)
    corr_book = corr_book[corr_book["isbn"] != book_isbn]

    # join metadata (safe join)
    meta = books.set_index("isbn")[["num_of_rating", "title"]]
    corr_book = corr_book.set_index("isbn").join(meta, how="inner").reset_index()

    # filter and sort
    top_books = corr_book[corr_book["num_of_rating"] >= min_ratings]
    top_books = top_books.sort_values("similarity", ascending=False).head(top_n)

    return top_books


# Example
recommended_books = recommend_books_cosine("059035342X", min_ratings=100)
display(recommended_books)

Unnamed: 0,isbn,similarity,num_of_rating,title
3961,0439064872,0.331849,221,Harry Potter and the Chamber of Secrets (Book 2)
3968,0439136369,0.282681,192,Harry Potter and the Prisoner of Azkaban (Book 3)
3969,0439139597,0.184271,171,Harry Potter and the Goblet of Fire (Book 4)
3973,043935806X,0.172441,151,Harry Potter and the Order of the Phoenix (Boo...
3970,0439139600,0.163163,171,Harry Potter and the Goblet of Fire (Book 4)
3967,0439136350,0.161913,192,Harry Potter and the Prisoner of Azkaban (Book 3)
3960,0439064864,0.11533,221,Harry Potter and the Chamber of Secrets (Book 2)
1767,0345339703,0.081822,118,The Fellowship of the Ring (The Lord of the Ri...
6798,0618260269,0.080425,118,The Fellowship of the Ring (The Lord of the Ri...
1761,0345337662,0.073407,152,Interview with the Vampire


### User Recommendation Based On ItemCF

In [215]:
def recommend_books_itemcf_for_user(user_id, train_df, top_n=10):
    # 1. Get all books the user rated highly
    user_books = train_df[train_df["user_id"] == user_id]
    user_books = user_books[user_books["book_rating"] >= 4]["isbn"]

    recs = pd.DataFrame()
    for isbn in user_books:
        recs = pd.concat([recs, recommend_books_cosine(isbn, top_n=5)])

    # aggregate by similarity score, drop books already seen
    recs = recs.groupby("isbn")["similarity"].mean().reset_index()
    recs = recs[~recs["isbn"].isin(user_books)]
    return recs.sort_values("similarity", ascending=False).head(top_n)

# User Based Collaborative Filtering using Cosine Similarity

- Compute **cosine similarity** between users (rows of the user–item matrix).  
- For a target user:
  1. Find the most similar users.  
  2. Aggregate their ratings (weighted by similarity).  
  3. Recommend items the target user hasn’t seen.  

In [216]:
# user-item matrix (rows = users, cols = items)

# cosine similarity between users
user_sim_matrix = cosine_similarity(
    sparse_user_item, dense_output=False
)  # n_users × n_users

# mappings for user_id
user_list = user_item_matrix.index.tolist()
user_to_idx = {uid: i for i, uid in enumerate(user_list)}
idx_to_user = {i: uid for uid, i in user_to_idx.items()}

isbn_list = user_item_matrix.columns.tolist()
isbn_to_idx = {isbn: i for i, isbn in enumerate(isbn_list)}
idx_to_isbn = {i: isbn for isbn, i in isbn_to_idx.items()}

In [217]:
def recommend_books_userCF(user_id, top_n=10, min_ratings=50):
    if user_id not in user_to_idx:
        return None

    uidx = user_to_idx[user_id]

    # similarity scores between this user and all others
    sim_scores = user_sim_matrix[uidx].toarray().ravel()

    # find top similar users (excluding self)
    similar_users = np.argsort(sim_scores)[::-1][1:50]  # take top 50 similar users

    # collect candidate books
    user_vector = sparse_user_item[uidx].toarray().ravel()
    seen_items = set(np.where(user_vector > 0)[0])

    scores = {}
    for sim_u in similar_users:
        weight = sim_scores[sim_u]
        sim_user_vector = sparse_user_item[sim_u].toarray().ravel()
        for item_idx, rating in enumerate(sim_user_vector):
            if rating > 0 and item_idx not in seen_items:
                scores[item_idx] = scores.get(item_idx, 0) + weight * rating

    # sort candidate books
    ranked_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # convert back to ISBN
    recs = []
    for item_idx, score in ranked_items:
        isbn = idx_to_isbn[item_idx]
        # join metadata
        title = books.loc[books["isbn"] == isbn, "title"].values[0]
        num_ratings = books.loc[books["isbn"] == isbn, "num_of_rating"].values[0]
        if num_ratings >= min_ratings:
            recs.append((isbn, title, score))
        if len(recs) >= top_n:
            break

    return pd.DataFrame(recs, columns=["isbn", "title", "score"])

In [218]:
recommended_books = recommend_books_userCF("6251")
display(recommended_books)

Unnamed: 0,isbn,title,score
0,0439064864,Harry Potter and the Chamber of Secrets (Book 2),14.864618
1,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),12.334247
2,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,7.391636
3,0439064872,Harry Potter and the Chamber of Secrets (Book 2),3.16073
4,0156027321,Life of Pi,2.400377
5,043936213X,Harry Potter and the Sorcerer's Stone (Book 1),2.374774
6,0440418321,"The Golden Compass (His Dark Materials, Book 1)",1.691836
7,0064407667,The Bad Beginning (A Series of Unfortunate Eve...,1.689
8,067088300X,The Girls' Guide to Hunting and Fishing,1.683472
9,0345339703,The Fellowship of the Ring (The Lord of the Ri...,1.509865


# evaluation

In [219]:
gt = test_df.groupby("user_id")["isbn"].apply(set).to_dict()

In [220]:
import math
import numpy as np


def evaluate_model(recommender_fn, users, K=10):
    precisions, recalls, ndcgs = [], [], []

    for uid in users:
        if uid not in gt:
            continue
        true_items = gt[uid]
        preds = recommender_fn(uid, top_n=K)

        if preds is None or preds.empty:
            continue
        pred_items = preds["isbn"].tolist()

        # hits
        hits = [1 if item in true_items else 0 for item in pred_items]
        n_hits = sum(hits)

        # precision
        prec = n_hits / K
        # recall
        rec = n_hits / len(true_items)

        # NDCG
        dcg = sum([hits[i] / math.log2(i + 2) for i in range(len(hits))])
        ideal_hits = min(len(true_items), K)
        idcg = sum([1.0 / math.log2(i + 2) for i in range(ideal_hits)])
        ndcg = dcg / idcg if idcg > 0 else 0

        precisions.append(prec)
        recalls.append(rec)
        ndcgs.append(ndcg)

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

In [221]:
def usercf_wrapper(uid, top_n=10):
    recs = recommend_books_userCF(uid, top_n=top_n, min_ratings=50)
    return recs

def recommend_books_itemcf_for_user_wrapper(uid, top_n=10):
    recs = recommend_books_itemcf_for_user(uid, train_df, top_n=top_n)
    return recs

## Evaluation Metrics for Recommender Systems

When evaluating recommendation models, we want to measure **how accurate** and **how well-ranked** the suggested items are.  
We use three common metrics: **Precision@K, Recall@K, and NDCG@K**.



### 1. Precision@K
- **Definition:** Fraction of the top-𝐾 recommended items that are actually relevant (appear in the user’s test set).
- **Formula:**
  $$
  \text{Precision@K} = \frac{\text{ of relevant items in top-K}}{K}
  $$
- **Intuition:** "Out of the top-𝐾 books I recommended, how many were correct?"

High precision means the system recommends fewer irrelevant items.



### 2. Recall@K
- **Definition:** Fraction of the user’s relevant items that were successfully recommended in the top-𝐾.
- **Formula:**
  $$
  \text{Recall@K} = \frac{\text{of relevant items in top-K}}{\text{Total of relevant items}}
  $$
- **Intuition:** "How many of the books the user actually liked did I manage to recommend?"

High recall means the system captures more of the user’s true interests.



### 3. NDCG@K (Normalized Discounted Cumulative Gain)
- **Definition:** Measures ranking quality by giving **higher weight to relevant items at the top** of the recommendation list.
- **Formula:**
  $$
  DCG@K = \sum_{i=1}^K \frac{rel_i}{\log_2(i+1)}
  $$
  $$
  NDCG@K = \frac{DCG@K}{IDCG@K}
  $$
  where:
  - \(rel_i = 1\) if the item at rank *i* is relevant, else 0
  - \(IDCG@K\) = best possible DCG if all relevant items were ranked perfectly at the top
- **Intuition:** "Did I rank the relevant books at the top of the list, where the user is most likely to notice them?"

✅ NDCG ranges from 0 to 1.  
- 1 = perfect ranking (all relevant items at the top).  
- 0 = no relevant items found.



### 🔑 Summary
- **Precision@K** → Accuracy of the recommendations.  
- **Recall@K** → Coverage of relevant items.  
- **NDCG@K** → Ranking quality (position matters).  


In [226]:


# pick a sample of users for speed
sample_users = list(gt.keys())[:500]

# evaluate both models
print("Evaluating on", len(sample_users), "users...")
prec_user, rec_user, ndcg_user = evaluate_model(usercf_wrapper, sample_users, K=20)
prec3, rec3, ndcg3 = evaluate_model(recommend_books_itemcf_for_user_wrapper, sample_users, K=20)

print(
    f"UserCF → Precision@10: {prec_user:.3f}, Recall@10: {rec_user:.3f}, NDCG@10: {ndcg_user:.3f}"
)
print(
    f"ItemCF for User → Precision@10: {prec3:.3f}, Recall@10: {rec3:.3f}, NDCG@10: {ndcg3:.3f}"
)

Evaluating on 500 users...


KeyboardInterrupt: 