In [325]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix

In [326]:
df = pd.read_csv("../data/cleaned_book_ratings_plus.csv")
df["user_id"] = df["user_id"].astype(str)
df["isbn"] = df["isbn"].astype(str)
books=df.drop_duplicates(subset='isbn')[['isbn','book_rating','title','author','year','publisher','img_url','num_of_rating','avg_book_rate','weighted_score']]
users=df.drop_duplicates(subset='user_id')[['user_id','user_age','location','fav_author','fav_publisher']]

print("rows,cols:", df.shape)

rows,cols: (11895, 16)


In [328]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11895 entries, 0 to 11894
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      11895 non-null  int64  
 1   user_id         11895 non-null  object 
 2   isbn            11895 non-null  object 
 3   book_rating     11895 non-null  int64  
 4   location        11895 non-null  object 
 5   user_age        11895 non-null  float64
 6   title           11895 non-null  object 
 7   author          11895 non-null  object 
 8   year            11895 non-null  float64
 9   publisher       11895 non-null  object 
 10  img_url         11895 non-null  object 
 11  num_of_rating   11895 non-null  int64  
 12  fav_author      11895 non-null  object 
 13  fav_publisher   11895 non-null  object 
 14  avg_book_rate   11895 non-null  float64
 15  weighted_score  11895 non-null  float64
dtypes: float64(4), int64(3), object(9)
memory usage: 1.5+ MB


# Popularity filtering

In [329]:
top_rated_books = df.sort_values("weighted_score", ascending=False)
top_rated_books=top_rated_books.drop_duplicates(subset=['isbn'])

In [420]:
def get_top_n_books(n):
    """
    Returns the top n books from top_rated_books DataFrame
    with columns: title, num_of_rating, avg_book_rate, weighted_score.
    """
    return top_rated_books[
        ["title", "num_of_rating", "avg_book_rate", "weighted_score"]
    ].head(n)

In [421]:
get_top_n_books(10)

Unnamed: 0,title,num_of_rating,avg_book_rate,weighted_score
8130,Harry Potter and the Goblet of Fire (Book 4),72,9.512195,9.302108
10187,Harry Potter and the Prisoner of Azkaban (Book 3),70,9.333333,9.146198
1219,"The Return of the King (The Lord of the Rings,...",19,9.789474,9.118784
1658,Where the Red Fern Grows,17,9.588235,8.960036
4348,Harry Potter and the Prisoner of Azkaban (Book 3),70,9.083333,8.935355
789,Harry Potter and the Order of the Phoenix (Boo...,63,9.079365,8.918435
2345,To Kill a Mockingbird,57,9.087719,8.911444
4384,Harry Potter and the Sorcerer's Stone (Harry P...,55,9.090909,8.90884
34,"The Two Towers (The Lord of the Rings, Part 2)",18,9.444444,8.896809
7765,"Lamb : The Gospel According to Biff, Christ's ...",16,9.5,8.889693


# Data Split

Split ratings into training set and test set so that:

- Each user has at least num_of_rates_in_test ratings placed in the test set.

- The rest of the ratings stay in the train set.

This ensures every user appears in both train and test, which is crucial for evaluating recommendation systems.

In [422]:
def train_test_split_one_in_test(df,num_of_rates_in_test=1):
    grouped=df.groupby('user_id')
    train_list=[]
    test_list=[]
    for uid,g in grouped:
        if len(g) <= num_of_rates_in_test:
            test_list.append(g.index.values)
            continue

        test_idx = g.sample(n=num_of_rates_in_test,random_state=42).index
        train_idx = g.index.difference(test_idx)
        test_list.extend(list(test_idx))
        train_list.extend(list(train_idx))

    train_df=df.loc[train_list]
    test_df=df.loc[test_list]
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

In [423]:
train_df, test_df=train_test_split_one_in_test(df)
print("train:", train_df.shape, "test:", test_df.shape)

train: (11293, 16) test: (602, 16)


# Item Based Collaborative Filtering

In [424]:
user_item_matrix = train_df.pivot_table(
    index="user_id", columns="isbn", values="book_rating", fill_value=0
)

In [425]:
user_item_matrix

isbn,002542730X,0060096195,006016848X,0060199652,0060391626,0060392452,0060502258,0060915544,0060916508,0060920084,...,1558743669,1558744150,1558745157,1565122968,1573225517,1573225789,1573229326,1573229571,1592400876,1878424319
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100459,0.0,0.0,0.0,8.0,0.0,9.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## based on Correlation

In [426]:
def recommend_books(book_isbn,min_ratings):
    book_ratings = user_item_matrix[book_isbn]
    similar_books = user_item_matrix.corrwith(book_ratings)
    corr_book = pd.DataFrame(similar_books, columns=["pearsonR"])
    corr_book.dropna(inplace=True)
    
    meta = books.set_index("isbn")[["num_of_rating",'title']]
    corr_book = corr_book.join(meta)

    top_books=corr_book[corr_book['num_of_rating']>=min_ratings]
    top_books = top_books.sort_values("pearsonR", ascending=False).head(10)

    return top_books
# recommended_books = recommend_books('059035342X', 100)
# display(recommended_books)

## based on Cosine similarity


In [427]:

sparse_user_item = csr_matrix(user_item_matrix.values)

In [428]:
sparse_user_item

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 11293 stored elements and shape (602, 629)>

In [429]:
# item-user matrix (transpose so rows = items, cols = users)
item_user_matrix = sparse_user_item.T  # shape: n_items × n_users

# cosine similarity between items
item_sim_matrix = cosine_similarity(item_user_matrix, dense_output=False)

# map ISBNs to column indices
isbn_list = user_item_matrix.columns.tolist()
isbn_to_idx = {isbn: i for i, isbn in enumerate(isbn_list)}

In [430]:
def recommend_books_cosine(book_isbn, min_ratings=50, top_n=10):
    if book_isbn not in isbn_to_idx:
        return pd.DataFrame()  # return empty DataFrame instead of None

    idx = isbn_to_idx[book_isbn]
    # similarity scores for this book vs all others
    sim_scores = item_sim_matrix[idx].toarray().ravel()

    # build DataFrame of scores (use "similarity" instead of "cosine_sim")
    corr_book = pd.DataFrame({"isbn": isbn_list, "similarity": sim_scores})

    # drop self (the book itself will have sim=1)
    corr_book = corr_book[corr_book["isbn"] != book_isbn]

    # join metadata (safe join)
    meta = books.set_index("isbn")[["num_of_rating", "title"]]
    corr_book = corr_book.set_index("isbn").join(meta, how="inner").reset_index()

    # filter and sort
    top_books = corr_book[corr_book["num_of_rating"] >= min_ratings]
    top_books = top_books.sort_values("similarity", ascending=False).head(top_n)

    return top_books


# Example
recommended_books = recommend_books_cosine("059035342X", min_ratings=100)
display(recommended_books)

Unnamed: 0,isbn,similarity,num_of_rating,title
105,316666343,0.118338,114,The Lovely Bones: A Novel


### User Recommendation Based On ItemCF

- Compute **cosine similarity** between items (columns of the user–item matrix).  
- For a target user:
  1. Look at books the user rated highly.  
  2. Find similar books (using cosine similarity).  
  3. Aggregate across multiple liked books.  
  4. Recommend the most similar unseen books.  

In [431]:
def recommend_books_itemcf_for_user(user_id, train_df, top_n=10):
    # 1. Get all books the user rated highly
    user_books = train_df[train_df["user_id"] == user_id]
    user_books = user_books[user_books["book_rating"] >= 4]["isbn"]

    recs = pd.DataFrame()
    for isbn in user_books:
        recs = pd.concat([recs, recommend_books_cosine(isbn, top_n=5)])

    # aggregate by similarity score, drop books already seen
    recs = recs.groupby("isbn")["similarity"].mean().reset_index()
    recs = recs[~recs["isbn"].isin(user_books)]
    return recs.sort_values("similarity", ascending=False).head(top_n)

# User Based Collaborative Filtering using Cosine Similarity

- Compute **cosine similarity** between users (rows of the user–item matrix).  
- For a target user:
  1. Find the most similar users.  
  2. Aggregate their ratings (weighted by similarity).  
  3. Recommend items the target user hasn’t seen.  

In [432]:
# user-item matrix (rows = users, cols = items)

# cosine similarity between users
user_sim_matrix = cosine_similarity(
    sparse_user_item, dense_output=False
)  # n_users × n_users

# mappings for user_id
user_list = user_item_matrix.index.tolist()
user_to_idx = {uid: i for i, uid in enumerate(user_list)}
idx_to_user = {i: uid for uid, i in user_to_idx.items()}

isbn_list = user_item_matrix.columns.tolist()
isbn_to_idx = {isbn: i for i, isbn in enumerate(isbn_list)}
idx_to_isbn = {i: isbn for isbn, i in isbn_to_idx.items()}

In [433]:
# Precompute lookups (outside function, once)
isbn_to_title = dict(zip(books["isbn"], books["title"]))
isbn_to_num = dict(zip(books["isbn"], books["num_of_rating"]))


def recommend_books_userCF(user_id, top_n=10, min_ratings=50):
    if user_id not in user_to_idx:
        return None

    uidx = user_to_idx[user_id]

    # similarity scores between this user and all others
    sim_scores = user_sim_matrix[uidx].toarray().ravel()

    # find top similar users (excluding self)
    similar_users = np.argsort(sim_scores)[::-1][1:25]

    # collect seen items
    seen_items = set(sparse_user_item[uidx].indices)

    scores = {}
    for sim_u in similar_users:
        weight = sim_scores[sim_u]
        # get only nonzero entries for this user
        sim_user_items = sparse_user_item[sim_u].indices
        sim_user_ratings = sparse_user_item[sim_u].data
        for item_idx, rating in zip(sim_user_items, sim_user_ratings):
            if item_idx not in seen_items:
                scores[item_idx] = scores.get(item_idx, 0) + weight * rating

    # sort candidate books
    ranked_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # build dataframe
    recs = []
    for item_idx, score in ranked_items:
        isbn = idx_to_isbn[item_idx]
        if isbn_to_num.get(isbn, 0) >= min_ratings:
            recs.append((isbn, isbn_to_title.get(isbn, "Unknown"), score))
        if len(recs) >= top_n:
            break

    return pd.DataFrame(recs, columns=["isbn", "title", "score"])

In [434]:
recommended_books = recommend_books_userCF("6251")
display(recommended_books)

Unnamed: 0,isbn,title,score
0,0439064864,Harry Potter and the Chamber of Secrets (Book 2),26.379893
1,0446310786,To Kill a Mockingbird,8.06036
2,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,5.851753
3,0316666343,The Lovely Bones: A Novel,5.795726
4,0312195516,The Red Tent (Bestselling Backlist),5.020276
5,0142001740,The Secret Life of Bees,4.064008
6,0312278586,The Nanny Diaries: A Novel,3.381336
7,0446672211,Where the Heart Is (Oprah's Book Club (Paperba...,3.052909
8,0385504209,The Da Vinci Code,2.987927
9,0345370775,Jurassic Park,2.839533


# Other Approach 

### Overview
The content-based recommender suggests books to a user based on the features of the books they have liked in the past.
It builds a user profile by aggregating the features of previously liked books and recommends books that are most similar in content.

### Steps

#### 1. Feature Extraction
- **Title:** TF-IDF vectorization (`min_df=2`, `max_df=0.7`)  
- **Author & Publisher:** One-hot encoding  
- **Year:** Min-Max scaling  

#### 2. User Profile Construction
- Select books the user rated above a threshold (e.g., 5).  
- Compute the mean vector of the selected books’ features to represent the user profile.  

#### 3. Recommendation
- Compute cosine similarity between the user profile and all other books.  
- Exclude books the user has already rated.  
- Return the top-k most similar books.


In [440]:
# create tfidfvectorizer to generate features for titles
# min_df = 2 -> make a word features if it only accours at least twice
# max_df = 0.7 -> if word appear in more than 70% of titles then ingore it
tfidvec = TfidfVectorizer(min_df=2, max_df=0.7)
vectorized_titles = tfidvec.fit_transform(books.title)

# create datafram and put features for each book in it
books_features = pd.DataFrame(
    vectorized_titles.toarray(),
    columns=tfidvec.get_feature_names_out(),
    index=books["isbn"],
)

In [441]:
books_features.head()

Unnamed: 0_level_0,20th,2nd,about,adventure,agency,air,alex,all,america,american,...,women,world,wrinkle,ya,year,years,you,young,your,zone
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0061009059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0316776963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0345413903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0385424736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [442]:
# encode author using one hot encoding and add it as features
encoder = OneHotEncoder()
encoded_authors = encoder.fit_transform(books[["author"]])
encoded_authors = pd.DataFrame(
    encoded_authors.toarray(),
    columns=encoder.get_feature_names_out(),
    index=books["isbn"],
)
books_features = pd.concat((books_features, encoded_authors), axis=1)

In [443]:
# encode publisher using one hot encoding and add it as features
encoder = OneHotEncoder()
encoded_publisher = encoder.fit_transform(books[["publisher"]])
encoded_publisher = pd.DataFrame(
    encoded_publisher.toarray(),
    columns=encoder.get_feature_names_out(),
    index=books["isbn"],
)
books_features = pd.concat((books_features, encoded_publisher), axis=1)

In [444]:
# for year we use minmaxscaler as it is numerical column
scaler = MinMaxScaler()
scaled_numric = scaler.fit_transform(books[["year"]])
scaled_numric = pd.DataFrame(scaled_numric, columns=["year"], index=books["isbn"])
books_features = pd.concat((books_features, scaled_numric), axis=1)

In [445]:
books_features

Unnamed: 0_level_0,20th,2nd,about,adventure,agency,air,alex,all,america,american,...,publisher_Warner Books,publisher_Warner Forever,publisher_Warner Vision,publisher_Washington Square Press,publisher_William Morrow &amp; Company,publisher_Workman Pub Co,publisher_Workman Publishing,publisher_Yearling,publisher_Yearling Books,year
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.787234
0061009059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.808511
0316776963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.936170
0345413903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.978723
0385424736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.808511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0451167538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000
0671038443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.914894
0060920084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.463045,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.702128
0553211404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.553191


In [446]:
def find_similar_books_content_based(book_isbn, num_books):
    # compute cosine similarity between the given book and all other books
    sim_scores = cosine_similarity(
        books_features.loc[book_isbn].values.reshape(1, -1),
        books_features.drop(book_isbn, axis=0),
    )

    # convert similarity scores into DataFrame with book ISBNs as index
    sim_scores = pd.DataFrame(
        sim_scores[0],
        index=books_features.drop(book_isbn, axis=0).index,
        columns=["score"],
    )

    # return top N most similar books sorted by similarity score
    return sim_scores.sort_values("score", ascending=False).head(num_books)

In [447]:
find_similar_books_content_based("059035342X", 10)

Unnamed: 0_level_0,score
isbn,Unnamed: 1_level_1
0590353403,0.704027
043935806X,0.667258
0439139600,0.620498
0439064872,0.619743
0439064864,0.617868
0439139597,0.616947
0439136369,0.616134
0439136350,0.612406
0425154092,0.304576
0345459202,0.269766


In [448]:
def recommend_for_user_content_based(user_id, num_books):
    # keep only books the user rated above 5
    user_pervious_books = train_df[
        (train_df["user_id"] == user_id) & (train_df["book_rating"] > 5)
    ]

    # if user has no such books, return empty recommendations
    if user_pervious_books.empty:
        return pd.DataFrame([], columns=["score"])

    # take features for these books
    user_pervious_books = books_features.loc[user_pervious_books["isbn"]]

    # build user profile
    user_pervious_books_mean = user_pervious_books.mean().values.reshape(1, -1)

    # compute similarity with all other books
    sim_scores = cosine_similarity(
        user_pervious_books_mean, books_features.drop(user_pervious_books.index, axis=0)
    )

    # wrap results
    sim_scores = pd.DataFrame(
        sim_scores[0],
        index=books_features.drop(user_pervious_books.index, axis=0).index,
        columns=["score"],
    )

    return sim_scores.sort_values("score", ascending=False).head(num_books).index

## User Based Colaporative Filtering

### Overview
The collaborative filtering recommender suggests books to a user based on the ratings of similar users. It finds users with similar taste and recommends books they liked that the target user hasn’t read yet.

### Steps

#### 1. User-Book Matrix
- Pivot the dataset to create a matrix with `user_id` as rows, `isbn` as columns, and `book_rating` as values.  
- Fill missing ratings with the user’s mean rating.  

#### 2. User Similarity
- Compute cosine similarity between the target user and all other users.  

#### 3. Top-N Recommendations
- Identify top-k most similar users.  
- Compute mean ratings of books from these users.  
- Exclude books already rated by the target user.  
- Return top-k books with the highest mean ratings.


In [452]:
# create pivot tabel user accros book
user_book_pivot = train_df.pivot(index="user_id", columns="isbn", values="book_rating")

In [453]:
# fill null values with mean for each user
user_book_pivot = user_book_pivot.apply(lambda row: row.fillna(row.mean()), axis=1)

In [454]:
def recommend_for_user_colaporative_filtering(user_id, num_books):
    # Compute cosine similarity between the target user and all other users
    sim = cosine_similarity(
        user_book_pivot.loc[user_id].values.reshape(1, -1),
        user_book_pivot.drop(user_id, axis=0).values,
    )

    # Store similarities in a DataFrame, indexed by user_id
    users_score = pd.DataFrame(
        sim.reshape(-1, 1),
        columns=["score"],
        index=user_book_pivot.drop(user_id, axis=0).index,
    )

    # Pick top-5 most similar users
    top_users = users_score.sort_values("score", ascending=False).head(25).index

    # Average their ratings for each book
    mean_ratings = user_book_pivot.loc[top_users].mean(axis=0)

    # Exclude user rated books
    mean_ratings = mean_ratings[
        ~mean_ratings.index.isin(train_df[train_df["user_id"] == user_id]["isbn"])
    ]

    # Return top books with highest mean rating
    return mean_ratings.sort_values(ascending=False).head(num_books).index

# Evaluation

## Evaluation Metrics for Recommender Systems

When evaluating recommendation models, we want to measure **how accurate** and **how well-ranked** the suggested items are.  
We use three common metrics: **Precision@K, Recall@K, and NDCG@K**.



### 1. Precision@K
- **Definition:** Fraction of the top-𝐾 recommended items that are actually relevant (appear in the user’s test set).
- **Formula:**
  $$
  \text{Precision@K} = \frac{\text{ of relevant items in top-K}}{K}
  $$
- **Intuition:** "Out of the top-𝐾 books I recommended, how many were correct?"

High precision means the system recommends fewer irrelevant items.



### 2. Recall@K
- **Definition:** Fraction of the user’s relevant items that were successfully recommended in the top-𝐾.
- **Formula:**
  $$
  \text{Recall@K} = \frac{\text{of relevant items in top-K}}{\text{Total of relevant items}}
  $$
- **Intuition:** "How many of the books the user actually liked did I manage to recommend?"

High recall means the system captures more of the user’s true interests.



### 3. NDCG@K (Normalized Discounted Cumulative Gain)
- **Definition:** Measures ranking quality by giving **higher weight to relevant items at the top** of the recommendation list.
- **Formula:**
  $$
  DCG@K = \sum_{i=1}^K \frac{rel_i}{\log_2(i+1)}
  $$
  $$
  NDCG@K = \frac{DCG@K}{IDCG@K}
  $$
  where:
  - \(rel_i = 1\) if the item at rank *i* is relevant, else 0
  - \(IDCG@K\) = best possible DCG if all relevant items were ranked perfectly at the top
- **Intuition:** "Did I rank the relevant books at the top of the list, where the user is most likely to notice them?"

✅ NDCG ranges from 0 to 1.  
- 1 = perfect ranking (all relevant items at the top).  
- 0 = no relevant items found.



### 🔑 Summary
- **Precision@K** → Accuracy of the recommendations.  
- **Recall@K** → Coverage of relevant items.  
- **NDCG@K** → Ranking quality (position matters).  


In [458]:
def recommend_books_userCF_list(user_id, top_n=10, min_ratings=50):
    recs = recommend_books_userCF(user_id, top_n, min_ratings)
    if recs is None or recs.empty:
        return []
    return recs["isbn"].tolist()


def recommend_books_itemcf_list(user_id, top_n=10):
    recs = recommend_books_itemcf_for_user(user_id, train_df, top_n=top_n)
    if recs is None or recs.empty:
        return []
    return recs["isbn"].tolist()


def recommend_books_content_list(user_id, top_n=10):
    recs = recommend_for_user_content_based(user_id, top_n)
    if recs is None or len(recs) == 0:
        return []
    return list(recs)


def recommend_for_user_colaborative_list(user_id, top_n=10):
    try:
        recs = recommend_for_user_colaporative_filtering(user_id, num_books=top_n)
    except KeyError:
        return []
    return list(recs)

In [459]:
def evaluate_model_list(recommender_fn, users, K=10):
    precisions, recalls, ndcgs = [], [], []
    for uid in users:
        if uid not in gt:
            continue
        true_items = gt[uid]
        pred_items = recommender_fn(uid, top_n=K)
        if not pred_items:
            continue

        # hits
        hits = [1 if item in true_items else 0 for item in pred_items]
        n_hits = sum(hits)

        # precision & recall
        prec = n_hits / K
        rec = n_hits / len(true_items) if len(true_items) > 0 else 0

        # NDCG
        dcg = sum([hits[i] / math.log2(i + 2) for i in range(len(hits))])
        ideal_hits = min(len(true_items), K)
        idcg = sum([1.0 / math.log2(i + 2) for i in range(ideal_hits)])
        ndcg = dcg / idcg if idcg > 0 else 0

        precisions.append(prec)
        recalls.append(rec)
        ndcgs.append(ndcg)

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

In [460]:
gt = test_df.groupby("user_id")["isbn"].apply(set).to_dict()

In [463]:
sample_users = list(gt.keys())[:200]

print("Evaluating on", len(sample_users), "users...")

p_u, r_u, n_u = evaluate_model_list(recommend_books_userCF_list, sample_users, K=20)
p_cf, r_cf, n_cf = evaluate_model_list(
    recommend_for_user_colaborative_list, sample_users, K=20
)
p_i, r_i, n_i = evaluate_model_list(recommend_books_itemcf_list, sample_users, K=20)
p_c, r_c, n_c = evaluate_model_list(recommend_books_content_list, sample_users, K=20)

print(f"UserCF   → Precision@20: {p_u:.3f}, Recall@20: {r_u:.3f}, NDCG@20: {n_u:.3f}")
print(
    f"Pivot-based UserCF → Precision@20: {p_cf:.3f}, Recall@20: {r_cf:.3f}, NDCG@20: {n_cf:.3f}"
)
print(f"ItemCF   → Precision@20: {p_i:.3f}, Recall@20: {r_i:.3f}, NDCG@20: {n_i:.3f}")
print(f"Content  → Precision@20: {p_c:.3f}, Recall@20: {r_c:.3f}, NDCG@20: {n_c:.3f}")

Evaluating on 200 users...
UserCF   → Precision@20: 0.006, Recall@20: 0.125, NDCG@20: 0.053
Pivot-based UserCF → Precision@20: 0.003, Recall@20: 0.060, NDCG@20: 0.018
ItemCF   → Precision@20: 0.007, Recall@20: 0.130, NDCG@20: 0.062
Content  → Precision@20: 0.008, Recall@20: 0.161, NDCG@20: 0.071


# Summary of Implemented Recommendation Functions

We built **four different recommender system functions**, each with a unique approach to generating book suggestions:

---

## 1. `recommend_books_userCF_list`
- **Approach:** User-Based Collaborative Filtering (sparse matrix + cosine similarity).
- **How it works:**
  - Computes similarities between users based on their ratings.
  - Finds top-k most similar users.
  - Aggregates ratings of similar users to recommend new books.
- **Strengths:**
  - Personalized recommendations based on community behavior.
  - Efficient due to sparse matrix operations.


## 2. `recommend_for_user_colaborative_list`
- **Approach:** User-Based Collaborative Filtering (pivot table + mean imputation).
- **How it works:**
  - Builds a full user–item matrix with missing values filled by each user’s mean rating.
  - Computes cosine similarity between users.
  - Uses ratings of top-k similar users to recommend books.
- **Strengths:**
  - Conceptually simple and easier to implement.
  - Handles missing values via mean-filling.


## 3. `recommend_books_itemcf_list`
- **Approach:** Item-Based Collaborative Filtering.
- **How it works:**
  - Computes cosine similarity between items (books) based on user rating patterns.
  - For a target user, finds books similar to the ones they rated highly.
  - Aggregates similarity scores across multiple liked books.
- **Strengths:**
  - Effective when item similarities are strong.
  - Recommendations are often interpretable ("similar to book X").


## 4. `recommend_books_content_list`
- **Approach:** Content-Based Filtering.
- **How it works:**
  - Extracts features from books (TF-IDF of title, one-hot encoding of author/publisher, scaled year).
  - Builds a user profile from features of books they rated highly.
  - Recommends books most similar in feature space.
- **Strengths:**
  - Can recommend new/unrated books (cold-start friendly for items).
  - Personalized to user’s preferences based on book metadata.
