In [None]:
import pandas as pd
from scipy.sparse import csr_matrix

In [2]:
# Read the cleaned ratings export; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="../data/cleaned_book_ratings.csv")
# Report (rows, columns), then preview the first five records.
print("Shape:", df.shape)
df.head(n=5)

Shape: (11895, 11)


Unnamed: 0,user_id,isbn,book_rating,location,user_age,title,author,year,publisher,img_url,num_of_rating
0,277427,002542730X,10,"gilbert, arizona, usa",48.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994.0,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,15
1,277427,0061009059,9,"gilbert, arizona, usa",48.0,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995.0,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,26
2,277427,0316776963,8,"gilbert, arizona, usa",48.0,Me Talk Pretty One Day,David Sedaris,2001.0,Back Bay Books,http://images.amazon.com/images/P/0316776963.0...,42
3,277427,0345413903,10,"gilbert, arizona, usa",48.0,The Murder Book,Jonathan Kellerman,2003.0,Ballantine Books,http://images.amazon.com/images/P/0345413903.0...,16
4,277427,0385424736,9,"gilbert, arizona, usa",48.0,The Rainmaker,John Grisham,1995.0,Doubleday Books,http://images.amazon.com/images/P/0385424736.0...,36


# Feature Engineering

## User-level Features

In [3]:
# --- Favorite Author ---
# Mean rating each user gave to each author: one row per (user_id, author) pair.
user_author_mean = df.groupby(["user_id", "author"], as_index=False)["book_rating"].mean()

# Row label of every user's highest mean rating.
# idxmax returns the first matching label when several authors tie for the maximum.
top_author_rows = user_author_mean.groupby("user_id")["book_rating"].idxmax()

# Pick those rows and expose them as (user_id, fav_author) with a fresh 0..n-1 index.
fav_author = (
    user_author_mean.loc[top_author_rows, ["user_id", "author"]]
    .rename(columns={"author": "fav_author"})
    .reset_index(drop=True)
)
fav_author

Unnamed: 0,user_id,fav_author
0,254,Neil Gaiman
1,638,Alice Sebold
2,1435,Amy Tan
3,2766,Harper Lee
4,4017,ANNA QUINDLEN
...,...,...
597,274308,Orson Scott Card
598,276050,J.R.R. TOLKIEN
599,276165,J. K. Rowling
600,276680,Jeffrey Eugenides


In [4]:
# --- Favorite Publisher ---

# How many (non-null) ratings each user gave to each publisher's books.
user_publisher_count = (
    df.groupby(["user_id", "publisher"])["book_rating"]
    .count()
    .rename("count")
    .reset_index()
)

# Row label of every user's most-rated publisher.
# idxmax returns the first matching label when several publishers tie for the maximum.
top_publisher_rows = user_publisher_count.groupby("user_id")["count"].idxmax()

# Pick those rows and expose them as (user_id, fav_publisher) with a fresh 0..n-1 index.
fav_publisher = (
    user_publisher_count.loc[top_publisher_rows, ["user_id", "publisher"]]
    .rename(columns={"publisher": "fav_publisher"})
    .reset_index(drop=True)
)
fav_publisher

Unnamed: 0,user_id,fav_publisher
0,254,Scholastic
1,638,"Little, Brown"
2,1435,Ballantine Books
3,2766,Dell
4,4017,Perennial
...,...,...
597,274308,Ballantine Books
598,276050,Ballantine Books
599,276165,Scholastic
600,276680,Ballantine Books


## Book-level Features

In [5]:
# --- average rating ---
avg_book_rating=df.groupby('isbn')['book_rating'].mean().reset_index(name='avg_book_rate')
avg_book_rating

Unnamed: 0,isbn,avg_book_rate
0,002542730X,7.866667
1,0060096195,8.153846
2,006016848X,7.454545
3,0060199652,8.461538
4,0060391626,8.466667
...,...,...
624,1573225789,8.466667
625,1573229326,7.000000
626,1573229571,8.000000
627,1592400876,8.785714


In [6]:
# Distinct (isbn, num_of_rating) pairs; num_of_rating is taken from the source table as-is.
num_rating = df[['isbn', 'num_of_rating']].drop_duplicates()

# Combine the per-ISBN mean with the rating count.
# validate='one_to_one' makes pandas raise MergeError if any ISBN appears with more than one
# distinct num_of_rating value, instead of silently emitting several rows for that book
# (which would later inflate df_plus when book_info is merged back onto the ratings).
book_info = avg_book_rating.merge(num_rating, on='isbn', how='inner', validate='one_to_one')
book_info

Unnamed: 0,isbn,avg_book_rate,num_of_rating
0,002542730X,7.866667,15
1,0060096195,8.153846,13
2,006016848X,7.454545,11
3,0060199652,8.461538,13
4,0060391626,8.466667,15
...,...,...,...
624,1573225789,8.466667,15
625,1573229326,7.000000,20
626,1573229571,8.000000,15
627,1592400876,8.785714,14



### Popularity / Weighted Score (IMDb-style)

**Formula:**


$$
\text{score} = \frac{v}{v+m}R + \frac{m}{v+m}C
$$

Where:
- **R** = average rating for the book  
- **v** = number of ratings for the book  
- **m** = minimum ratings threshold   
- **C** = global average rating across all ratings in the dataset  

**How it works:**
- If a book has **few ratings**, the score leans closer to **C** (the overall average).  
- If a book has **many ratings**, the score leans closer to its true **R**.  

👉 This prevents books with just a few perfect ratings from unfairly dominating the ranking.


In [7]:
print(book_info['num_of_rating'].describe())
m = 13     # minimum ratings threshold
# - In the describe() output printed by this cell, 13 is the 25th percentile of num_of_rating
#   (min 11, 25% 13, 50% 16, 75% 27), i.e. roughly a quarter of the books have <= 13 ratings.
# - NOTE(review): the previous rationale read "75% of books have <= 13 ratings", which the
#   printed statistics contradict; if the 75th percentile was intended, m would be 27 -- confirm.
# - The threshold keeps books with very few ratings from dominating the weighted score.

C=df['book_rating'].mean()  # mean over every rating row in df (each rating counts once, not each book)

count    629.000000
mean      21.736089
std       13.323522
min       11.000000
25%       13.000000
50%       16.000000
75%       27.000000
max      114.000000
Name: num_of_rating, dtype: float64


In [8]:
# IMDb-style weighted rating: score = v/(v+m) * R + m/(v+m) * C
votes = book_info['num_of_rating']        # v: rating count per book
book_mean = book_info['avg_book_rate']    # R: mean rating per book
total_weight = votes + m                  # v + m, shared denominator of both terms

# Operations are kept in the same order as the formula's two separate terms.
book_info['weighted_score'] = votes * book_mean / total_weight + m * C / total_weight
book_info

Unnamed: 0,isbn,avg_book_rate,num_of_rating,weighted_score
0,002542730X,7.866667,15,7.992896
1,0060096195,8.153846,13,8.146196
2,006016848X,7.454545,11,7.825046
3,0060199652,8.461538,13,8.300042
4,0060391626,8.466667,15,8.314325
...,...,...,...,...
624,1573225789,8.466667,15,8.314325
625,1573229326,7.000000,20,7.448518
626,1573229571,8.000000,15,8.064325
627,1592400876,8.785714,14,8.474115


# New Dataset

In [9]:
# Start the enriched dataset from the ratings frame. Plain assignment binds a second name to
# the same DataFrame object; no copy is made.
df_plus=df

In [10]:
# Attach each user's favorite author and favorite publisher to every rating row.
# The chain starts from `df` rather than from `df_plus` so that re-running this cell yields the
# same frame; merging `df_plus` with the lookup tables a second time would produce suffixed
# duplicate columns (fav_author_x / fav_author_y, fav_publisher_x / fav_publisher_y).
# Left joins keep every rating row even when a user is missing from a lookup table.
df_plus = (
    df.merge(fav_author, on='user_id', how='left')
      .merge(fav_publisher, on='user_id', how='left')
)
df_plus

Unnamed: 0,user_id,isbn,book_rating,location,user_age,title,author,year,publisher,img_url,num_of_rating,fav_author,fav_publisher
0,277427,002542730X,10,"gilbert, arizona, usa",48.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994.0,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,15,James Finn Garner,Anchor
1,277427,0061009059,9,"gilbert, arizona, usa",48.0,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995.0,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,26,James Finn Garner,Anchor
2,277427,0316776963,8,"gilbert, arizona, usa",48.0,Me Talk Pretty One Day,David Sedaris,2001.0,Back Bay Books,http://images.amazon.com/images/P/0316776963.0...,42,James Finn Garner,Anchor
3,277427,0345413903,10,"gilbert, arizona, usa",48.0,The Murder Book,Jonathan Kellerman,2003.0,Ballantine Books,http://images.amazon.com/images/P/0345413903.0...,16,James Finn Garner,Anchor
4,277427,0385424736,9,"gilbert, arizona, usa",48.0,The Rainmaker,John Grisham,1995.0,Doubleday Books,http://images.amazon.com/images/P/0385424736.0...,36,James Finn Garner,Anchor
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11890,276680,0375727132,8,"hopewell junction, new york, usa",55.0,The Dive From Clausen's Pier : A Novel (Vintag...,ANN PACKER,2003.0,Vintage,http://images.amazon.com/images/P/0375727132.0...,13,Jeffrey Eugenides,Ballantine Books
11891,276680,0375727345,8,"hopewell junction, new york, usa",55.0,House of Sand and Fog,Andre Dubus III,2000.0,Vintage Books,http://images.amazon.com/images/P/0375727345.0...,42,Jeffrey Eugenides,Ballantine Books
11892,276680,0385504209,8,"hopewell junction, new york, usa",55.0,The Da Vinci Code,Dan Brown,2003.0,Doubleday,http://images.amazon.com/images/P/0385504209.0...,81,Jeffrey Eugenides,Ballantine Books
11893,276680,0440221595,8,"hopewell junction, new york, usa",55.0,The Glass Lake,Maeve Binchy,1996.0,Dell,http://images.amazon.com/images/P/0440221595.0...,11,Jeffrey Eugenides,Ballantine Books


In [11]:
# df_plus already contains the num_of_rating column, so remove it from book_info before merging.
# drop(..., errors='ignore') keeps the cell safe to re-run: `del book_info['num_of_rating']`
# raises KeyError once the column has already been removed.
book_info = book_info.drop(columns='num_of_rating', errors='ignore')

In [12]:
# Left-join the per-book columns held in book_info onto every rating row, keyed by ISBN.
df_plus = pd.merge(df_plus, book_info, how='left', on='isbn')
df_plus

Unnamed: 0,user_id,isbn,book_rating,location,user_age,title,author,year,publisher,img_url,num_of_rating,fav_author,fav_publisher,avg_book_rate,weighted_score
0,277427,002542730X,10,"gilbert, arizona, usa",48.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994.0,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,15,James Finn Garner,Anchor,7.866667,7.992896
1,277427,0061009059,9,"gilbert, arizona, usa",48.0,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995.0,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,26,James Finn Garner,Anchor,8.115385,8.123105
2,277427,0316776963,8,"gilbert, arizona, usa",48.0,Me Talk Pretty One Day,David Sedaris,2001.0,Back Bay Books,http://images.amazon.com/images/P/0316776963.0...,42,James Finn Garner,Anchor,8.032258,8.057381
3,277427,0345413903,10,"gilbert, arizona, usa",48.0,The Murder Book,Jonathan Kellerman,2003.0,Ballantine Books,http://images.amazon.com/images/P/0345413903.0...,16,James Finn Garner,Anchor,8.000000,8.062107
4,277427,0385424736,9,"gilbert, arizona, usa",48.0,The Rainmaker,John Grisham,1995.0,Doubleday Books,http://images.amazon.com/images/P/0385424736.0...,36,James Finn Garner,Anchor,8.000000,8.036757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11890,276680,0375727132,8,"hopewell junction, new york, usa",55.0,The Dive From Clausen's Pier : A Novel (Vintag...,ANN PACKER,2003.0,Vintage,http://images.amazon.com/images/P/0375727132.0...,13,Jeffrey Eugenides,Ballantine Books,7.769231,7.953888
11891,276680,0375727345,8,"hopewell junction, new york, usa",55.0,House of Sand and Fog,Andre Dubus III,2000.0,Vintage Books,http://images.amazon.com/images/P/0375727345.0...,42,Jeffrey Eugenides,Ballantine Books,7.595238,7.723656
11892,276680,0385504209,8,"hopewell junction, new york, usa",55.0,The Da Vinci Code,Dan Brown,2003.0,Doubleday,http://images.amazon.com/images/P/0385504209.0...,81,Jeffrey Eugenides,Ballantine Books,8.666667,8.593629
11893,276680,0440221595,8,"hopewell junction, new york, usa",55.0,The Glass Lake,Maeve Binchy,1996.0,Dell,http://images.amazon.com/images/P/0440221595.0...,11,Jeffrey Eugenides,Ballantine Books,8.181818,8.158379


In [13]:
# Persist the enriched ratings table.
# index=False leaves the row index out of the file; written by default, it becomes an unnamed
# first column that comes back as "Unnamed: 0" when the file is loaded with pd.read_csv.
df_plus.to_csv('../data/cleaned_book_ratings_plus.csv', index=False)

### 🛠 Feature Engineering: New Columns Added

After feature engineering, the following new features have been added to the dataset:

1. **fav_author**  
   - The author that the user most prefers.  
   - Calculated based on the author for whom the user gave the highest average rating.

2. **fav_publisher**  
   - The publisher that the user most prefers.  
   - Calculated as the publisher from which the user has rated the most books (highest count).

3. **avg_book_rate**  
   - The average rating for each book across all users.  
   - Provides a measure of overall book popularity/quality.

4. **weighted_score**  
   - An IMDb-style weighted rating for each book.  
   - Balances a book’s average rating with the global mean to prevent books with very few ratings from dominating the ranking.


# Converting to Pivot table

## Building User Preference Vectors

### Why?
* To use Collaborative Filtering, we need to represent each user’s preferences in a numerical form. This allows us to measure similarity between users (user-based CF) or between books (item-based CF).

### How?

* Construct a matrix User × Book.

* Rows = users, Columns = books (identified by title in this notebook).

* Values = ratings (or 0 if the user has not rated the book).

### Importance:

* These vectors allow us to calculate user similarity (e.g., cosine similarity).

* Enable recommendation: if two users are similar, recommend to one the books rated highly by the other.

In [14]:
# Users as rows, titles as columns, ratings as cell values; (user, title) pairs with no rating
# are filled with 0. aggfunc="mean" is the pandas default, written out here: several ratings
# by one user for the same title are averaged into a single cell.
user_item_matrix = pd.pivot_table(
    df_plus,
    values="book_rating",
    index="user_id",
    columns="title",
    aggfunc="mean",
    fill_value=0,
)

In [15]:
# Preview the user x title rating matrix (0 marks a title the user has no rating for).
user_item_matrix

title,1984,1st to Die: A Novel,2010: Odyssey Two,2nd Chance,4 Blondes,84 Charing Cross Road,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",A Confederacy of Dunces (Evergreen Book),...,White Oleander : A Novel (Oprah's Book Club),White Teeth: A Novel,Wicked: The Life and Times of the Wicked Witch of the West,Wifey,Wild Animus,"Wizard and Glass (The Dark Tower, Book 4)","Word Freak: Heartbreak, Triumph, Genius, and Obsession in the World of Competitive Scrabble Players",Year of Wonders,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,"\O\"" Is for Outlaw"""
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
2766,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0
4017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
276165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# (number of users, number of distinct titles)
user_item_matrix.shape

(602, 585)

🔹 Sparse Matrix

* In the User–Item Matrix, most values are 0 because users rate only a few books.

* A dense matrix would be huge and waste memory.

* A sparse matrix stores only the non-zero values.

* It reduces memory usage and speeds up calculations (e.g., cosine similarity, matrix factorization).

* This makes collaborative filtering possible on large datasets.

In [None]:

# Compress the dense user x title array into CSR form; zero cells are not stored.
sparse_user_item = csr_matrix(user_item_matrix.to_numpy())

In [18]:
# The repr reports dtype, number of stored (non-zero) elements and shape.
sparse_user_item

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 11895 stored elements and shape (602, 585)>