In [1]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Data importing: read the three raw CSV tables.
# Every file is parsed the same way: ';' as separator, latin-1 encoding,
# and lines the parser cannot handle are skipped.
read_options = {"sep": ";", "on_bad_lines": "skip", "encoding": "latin-1"}
books = pd.read_csv("../data/BX-Books.csv", **read_options)
ratings = pd.read_csv("../data/BX-Book-Ratings.csv", **read_options)
users = pd.read_csv("../data/BX-Users.csv", **read_options)

In [3]:
# Report the (rows, columns) of each loaded table, one table per line.
print(
    f"books shape: {books.shape}",
    f"ratings shape: {ratings.shape}",
    f"users shape: {users.shape}",
    sep="\n",
)

books shape: (271360, 8)
ratings shape: (1149780, 3)
users shape: (278858, 3)


In [4]:
# Preview the first rows of the raw books table.
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
# Drop the small and large cover-image URLs (the medium one is kept and becomes
# `img_url`), then give the remaining columns short snake_case names.
books = (
    books
    .drop(columns=['Image-URL-S', 'Image-URL-L'])
    .rename(columns={
        'ISBN': 'isbn',
        'Book-Title': 'title',
        'Book-Author': 'author',
        'Year-Of-Publication': 'year',
        'Publisher': 'publisher',
        "Image-URL-M": "img_url",
    })
)

In [6]:
# Preview the first rows of the raw ratings table.
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
# Rename the ratings columns to snake_case (no column is dropped here).
ratings = ratings.rename(
    columns={
        'User-ID': 'user_id',
        'ISBN': 'isbn',
        'Book-Rating': 'book_rating',
    }
)

In [8]:
# Preview the first rows of the raw users table.
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [9]:
# Rename the users columns to snake_case.
users = users.rename(
    columns={
        'User-ID': 'user_id',
        'Location': 'location',
        'Age': 'user_age',
    }
)

# 📊 Data Validation in Book Recommendation System

## Tables
- **BX-Books**: book metadata (`ISBN`, `Title`, `Author`, …)  
- **BX-Users**: user metadata (`User-ID`, `Location`, `Age`)  
- **BX-Ratings**: ratings (`User-ID`, `ISBN`, `Book-Rating`)  

## Relationships
- `BX-Ratings` links **users** and **books**.  
- One user → many books.  
- One book → many users.  
- Users and books are unique in their own tables.  

## Validation Steps
1. **ID consistency**: all `User-ID` and `ISBN` in ratings must exist in users/books.  
2. **Duplicates**: remove duplicate `(User-ID, ISBN)` pairs.  
3. **Ratings check**: decide how to handle `0` (implicit feedback).  
4. **Activity filter**: drop users/books with very few ratings.  
5. **Row counts**: verify rows before and after merges.  

## Cases
- ✅ Many users rate the same book → expected.  
- ✅ Many books rated by one user → expected.  
- ❌ Same user rating the same book multiple times.  
- ❌ Ratings referencing missing users or books.  


## Consistency Check

In [10]:
# Consistency check: discard ratings whose user is missing from BX-Users.
has_known_user = ratings["user_id"].isin(users["user_id"])
ratings = ratings.loc[has_known_user]
ratings.shape  # row count is unchanged on this data: no ratings were removed

(1149780, 3)

In [11]:
# Consistency check: discard ratings whose ISBN is missing from BX-Books.
has_known_book = ratings["isbn"].isin(books["isbn"])
ratings = ratings.loc[has_known_book]
ratings.shape  # 118,644 ratings were removed on this data

(1031136, 3)

118,644 ratings were removed because their ISBN has no matching book in BX-Books.

## Remove Duplicates

In [12]:
# Keep one rating per (user, book) pair; when a pair repeats, the first row is kept.
is_repeated_pair = ratings.duplicated(subset=["user_id", "isbn"])
ratings = ratings.loc[~is_repeated_pair]
ratings.shape


(1031136, 3)

## Merging Tables

In [13]:
# Attach user and book metadata to every rating.
# Ratings without a matching user or book were removed in the consistency checks,
# so the inner joins are expected to keep every rating exactly once.
n_ratings_before_merge = len(ratings)
df = (
    ratings
    .merge(users, on="user_id", how="inner")
    .merge(books, on="isbn", how="inner")
)

# Row-count validation: a smaller result means ratings lost their metadata match,
# a larger one means duplicated user_id / isbn keys multiplied rows.
assert len(df) == n_ratings_before_merge, (
    f"merge changed the number of ratings: {n_ratings_before_merge} -> {len(df)}"
)

print("Final merged dataset size:", df.shape)
print("\n \n Sample book ratings:\n", df[df["isbn"] == df["isbn"].iloc[0]][["isbn", "book_rating"]].head())


Final merged dataset size: (1031136, 10)

 
 Sample book ratings:
              isbn  book_rating
0      034545104X            0
13561  034545104X            5
23643  034545104X            0
31455  034545104X            5
35641  034545104X            9


In [14]:
# Preview the first rows of the merged ratings + users + books table.
df.head()

Unnamed: 0,user_id,isbn,book_rating,location,user_age,title,author,year,publisher,img_url
0,276725,034545104X,0,"tyler, texas, usa",,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,"seattle, washington, usa",,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,"h, new south wales, australia",16.0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,"rijeka, n/a, croatia",16.0,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,"rijeka, n/a, croatia",16.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...


## Reduce dataset size

In [15]:
# Column dtypes, non-null counts and memory usage before downcasting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031136 entries, 0 to 1031135
Data columns (total 10 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   user_id      1031136 non-null  int64  
 1   isbn         1031136 non-null  object 
 2   book_rating  1031136 non-null  int64  
 3   location     1031136 non-null  object 
 4   user_age     753301 non-null   float64
 5   title        1031136 non-null  object 
 6   author       1031134 non-null  object 
 7   year         1031136 non-null  object 
 8   publisher    1031134 non-null  object 
 9   img_url      1031136 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 78.7+ MB


In [16]:
# Shrink each numeric column to the smallest dtype that can hold its values
# (on this data: user_id -> int32, book_rating -> int8, user_age -> float32).
for column, target_kind in (
    ("user_id", "integer"),
    ("book_rating", "integer"),
    ("user_age", "float"),
):
    df[column] = pd.to_numeric(df[column], downcast=target_kind)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031136 entries, 0 to 1031135
Data columns (total 10 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   user_id      1031136 non-null  int32  
 1   isbn         1031136 non-null  object 
 2   book_rating  1031136 non-null  int8   
 3   location     1031136 non-null  object 
 4   user_age     753301 non-null   float32
 5   title        1031136 non-null  object 
 6   author       1031134 non-null  object 
 7   year         1031136 non-null  object 
 8   publisher    1031134 non-null  object 
 9   img_url      1031136 non-null  object 
dtypes: float32(1), int32(1), int8(1), object(7)
memory usage: 63.9+ MB


# Dataset Filtering

# 📊 Data Filtering Criteria for Book Recommendation System

## Goal
The purpose of filtering is to reduce noise, handle invalid entries, and ensure that both users and books have enough interactions for meaningful recommendations. This improves data quality, model performance, and computational efficiency.

## Filtering Criteria

1. **Explicit Ratings Only**  
   - Remove ratings equal to `0` (implicit feedback).  
   - Keep only ratings greater than `0`.  
2. **Age Validation**  
   - Keep user ages within the range **10–100**.  
   - Removes unrealistic or missing values.  
3. **Year Validation**  
   - Keep publication years between **1500 and the current year**.  
   - Removes invalid or placeholder values.  
4. **Minimum Ratings per User**  
   - Keep only users with **more than 10 ratings** (the threshold applied by the filtering code below).  
   - Ensures each user profile is informative.  
5. **Minimum Ratings per Book**  
   - Keep only books with **more than 10 ratings** (the threshold applied by the filtering code below).  
   - Ensures books have enough feedback for collaborative filtering.  





## Outcome
- A cleaner, denser dataset.  
- Balanced representation of users and books.  
- Reduced sparsity and improved model training.  


In [17]:
# -----------------------------
# 1. Explicit ratings only
# -----------------------------
# A rating of 0 is implicit feedback; only strictly positive ratings are kept.
is_explicit = df["book_rating"].gt(0)
df = df.loc[is_explicit]

In [18]:
# Summary statistics of user age, used to spot unrealistic values before filtering.
df['user_age'].describe()

count    269621.000000
mean         36.835831
std          13.753045
min           0.000000
25%          28.000000
50%          35.000000
75%          45.000000
max         244.000000
Name: user_age, dtype: float64

In [19]:
# -----------------------------
# 2. Age validation
# -----------------------------
# Keep ages in the closed range [10, 100]; rows with a missing age fail the
# test and are dropped as well.
has_valid_age = df['user_age'].between(10, 100)
df = df[has_valid_age]


In [20]:
# `year` is stored as text (object dtype); convert it to numbers, turning any
# value that cannot be parsed into NaN.
# `assign` builds a new frame instead of writing a column into `df`, which at
# this point is the result of boolean filtering. Writing into it directly makes
# pandas emit a SettingWithCopyWarning, which the global warnings filter hides.
df = df.assign(year=pd.to_numeric(df["year"], errors="coerce"))
df['year'].describe()

count    267239.000000
mean       1965.349739
std         244.586773
min           0.000000
25%        1992.000000
50%        1998.000000
75%        2001.000000
max        2050.000000
Name: year, dtype: float64

In [21]:
# -----------------------------
# 3. Year validation
# -----------------------------
# Inclusive bounds for a plausible publication year.
# NOTE(review): the filtering criteria describe the upper bound as "the current
# year"; it is pinned to 2025 here so re-runs stay reproducible — confirm.
MIN_YEAR, MAX_YEAR = 1500, 2025

# Convert defensively so this cell also works when `year` is still text, and use
# `assign` rather than writing a column into the already-filtered frame.
df = df.assign(year=pd.to_numeric(df["year"], errors="coerce"))

# Rows whose year is NaN (unparsable) fail the range test and are dropped too.
df = df[df["year"].between(MIN_YEAR, MAX_YEAR)]


### Iterative Filtering

When filtering our dataset, we face the classic **chicken-or-egg problem** 🐔🥚:  

- If we filter users first, later removing books may drop their ratings.  
- If we filter books first, later removing users may also drop ratings.  

👉 To solve this, we use **iterative filtering**:  
1. Remove users with 10 or fewer ratings.  
2. Remove books with 10 or fewer ratings.  
3. Repeat until no more users or books are removed.  

This looping ensures that in the final dataset:  
- Every user has enough ratings.  
- Every book has enough ratings.  

No matter who came first, chicken or egg, both survive in balance 🐓📚.


In [22]:
# Minimum number of ratings a user / book needs in order to stay in the dataset.
MIN_USER_RATINGS = 11
MIN_BOOK_RATINGS = 11


def filter_sparse_users_and_books(
    frame: pd.DataFrame,
    min_user_ratings: int,
    min_book_ratings: int,
) -> pd.DataFrame:
    """Repeatedly drop low-activity users and books until the result is stable.

    Each pass first removes users with fewer than ``min_user_ratings`` rows, then
    books (by ``isbn``) with fewer than ``min_book_ratings`` rows. Removing books
    can push users back under their threshold, so passes repeat until the book
    step removes nothing; at that point both thresholds hold at once.

    Parameters
    ----------
    frame : DataFrame with ``user_id`` and ``isbn`` columns, one row per rating.
    min_user_ratings : minimum number of rows a user must have to be kept.
    min_book_ratings : minimum number of rows a book must have to be kept.

    Returns
    -------
    The filtered DataFrame. An empty input is returned unchanged.

    One progress line is printed per pass: the row count after the user step
    ("Current shape") and after the book step ("New shape").
    """
    rows_after_user_step = frame.shape[0]
    rows_after_book_step = 0
    while rows_after_user_step != rows_after_book_step:
        # -----------------------------
        # 4. Minimum ratings per user
        # -----------------------------
        user_counts = frame["user_id"].value_counts()
        active_users = user_counts[user_counts >= min_user_ratings].index
        frame = frame[frame["user_id"].isin(active_users)]
        rows_after_user_step = frame.shape[0]
        # -----------------------------
        # 5. Minimum ratings per book
        # -----------------------------
        book_counts = frame["isbn"].value_counts()
        popular_books = book_counts[book_counts >= min_book_ratings].index
        frame = frame[frame["isbn"].isin(popular_books)]
        rows_after_book_step = frame.shape[0]
        print(f"Current shape: {rows_after_user_step}, New shape: {rows_after_book_step}")
    return frame


df = filter_sparse_users_and_books(df, MIN_USER_RATINGS, MIN_BOOK_RATINGS)

Current shape: 185797, New shape: 39101
Current shape: 26406, New shape: 20209
Current shape: 17547, New shape: 15652
Current shape: 14524, New shape: 13749
Current shape: 13298, New shape: 12885
Current shape: 12559, New shape: 12373
Current shape: 12254, New shape: 12154
Current shape: 12064, New shape: 12004
Current shape: 11974, New shape: 11944
Current shape: 11924, New shape: 11924


### Adding number of ratings feature

In [23]:
# Number of ratings per title, counted as the number of distinct users who rated it.
# Counting rows instead would count a user twice when they rated two ISBNs that
# share a title, while the saved dataset keeps a single row per (user_id, title)
# pair — so a row count would overstate `num_of_rating` for those titles.
num_rating = (
    df.groupby('title')['user_id']
    .nunique()
    .rename("num_of_rating")
    .reset_index()
)
num_rating

Unnamed: 0,title,num_of_rating
0,1984,29
1,1st to Die: A Novel,42
2,2010: Odyssey Two,11
3,2nd Chance,37
4,4 Blondes,11
...,...,...
580,"Wizard and Glass (The Dark Tower, Book 4)",15
581,"Word Freak: Heartbreak, Triumph, Genius, and O...",18
582,Year of Wonders,16
583,Zen and the Art of Motorcycle Maintenance: An ...,13


In [24]:
# Attach the per-title rating count to every rating row. `num_rating` holds one
# row per title, so this is a many-to-one join; `validate` makes pandas raise a
# MergeError if a title ever appears more than once on the right side.
df = df.merge(num_rating, on="title", how='inner', validate="many_to_one")

## Data Saving

In [25]:
# Final schema check before saving: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11924 entries, 0 to 11923
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        11924 non-null  int32  
 1   isbn           11924 non-null  object 
 2   book_rating    11924 non-null  int8   
 3   location       11924 non-null  object 
 4   user_age       11924 non-null  float32
 5   title          11924 non-null  object 
 6   author         11924 non-null  object 
 7   year           11924 non-null  float64
 8   publisher      11924 non-null  object 
 9   img_url        11924 non-null  object 
 10  num_of_rating  11924 non-null  int64  
dtypes: float32(1), float64(1), int32(1), int64(1), int8(1), object(6)
memory usage: 850.2+ KB


In [26]:
# Keep one row per (user, title) pair; when a pair repeats, the first row is kept.
df = df.drop_duplicates(subset=['user_id', 'title'])
df.shape

(11895, 11)

In [27]:
# Persist the cleaned ratings table; index=False keeps the row index out of the file.
df.to_csv("../data/cleaned_book_ratings.csv", index=False)