# Collaborative Filtering with Book-Crossing Dataset 

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data Preparation
## Load the Book-Crossing Dataset

In [9]:
# Ratings are read directly from the goodbooks-10k repository on GitHub.
# Point `url` at another ratings CSV (e.g. Book-Crossing) to swap datasets;
# the later cells read the columns user_id, book_id and rating.
url = "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv"
data = pd.read_csv(url)

# Preview the first rows and report the (rows, columns) dimensions.
ratings_preview = data.head()
ratings_shape = data.shape
print("Dataset Sample:\n", ratings_preview)
print("\nDataset Shape:", ratings_shape)

Dataset Sample:
    user_id  book_id  rating
0        1      258       5
1        2     4081       4
2        2      260       5
3        2     9296       5
4        2     2318       3

Dataset Shape: (5976479, 3)


In [10]:
 ## Preprocess the Dataset

In [11]:
# Report how many distinct users and books the ratings table contains
# before any filtering.
print("\nUnique Users:", data['user_id'].nunique())
print("Unique Books:", data['book_id'].nunique())

# Minimum number of ratings a user / a book must have to be kept.
MIN_USER_RATINGS = 5
MIN_BOOK_RATINGS = 5

# DataFrame.pivot raises ValueError when an (index, column) pair occurs more
# than once, so collapse repeated (user_id, book_id) rows first; the last
# occurrence in row order is the one kept.
data = data.drop_duplicates(subset=["user_id", "book_id"], keep="last")

# Drop users with too few ratings, then books with too few ratings.
# NOTE: each filter is applied once, in this order, so removing sparse books
# can leave some users below MIN_USER_RATINGS again.
user_ratings = data.groupby("user_id").size()
active_users = user_ratings[user_ratings >= MIN_USER_RATINGS].index
data = data[data["user_id"].isin(active_users)]

book_ratings = data.groupby("book_id").size()
popular_books = book_ratings[book_ratings >= MIN_BOOK_RATINGS].index
data = data[data["book_id"].isin(popular_books)]

# Create a user-item matrix: one row per user, one column per book, each cell
# holding that user's rating. (user, book) pairs with no rating are filled
# with 0.
user_item_matrix = data.pivot(index='user_id', columns='book_id', values='rating').fillna(0)

print("\nUser-Item Matrix Shape:", user_item_matrix.shape)


Unique Users: 53424
Unique Books: 10000

User-Item Matrix Shape: (53424, 10000)


In [12]:
## Build the Collaborative Filtering Model

In [None]:
# Pairwise cosine similarity between users: every row of the user-item matrix
# is compared with every other row, giving a square users x users result.
# NOTE(review): the matrix built earlier is reported as 53424 users wide, so
# the result has 53424 x 53424 entries (roughly 23 GB if stored at 8 bytes per
# entry) -- confirm this fits in memory before running the cell.
user_ids = user_item_matrix.index
user_similarity = cosine_similarity(user_item_matrix)

# Label both axes with the user ids so similarities can be looked up by id.
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_ids,
    columns=user_ids,
)

print("\nUser Similarity Matrix Sample:\n", user_similarity_df.head())