In [7]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [None]:
# Download dataset
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
!unzip book-crossings.zip

# Load dataset
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-03-26 18:30:55--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 104.26.3.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.1’


2025-03-26 18:30:55 (184 MB/s) - ‘book-crossings.zip.1’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Reader options common to both files: Latin-1 text, ';' separated, and a
# header row (row 0) that is discarded in favour of the names given below.
csv_options = {"encoding": "ISO-8859-1", "sep": ";", "header": 0}

# Column name -> dtype for each table; the keys double as the column list.
book_dtypes = {'isbn': 'str', 'title': 'str', 'author': 'str'}
rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

# Book catalogue: one row per ISBN with its title and author.
df_books = pd.read_csv(
    books_filename,
    names=list(book_dtypes),
    usecols=list(book_dtypes),
    dtype=book_dtypes,
    **csv_options,
)

# Ratings: one row per (user, isbn) rating.
df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
    **csv_options,
)


In [4]:
# Data Cleaning: Remove books with <100 ratings and users with <200 ratings
MIN_BOOK_RATINGS = 100
MIN_USER_RATINGS = 200

# Both thresholds are measured on the full ratings table and applied together.
# Counting users only after the book filter would require 200 ratings *among
# popular books*, which is a much stricter rule than the one stated above.
book_count = df_ratings.groupby('isbn')['rating'].count()
user_count = df_ratings.groupby('user')['rating'].count()
popular_isbns = book_count[book_count >= MIN_BOOK_RATINGS].index
active_users = user_count[user_count >= MIN_USER_RATINGS].index

ratings = df_ratings[
    df_ratings.isbn.isin(df_books.isbn)  # only books we can map back to a title
    & df_ratings.isbn.isin(popular_isbns)
    & df_ratings.user.isin(active_users)
]
# pivot() raises ValueError on repeated (isbn, user) pairs; keep the first one.
ratings = ratings.drop_duplicates(subset=['isbn', 'user'])

# Create book-user matrix: one row per ISBN, one column per user, 0 = unrated
book_rating = ratings.pivot(index='isbn', columns='user', values='rating').fillna(0)
book_rating_matrix = csr_matrix(book_rating.values)

In [5]:
# Fit a brute-force nearest-neighbour index over the book rows (cosine distance)
model = NearestNeighbors(algorithm='brute', metric='cosine').fit(book_rating_matrix)

# Function to return recommended books - this will be tested
def get_recommends(book="", n_recommendations=5):
    """Recommend books whose rating patterns are closest to ``book``.

    Args:
        book: Exact title to look up in ``df_books['title']``.
        n_recommendations: Maximum number of recommendations to return
            (default 5, i.e. the original six-neighbour query minus the
            query book itself).

    Returns:
        ``[book, [[title, distance], ...]]`` with the neighbours kept in the
        order ``model.kneighbors`` returned them, or ``None`` (after printing
        a message) when the title is unknown or none of its ISBNs is a row
        of ``book_rating``.
    """
    # A title may be catalogued under several ISBNs (editions). Consider all
    # of them rather than only the first catalogue entry.
    title_isbns = df_books.loc[df_books['title'] == book, 'isbn']
    if title_isbns.empty:
        print("The book is not in the database.")
        return

    # Keep only the editions that survived the rating filters.
    rated_isbns = title_isbns[title_isbns.isin(book_rating.index)]
    if rated_isbns.empty:
        print("The book does not have enough ratings.")
        return

    book_idx = book_rating.index.get_loc(rated_isbns.iloc[0])

    # Ask for one extra neighbour because the query row is part of the fitted
    # data, but never for more rows than exist (kneighbors would raise).
    n_neighbors = min(n_recommendations + 1, book_rating.shape[0])
    query = book_rating.iloc[book_idx, :].values.reshape(1, -1)
    distances, indices = model.kneighbors(query, n_neighbors=n_neighbors)

    # Drop the query book by position instead of assuming it is neighbour 0;
    # ties or an all-zero row can put another book first.
    neighbours = [
        (idx, dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx != book_idx
    ][:n_recommendations]

    # Resolve all recommended ISBNs to titles with a single catalogue scan,
    # taking the first title listed for each ISBN.
    rec_isbns = [book_rating.index[idx] for idx, _ in neighbours]
    rec_titles = (
        df_books.loc[df_books['isbn'].isin(rec_isbns)]
        .drop_duplicates('isbn')
        .set_index('isbn')['title']
    )

    recommendations = [
        [rec_titles.loc[isbn], dist]
        for isbn, (_, dist) in zip(rec_isbns, neighbours)
    ]
    return [book, recommendations]

# Smoke-test the recommender on one known title and show the raw result
sample_title = "The Queen of the Damned (Vampire Chronicles (Paperback))"
print(get_recommends(sample_title))
