In [1]:
# import libraries
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt


In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'


--2023-12-08 06:38:47--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2023-12-08 06:38:48 (189 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
# Both files share the same reader settings: ';'-separated, ISO-8859-1
# encoded, first line treated as the header row.
bx_csv_options = dict(encoding="ISO-8859-1", sep=";", header=0)

# One column -> dtype mapping per file; its keys double as the column names
# to assign and the columns to keep.
book_dtypes = {'isbn': 'str', 'title': 'str', 'author': 'str'}
rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

df_books = pd.read_csv(
    books_filename,
    names=list(book_dtypes),
    usecols=list(book_dtypes),
    dtype=book_dtypes,
    **bx_csv_options)

df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
    **bx_csv_options)


In [4]:
# show the first two rows of each raw frame, one frame after the other
print(df_books.head(2), df_ratings.head(2), sep="\n")


         isbn                title                author
0  0195153448  Classical Mythology    Mark P. O. Morford
1  0002005018         Clara Callan  Richard Bruce Wright
     user        isbn  rating
0  276725  034545104X     0.0
1  276726  0155061224     5.0


In [5]:
# keep only the books that have at least 100 rating rows
# after count(), the "rating" column holds the number of rating rows per isbn
df_books_copy = df_ratings.groupby("isbn").count().reset_index()
has_enough_ratings = df_books_copy["rating"] >= 100
popular_isbns = df_books_copy.loc[has_enough_ratings, "isbn"]

# restrict the book catalogue to those isbns
highly_rated_books = df_books.loc[df_books["isbn"].isin(popular_isbns)]
print(highly_rated_books.head(2))


          isbn                                 title         author
18  0440234743                         The Testament   John Grisham
19  0452264464  Beloved (Plume Contemporary Fiction)  Toni Morrison


In [6]:
# keep only ratings written by highly active users (at least 200 rating rows)
df_users_copy = df_ratings[["user", "rating"]].groupby("user").count().reset_index()

highly_active_users = df_users_copy.loc[df_users_copy["rating"] >= 200, "user"]

# a rating row survives when its user is highly active AND its book is one
# of the isbns listed in highly_rated_books
from_active_user = df_ratings["user"].isin(highly_active_users)
about_kept_book = df_ratings["isbn"].isin(highly_rated_books["isbn"])
df_filtered = df_ratings.loc[from_active_user & about_kept_book]
print(df_filtered.head(2))


        user        isbn  rating
1456  277427  002542730X    10.0
1469  277427  0060930535     0.0


In [7]:
# convert df_filtered to scipy sparse matrix
# wide table: one row per isbn, one column per user, cell = that user's
# rating of that book; (book, user) pairs with no rating row are filled with 0
df_book_features = (
    df_filtered
    .pivot(index='isbn', columns='user', values='rating')
    .fillna(0)
)

# same values in compressed sparse row form
matrix_book_features = csr_matrix(df_book_features.values)


In [8]:
# peek at the first two rows of the pivoted table (isbn index, one column per user)
print(df_book_features.head(2))


user        254     2276    2766    2977    3363    4017    4385    6242    \
isbn                                                                         
002542730X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060008032     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

user        6251    6323    ...  274004  274061  274301  274308  274808  \
isbn                        ...                                           
002542730X     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060008032     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   

user        275970  277427  277478  277639  278418  
isbn                                                
002542730X     0.0    10.0     0.0     0.0     0.0  
0060008032     0.0     0.0     0.0     0.0     0.0  

[2 rows x 888 columns]


In [9]:
# build the model
# Nearest-neighbour search using the cosine metric. The matrix it is fitted
# on comes from the isbn-indexed pivot, so each fitted row is one book and
# the neighbours it returns are row positions of other books.
model = NearestNeighbors(metric='cosine')
model.fit(matrix_book_features)


In [10]:
# function to return recommended books - this will be tested
def get_recommends(book = "", n_recommends = 5):
    """Recommend books whose user-rating vectors are nearest to ``book``'s.

    Args:
        book: Exact title to look up in ``highly_rated_books``.
        n_recommends: Maximum number of neighbours to return. The default
            of 5 matches the previously hard-coded ``k = 6`` (query + 5).

    Returns:
        ``[book, recommended]`` where ``recommended`` is a list of
        ``[title, distance]`` pairs ordered from the farthest to the nearest
        neighbour, with distances as reported by ``model.kneighbors``.
        ``recommended`` is empty when the title is unknown, when none of its
        isbns has a row in ``df_book_features``, or when ``n_recommends`` is
        not positive.
    """
    # A boolean-mask lookup never raises KeyError: an unknown title simply
    # selects nothing, so emptiness has to be tested explicitly.
    candidate_isbns = highly_rated_books.loc[highly_rated_books["title"] == book, "isbn"]
    row_mask = df_book_features.index.isin(candidate_isbns)
    if n_recommends <= 0 or not row_mask.any():
        return [book, []]

    # Several isbns can share a title; query with the first matching row only
    # (the row whose neighbours were used before as well).
    query_pos = int(row_mask.argmax())
    query_point = df_book_features.iloc[[query_pos]].values

    # Ask for one extra neighbour because the query book is part of the
    # fitted data, but never for more rows than the model was fitted on.
    k = min(n_recommends + 1, df_book_features.shape[0])
    distance, index = model.kneighbors(query_point, n_neighbors = k)

    # Drop the query book by position instead of assuming it is ranked first;
    # with tied distances it may appear anywhere among the neighbours.
    neighbours = [
        (dist, pos)
        for dist, pos in zip(distance[0], index[0])
        if pos != query_pos
    ][:n_recommends]

    # Every isbn in df_book_features also occurs in highly_rated_books, so the
    # small frame is enough for title lookup; keep the first title per isbn.
    titles = highly_rated_books.drop_duplicates("isbn").set_index("isbn")["title"]

    recommended = [
        [titles.at[df_book_features.index[pos]], dist]
        for dist, pos in neighbours
    ]
    # kneighbors returns nearest first; the expected output is farthest first
    return [book, recommended[::-1]]

get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")


['The Queen of the Damned (Vampire Chronicles (Paperback))',
 [['Catch 22', 0.7939835],
  ['The Witching Hour (Lives of the Mayfair Witches)', 0.74486566],
  ['Interview with the Vampire', 0.73450685],
  ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.53763384],
  ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.51784116]]]

In [11]:
# test code
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """Print whether get_recommends matches the reference result.

  The returned title must equal the query title, and each of the first two
  recommendations must be one of the expected titles with a distance less
  than 0.05 away from the expected distance at the same position.
  """
  query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]

  recommends = get_recommends(query_title)

  # gather every individual check, then require all of them to hold
  checks = [recommends[0] == query_title]
  for rank in (0, 1):
    entry = recommends[1][rank]
    checks.append(entry[0] in recommended_books)
    checks.append(not abs(entry[1] - recommended_books_dist[rank]) >= 0.05)

  if all(checks):
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()


["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016211], ['The Weight of Water', 0.77085835], ['The Surgeon', 0.7699411], ['I Know This Much Is True', 0.7677075], ['The Lovely Bones: A Novel', 0.7234864]]]
You passed the challenge! 🎉🎉🎉🎉🎉
