### Imports

In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-04-05 18:26:26--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.2’


2024-04-05 18:26:26 (190 MB/s) - ‘book-crossings.zip.2’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


# Book Recommendation Engine using KNN

In this notebook, we will use a KNN model to recommend books similar to a given title, using the [Book-Crossing Dataset](https://cdn.freecodecamp.org/project-data/books/book-crossings.zip). Let's start by reading the data and taking a look at it.

In [3]:
# import csv data into dataframes
def _read_bx_table(path, columns, dtypes):
    """Read one ';'-separated Book-Crossing CSV, keeping only `columns`.

    The file is decoded as ISO-8859-1. Its first line is consumed as the
    header row (header=0) and the column labels are replaced by `columns`.
    """
    return pd.read_csv(
        path,
        encoding="ISO-8859-1",
        sep=";",
        header=0,
        names=columns,
        usecols=columns,
        dtype=dtypes,
    )


df_books = _read_bx_table(
    books_filename,
    ['isbn', 'title', 'author'],
    {'isbn': 'str', 'title': 'str', 'author': 'str'},
)

df_ratings = _read_bx_table(
    ratings_filename,
    ['user', 'isbn', 'rating'],
    {'user': 'int32', 'isbn': 'str', 'rating': 'float32'},
)

In [4]:
# Preview the books table (isbn, title, author) as loaded from the CSV.
df_books

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger
271375,0525447644,From One to One Hundred,Teri Sloat
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271377,0192126040,Republic (World's Classics),Plato


In [5]:
# Preview the ratings table (user, isbn, rating) as loaded from the CSV.
df_ratings

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0
...,...,...,...
1149775,276704,1563526298,9.0
1149776,276706,0679447156,0.0
1149777,276709,0515107662,10.0
1149778,276721,0590442449,10.0


As described in the problem, we need to remove users who have fewer than $200$ ratings and books with fewer than $100$ ratings.

In [18]:
# Number of rating rows per ISBN, most-rated first.
df_ratings['isbn'].value_counts()

isbn
0971880107    365
0316666343    272
0060928336    221
0440214041    218
0385504209    217
             ... 
0441011993      1
0441011985      1
0441011969      1
0441011934      1
9626344990      1
Name: count, Length: 207699, dtype: int64

In [22]:
# Keep only the books whose ISBN appears in at least 100 rating rows.
# NOTE(review): the execution counts show this cell was last run *after* the
# user filter further down, so the stored outputs were counted over the
# already user-filtered ratings. On a fresh top-to-bottom run the counts come
# from the unfiltered ratings instead -- confirm which one is intended.
isbn_counts = df_ratings['isbn'].value_counts()
isbn_to_keep = isbn_counts.loc[isbn_counts.ge(100)].index
df_books = df_books.loc[df_books['isbn'].isin(isbn_to_keep)]

In [23]:
# Books left after keeping only ISBNs with at least 100 ratings.
df_books

Unnamed: 0,isbn,title,author
18,0440234743,The Testament,John Grisham
26,0971880107,Wild Animus,Rich Shapero
37,0446310786,To Kill a Mockingbird,Harper Lee
52,0440225701,The Street Lawyer,JOHN GRISHAM
67,0804106304,The Joy Luck Club,Amy Tan
...,...,...,...
5887,0671001795,Two for the Dough,Janet Evanovich
6196,0312983271,Full House (Janet Evanovich's Full Series),Janet Evanovich
7500,0440222656,The Horse Whisperer,Nicholas Evans
7852,0553280341,B Is for Burglar (Kinsey Millhone Mysteries (P...,Sue Grafton


In [10]:
# Keep only the ratings made by users who have at least 200 rating rows.
user_counts = df_ratings['user'].value_counts()
users_to_keep = user_counts.loc[user_counts.ge(200)].index
df_ratings = df_ratings.loc[df_ratings['user'].isin(users_to_keep)]

In [15]:
# Ratings left after keeping only users with at least 200 ratings.
df_ratings

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1457,277427,0026217457,0.0
1458,277427,003008685X,8.0
1459,277427,0030615321,0.0
1460,277427,0060002050,0.0
...,...,...,...
1147612,275970,3829021860,0.0
1147613,275970,4770019572,0.0
1147614,275970,896086097,0.0
1147615,275970,9626340762,8.0


As we can see, the ratings dataframe contains the users' ratings for each book. Let's merge it with the books dataframe to summarize all the data together.

In [25]:
# Attach each remaining rating to its book record (inner join on ISBN).
merged_df = df_books.merge(df_ratings, on='isbn')

# Total of the ratings each book received, one row per (isbn, title, author).
# Despite its name, mean_df holds *sums* at this point; a later cell divides
# them by the number of ratings to obtain the means.
mean_df = (
    merged_df
    .groupby(['isbn', 'title', 'author'])['rating']
    .sum()
    .reset_index()
)

In [26]:
# Per-book rating totals (the 'rating' column holds sums at this point).
mean_df

Unnamed: 0,isbn,title,author,rating
0,0060502258,The Divine Secrets of the Ya-Ya Sisterhood: A ...,Rebecca Wells,302.0
1,0060928336,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,422.0
2,0060930535,The Poisonwood Bible: A Novel,Barbara Kingsolver,187.0
3,0060934417,Bel Canto: A Novel,Ann Patchett,300.0
4,0060976845,Little Altars Everywhere: A Novel,Rebecca Wells,282.0
...,...,...,...,...
94,080410526X,All I Really Need to Know,ROBERT FULGHUM,216.0
95,0804106304,The Joy Luck Club,Amy Tan,318.0
96,080410753X,The Kitchen God's Wife,Amy Tan,158.0
97,0971880107,Wild Animus,Rich Shapero,159.0


In [27]:
# Number of rating rows each ISBN has in df_ratings.
counts_of_each_isbn = df_ratings['isbn'].value_counts()

def row_mean(row):
    """Return the mean rating for one book row.

    `row` must provide 'isbn' and 'rating', where 'rating' is the book's
    summed rating. The sum is divided by that ISBN's entry in
    `counts_of_each_isbn`.
    """
    return row['rating'] / counts_of_each_isbn[row['isbn']]

# Rebuild the per-book totals from merged_df and derive the means from them,
# instead of overwriting mean_df['rating'] with a function of itself.
# The in-place version divided the column again on every re-run of this
# cell; this version gives the same mean_df however often it is executed.
rating_sums = (
    merged_df
    .groupby(['isbn', 'title', 'author'])['rating']
    .sum()
    .reset_index()
)
mean_df = rating_sums.assign(rating=rating_sums.apply(row_mean, axis=1))

In [30]:
# Summary statistics of the per-book mean rating.
mean_df.describe()

Unnamed: 0,rating
count,99.0
mean,2.100619
std,0.651646
min,0.435616
25%,1.671657
50%,1.947368
75%,2.484047
max,4.262136


In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  """Return the titles rated most similarly to `book`.

  A title x user rating matrix is built from the notebook-level `df_books`
  and `df_ratings` frames (as filtered by the cells above), with missing
  ratings filled with 0. A cosine-distance nearest-neighbour search is then
  run for the row belonging to `book`.

  Parameters
  ----------
  book : str
      Exact title to look up.

  Returns
  -------
  list
      `[book, [[title, distance], ...]]` with up to five neighbours, ordered
      from the most distant to the closest. This is the shape the test cell
      reads (`recommends[0]`, `recommends[1][i][0]`, `recommends[1][i][1]`).

  Raises
  ------
  ValueError
      If `book` is not among the titles left after filtering.
  """
  # Inner join: only ratings whose ISBN survived the book filter remain.
  rated = pd.merge(df_books, df_ratings, on='isbn')
  # Several ISBNs can share one title; pivot needs unique (title, user) pairs.
  rated = rated.drop_duplicates(['title', 'user'])
  title_user = rated.pivot(index='title', columns='user', values='rating').fillna(0)

  if book not in title_user.index:
    raise ValueError(f"No ratings available for title: {book!r}")

  # Ask for one extra neighbour because the query row is part of the fitted
  # data and is normally returned as its own nearest neighbour.
  n_neighbors = min(6, len(title_user))
  knn = NearestNeighbors(metric='cosine', algorithm='brute')
  knn.fit(title_user.values)
  distances, indices = knn.kneighbors(title_user.loc[[book]].values, n_neighbors=n_neighbors)

  # Drop the query title by name (not by position) so ties at distance 0
  # cannot leave it in the result; cap at five neighbours.
  neighbours = [
    [title_user.index[idx], float(dist)]
    for dist, idx in zip(distances[0], indices[0])
    if title_user.index[idx] != book
  ][:5]

  # kneighbors returns closest-first; the expected output lists the most
  # distant neighbour first.
  recommended_books = [book, neighbours[::-1]]
  return recommended_books

In [None]:
# Example lookup: print the recommendations for one title.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """Check get_recommends() against the expected result for one title.

  The first element must echo the queried title. The first two recommended
  entries must each have a title from the expected list and a distance
  within 0.05 of the expected value at the same position. Prints a pass or
  fail message; returns nothing.
  """
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  problems = []

  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    problems.append("queried title")

  expected_titles = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  expected_dists = [0.8, 0.77, 0.77, 0.77]

  # Only the first two recommendations are checked.
  for position in range(2):
    entry = recommends[1][position]
    if entry[0] not in expected_titles:
      problems.append("title")
    if abs(entry[1] - expected_dists[position]) >= 0.05:
      problems.append("distance")

  if not problems:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()