### Imports

In [54]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-04-21 11:05:11--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.2’


2024-04-21 11:05:12 (272 MB/s) - ‘book-crossings.zip.2’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


# Book Recommendation Engine using KNN

In this notebook, we will use a KNN model to recommend books similar to a given title, using the [Book-Crossing Dataset](https://cdn.freecodecamp.org/project-data/books/book-crossings.zip). Let's start by reading the data and taking a look at it.

In [19]:
# Load the two CSV files into dataframes.
# Options shared by both reads: ';'-separated, ISO-8859-1 encoded, and
# header=0 so that the `names` passed below replace each file's header row.
csv_options = dict(encoding="ISO-8859-1", sep=";", header=0)

book_columns = ['isbn', 'title', 'author']
df_books = pd.read_csv(
    books_filename,
    names=book_columns,
    usecols=book_columns,
    dtype={column: 'str' for column in book_columns},
    **csv_options)

rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}
df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
    **csv_options)

In [20]:
# Preview the books table (columns: isbn, title, author).
df_books

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger
271375,0525447644,From One to One Hundred,Teri Sloat
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271377,0192126040,Republic (World's Classics),Plato


In [21]:
# Preview the ratings table (columns: user, isbn, rating).
df_ratings

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0
...,...,...,...
1149775,276704,1563526298,9.0
1149776,276706,0679447156,0.0
1149777,276709,0515107662,10.0
1149778,276721,0590442449,10.0


As described in the problem, we need to remove users who have fewer than $200$ ratings, and books with fewer than $100$ ratings.

In [22]:
# Number of ratings each ISBN received, before any filtering.
df_ratings['isbn'].value_counts()

isbn
0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
              ... 
0828903174       1
0553571001       1
0689822294       1
3257062273       1
325723161X       1
Name: count, Length: 340556, dtype: int64

In [23]:
# Keep only the ratings of books (ISBNs) that were rated at least 100 times.
isbn_counts = df_ratings['isbn'].value_counts()
isbn_to_keep = isbn_counts.index[(isbn_counts >= 100).to_numpy()]
df_ratings = df_ratings.loc[df_ratings['isbn'].isin(isbn_to_keep)]

In [24]:
# Re-check the per-ISBN rating counts now that rarely rated books are removed.
df_ratings['isbn'].value_counts()

isbn
0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
              ... 
0060916508     100
0425136981     100
0312966806     100
0385424736     100
0786866586     100
Name: count, Length: 731, dtype: int64

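Let's now remove users with fewer than $5$ ratings among the remaining books. Note that this threshold is lower than the $200$ stated above, and that it is applied after the book filter.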

In [25]:
# Keep only the ratings made by users who still have at least 5 ratings in df_ratings.
user_counts = df_ratings['user'].value_counts()
users_to_keep = user_counts.index[(user_counts >= 5).to_numpy()]
df_ratings = df_ratings.loc[df_ratings['user'].isin(users_to_keep)]

In [26]:
# Per-user rating counts after the user filter.
df_ratings['user'].value_counts()

user
11676     660
35859     348
16795     297
76352     291
23768     259
         ... 
276071      5
81333       5
81661       5
82720       5
82917       5
Name: count, Length: 5164, dtype: int64

As we can see, the ratings dataframe contains the individual user ratings for each book. Let's create another dataframe that combines the book information with its ratings.

In [27]:
# Attach every rating to its book's metadata (default inner join on isbn)
merged_df = df_books.merge(df_ratings, on='isbn')

# Total (sum) of the ratings per book; despite its name, mean_df holds sums here
book_keys = ['isbn', 'title', 'author']
mean_df = merged_df.groupby(book_keys, as_index=False)['rating'].sum()

In [28]:
# At this point 'rating' holds the SUM of each book's ratings, not yet the mean;
# it is divided by the number of ratings per book in a later cell.
mean_df

Unnamed: 0,isbn,title,author,rating
0,002542730X,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,362.0
1,0060008032,Angels,Marian Keyes,185.0
2,0060096195,The Boy Next Door,Meggin Cabot,272.0
3,006016848X,"Men Are from Mars, Women Are from Venus: A Pra...",John Gray,260.0
4,0060173289,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,253.0
...,...,...,...,...
722,1573227331,About a Boy,Nick Hornby,195.0
723,1573229326,How to Be Good,Nick Hornby,371.0
724,1573229571,About a Boy (Movie Tie-In),Nick Hornby,272.0
725,1592400876,"Eats, Shoots &amp; Leaves: The Zero Tolerance ...",Lynne Truss,374.0


In [29]:
# Number of ratings each ISBN has in the filtered ratings table
counts_of_each_isbn = df_ratings['isbn'].value_counts()

def row_mean(row):
    """Return the mean rating for one book row.

    `row` must provide 'isbn' and 'rating', where 'rating' is the SUM of the
    book's ratings. The sum is divided by that ISBN's entry in
    `counts_of_each_isbn`; an ISBN missing from the counts raises KeyError.
    """
    return row['rating'] / counts_of_each_isbn[row['isbn']]

# Recompute the per-book sums from merged_df instead of reading them back from
# mean_df['rating']: dividing that column in place would divide the already
# averaged values by the counts again if this cell were run a second time.
rating_sums = merged_df.groupby(['isbn', 'title', 'author'])['rating'].sum().reset_index()

# Replace the summed 'rating' column with the per-book mean
mean_df = rating_sums.assign(rating=rating_sums.apply(row_mean, axis=1))

In [31]:
# `df` is a second name for the same object as mean_df (plain assignment, no copy).
df = mean_df

In [32]:
# Summary statistics of the numeric column (the per-book mean rating).
df.describe()

Unnamed: 0,rating
count,727.0
mean,2.736318
std,0.77897
min,0.628178
25%,2.187683
50%,2.648
75%,3.213317
max,6.605634


In [33]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 727 entries, 0 to 726
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   isbn    727 non-null    object 
 1   title   727 non-null    object 
 2   author  727 non-null    object 
 3   rating  727 non-null    float64
dtypes: float64(1), object(3)
memory usage: 22.8+ KB


Now that our new dataset is clean, we have to create a KNN model that returns similar books based on their ratings.

In [78]:
# Single feature per book: its mean rating, as an (n_books, 1) column
X = df[['rating']].to_numpy()

# Standardize the feature: fit the scaler, then apply it to the same data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Fit the nearest-neighbour model on the standardized ratings
k = 6  # Number of neighbors to consider
model = NearestNeighbors(n_neighbors=k)
model.fit(X_scaled)

In [41]:
# Display the per-book table (isbn, title, author, mean rating).
df

Unnamed: 0,isbn,title,author,rating
0,002542730X,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,2.806202
1,0060008032,Angels,Marian Keyes,2.151163
2,0060096195,The Boy Next Door,Meggin Cabot,3.532468
3,006016848X,"Men Are from Mars, Women Are from Venus: A Pra...",John Gray,2.342342
4,0060173289,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,3.048193
...,...,...,...,...
722,1573227331,About a Boy,Nick Hornby,3.095238
723,1573229326,How to Be Good,Nick Hornby,2.748148
724,1573229571,About a Boy (Movie Tie-In),Nick Hornby,3.626667
725,1592400876,"Eats, Shoots &amp; Leaves: The Zero Tolerance ...",Lynne Truss,3.936842


In [82]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  """Return the books whose mean rating is closest to that of `book`.

  Parameters:
    book: exact title to look up in `df['title']`. When several rows share
      the title, the first one is used.

  Returns:
    A two-element list `[book, neighbours]`, where `neighbours` is a list of
    `[title, 1 - distance]` pairs ordered from nearest to farthest.
    `distance` is the value returned by `model.kneighbors` for the
    standardized rating. The list has one entry fewer than the number of
    neighbours the model returns, because the queried book is left out.

  Raises:
    IndexError: if no row of `df` has the given title.
  """
  title_positions = np.flatnonzero((df['title'] == book).to_numpy())
  if title_positions.size == 0:
    # Same exception type the failing `.iloc[0]` lookup raised, with a readable message
    raise IndexError(f"Book title not found in the dataset: {book!r}")
  book_position = title_positions[0]
  book_rating = df.iloc[book_position]['rating']

  # Standardize the input rating
  rating_scaled = scaler.transform([[book_rating]])
  # Find k nearest neighbors
  distances, indices = model.kneighbors(rating_scaled)
  distances, indices = distances[0], indices[0]

  # Leave out the queried book by its own row position. Dropping "the first
  # neighbour" is not reliable: other books with exactly the same mean rating
  # are also at distance 0 and may be listed before the queried book.
  is_other_book = indices != book_position
  if is_other_book.all():
    # The queried book is not among the neighbours (enough other books tie at
    # distance 0); drop the farthest one to keep the result length unchanged.
    is_other_book[-1] = False
  distances = distances[is_other_book]
  indices = indices[is_other_book]

  neighbour_titles = df['title'].iloc[indices]
  recommended_books = [book, [[title, 1 - distance] for title, distance in zip(neighbour_titles, distances)]]
  return recommended_books

In [85]:
# Show the raw recommendation structure for a sample title
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """Check get_recommends() for one known title and print the verdict.

  Passes when the first element of the result is the queried title and the
  titles of the first two recommendations are both in the expected list.
  """
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  title_echoed = recommends[0] == "Where the Heart Is (Oprah's Book Club (Paperback))"
  recommended_books = ['The Last Time They Met : A Novel', 'Angus, Thongs and Full-Frontal Snogging: Confessions of Georgia Nicolson', "Suzanne's Diary for Nicholas", 'The Virgin Suicides']
  # A list (not a generator) so both of the first two entries are always read
  top_two_expected = [recommends[1][rank][0] in recommended_books for rank in range(2)]
  if title_echoed and all(top_two_expected):
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [['The Last Time They Met : A Novel', 0.9981739431825407], ['Angus, Thongs and Full-Frontal Snogging: Confessions of Georgia Nicolson', 0.9944388269650103], ["Suzanne's Diary for Nicholas", 0.9938052756065936], ['The Virgin Suicides', 0.9916582404475148], ['The Notebook', 0.9840418512908977]]]
You passed the challenge! 🎉🎉🎉🎉🎉
