In [3]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [4]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2022-10-01 08:24:29--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2022-10-01 08:24:29 (141 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [5]:
# Load the two semicolon-separated, ISO-8859-1 encoded CSV files.
# header=0 lets the column names given here replace each file's own header
# row, and usecols keeps only the three named columns of each file.
df_books = pd.read_csv(
    books_filename,
    sep=';',
    encoding='ISO-8859-1',
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype=dict(isbn='str', title='str', author='str'),
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=';',
    encoding='ISO-8859-1',
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype=dict(user='int32', isbn='str', rating='float32'),
)

In [6]:
# Size of the books table, followed by a peek at its first rows
print(df_books.shape)
df_books.head()

(271379, 3)


Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [7]:
# Count the missing values in each column of the books table
df_books.isna().sum()

isbn      0
title     0
author    1
dtype: int64

In [8]:
# Drop every book row that has a missing field, then confirm no gaps remain
df_books = df_books.dropna()
df_books.isna().sum()

isbn      0
title     0
author    0
dtype: int64

In [9]:
# Size of the ratings table, followed by a peek at its first rows
print(df_ratings.shape)
df_ratings.head()

(1149780, 3)


Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [10]:
# Count the missing values in each column of the ratings table
df_ratings.isna().sum()

user      0
isbn      0
rating    0
dtype: int64

In [11]:
# Number of ratings submitted by each user, least active users first
vcr = df_ratings.user.value_counts().sort_values()
vcr

256099        1
46152         1
46155         1
46156         1
46157         1
          ...  
35859      5850
98391      5891
153662     6109
198711     7550
11676     13602
Name: user, Length: 105283, dtype: int64

In [89]:
# How many users have submitted fewer than 200 ratings (count only; nothing is removed here)
int((vcr < 200).sum())

104378

In [123]:
# Users with fewer than 200 ratings, and every rating row those users contributed
user_b200 = vcr.loc[vcr < 200]
df_ratings_b200 = df_ratings.loc[df_ratings['user'].isin(user_b200.index)]
df_ratings_b200

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0
...,...,...,...
1149775,276704,1563526298,9.0
1149776,276706,0679447156,0.0
1149777,276709,0515107662,10.0
1149778,276721,0590442449,10.0


In [124]:
# Keep only the ratings whose user does not occur in the below-200 rows
df_ratings_new = df_ratings.loc[~df_ratings['user'].isin(df_ratings_b200['user'])]
df_ratings_new.shape

(527556, 3)

In [125]:
# Number of ratings received by each ISBN (counted over the unfiltered ratings), rarest first
vcb = df_ratings.isbn.value_counts().sort_values()
vcb

0738702862       1
0307039331       1
8838463026       1
0064432521       1
1568381190       1
              ... 
0312195516     723
0060928336     732
0385504209     883
0316666343    1295
0971880107    2502
Name: isbn, Length: 340556, dtype: int64

In [126]:
# How many ISBNs have received fewer than 100 ratings (count only; nothing is removed here)
int((vcb < 100).sum())

339825

In [130]:
# ISBNs rated fewer than 100 times, and the matching rows of the books table.
# Note: despite its name, df_ratings_b100 holds rows of df_books, not ratings.
isbn_b100 = vcb.loc[vcb < 100]
df_ratings_b100 = df_books.loc[df_books.isbn.isin(isbn_b100.index)]
df_ratings_b100

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger
271375,0525447644,From One to One Hundred,Teri Sloat
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271377,0192126040,Republic (World's Classics),Plato


In [131]:
# Drop the ratings whose ISBN belongs to one of the below-100 book rows
df_ratings_new = df_ratings_new.loc[~df_ratings_new['isbn'].isin(df_ratings_b100['isbn'])]
df_ratings_new.shape

(88317, 3)

Dataset for KNN

In [132]:
# Build one table of (user, rating, title): join the filtered ratings to the
# books on ISBN, then discard the columns that are not needed afterwards.
df_combine = (
    df_ratings_new
    .merge(df_books, on='isbn')
    .drop(columns=['isbn', 'author'])
)
df_combine.head()

Unnamed: 0,user,rating,title
0,277427,10.0,Politically Correct Bedtime Stories: Modern Ta...
1,3363,0.0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,6.0,Politically Correct Bedtime Stories: Modern Ta...
3,12538,10.0,Politically Correct Bedtime Stories: Modern Ta...
4,13552,0.0,Politically Correct Bedtime Stories: Modern Ta...


In [133]:
# Title x user rating matrix. Repeated (title, user) pairs are combined by
# pivot_table's default aggregation (mean); missing combinations become 0.
df_pivot = (
    df_combine
    .pivot_table(values='rating', index='title', columns='user')
    .fillna(0)
)
df_pivot.sort_index().head()

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
# Fit a nearest-neighbour model on the title rows, using cosine distance
knn_model = NearestNeighbors(metric='cosine').fit(df_pivot.to_numpy())
knn_model

NearestNeighbors(metric='cosine')

In [140]:
# function to return recommended books - this will be tested
def get_recommends(book = "", n_recommendations = 5):
  """Return the titles closest to `book` according to `knn_model`.

  Parameters
  ----------
  book : str
      A title present in the index of `df_pivot`.
  n_recommendations : int, default 5
      Maximum number of recommendations to return.

  Returns
  -------
  list or None
      `[book, rows]` where `rows` is an object array of `[title, distance]`
      pairs ordered from the largest distance to the smallest. `None` is
      returned (after printing a message) when `book` is not in `df_pivot`.
  """
  try:
    book_vector = df_pivot.loc[book]
  except KeyError as e:
    print('The given book', e, 'does not exist')
    return

  # Request one extra neighbour because the queried title is normally returned
  # as its own nearest neighbour, but never more rows than df_pivot contains.
  n_neighbors = min(n_recommendations + 1, len(df_pivot))
  dis, ind = knn_model.kneighbors([book_vector.values], n_neighbors=n_neighbors)

  neighbours = pd.DataFrame({
      'title'   : df_pivot.iloc[ind[0]].index.values,
      'distance': dis[0]
    })

  # Exclude the queried title, keep the closest n_recommendations, and only then
  # order them furthest-first. Selecting before the descending sort guarantees
  # that the nearest title is never the one dropped when the queried title is
  # missing from the neighbour list.
  recommended_books = (
      neighbours[neighbours['title'] != book]
      .sort_values(by='distance')
      .head(n_recommendations)
      .sort_values(by='distance', ascending=False)
  )

  return [book, recommended_books.values]

In [136]:
# Example query: look up one title from df_pivot and show its recommendations
get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")

['The Queen of the Damned (Vampire Chronicles (Paperback))',
 array([['The Vampire Lestat (Vampire Chronicles, Book II)',
         0.5178411602973938],
        ['The Tale of the Body Thief (Vampire Chronicles (Paperback))',
         0.5376338362693787],
        ['Interview with the Vampire', 0.7345068454742432],
        ['The Witching Hour (Lives of the Mayfair Witches)',
         0.7448656558990479],
        ['Catch 22', 0.793983519077301]], dtype=object)]

In [141]:
# Show the recommendations for the same title that test_book_recommendation checks
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2): 
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
      print('test_pass3', test_pass)
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
      print('test_pass4',test_pass)
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the challenge check; it prints whether the recommendations pass
test_book_recommendation() 

["Where the Heart Is (Oprah's Book Club (Paperback))", array([["I'll Be Seeing You", 0.8016210794448853],
       ['The Weight of Water', 0.7708583474159241],
       ['The Surgeon', 0.7699410915374756],
       ['I Know This Much Is True', 0.7677075266838074],
       ['The Lovely Bones: A Novel', 0.7234864234924316]], dtype=object)]
You passed the challenge! 🎉🎉🎉🎉🎉
