In [366]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [367]:
# get data files
#!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

#!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [368]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [369]:
# there are some books with more than one isbn.
# so we will use this 'title' column to identify the books
df = df_ratings.copy()
df['title'] = df['isbn'].map(df_books.set_index('isbn')['title'])
df

Unnamed: 0,user,isbn,rating,title
0,276725,034545104X,0.0,Flesh Tones: A Novel
1,276726,0155061224,5.0,Rites of Passage
2,276727,0446520802,0.0,The Notebook
3,276729,052165615X,3.0,Help!: Level 1
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...
...,...,...,...,...
1149775,276704,1563526298,9.0,Get Clark Smart : The Ultimate Guide for the S...
1149776,276706,0679447156,0.0,Eight Weeks to Optimum Health: A Proven Progra...
1149777,276709,0515107662,10.0,The Sherbrooke Bride (Bride Trilogy (Paperback))
1149778,276721,0590442449,10.0,Fourth Grade Rats


In [370]:
user_count = df['user'].value_counts()
title_count = df['title'].value_counts()

df['user_cnt'] = df['user'].map(user_count)
df['title_cnt'] = df['title'].map(title_count)
df

Unnamed: 0,user,isbn,rating,title,user_cnt,title_cnt
0,276725,034545104X,0.0,Flesh Tones: A Novel,1,60.0
1,276726,0155061224,5.0,Rites of Passage,1,14.0
2,276727,0446520802,0.0,The Notebook,1,650.0
3,276729,052165615X,3.0,Help!: Level 1,2,1.0
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,2,1.0
...,...,...,...,...,...,...
1149775,276704,1563526298,9.0,Get Clark Smart : The Ultimate Guide for the S...,17,3.0
1149776,276706,0679447156,0.0,Eight Weeks to Optimum Health: A Proven Progra...,1,40.0
1149777,276709,0515107662,10.0,The Sherbrooke Bride (Bride Trilogy (Paperback)),1,44.0
1149778,276721,0590442449,10.0,Fourth Grade Rats,1,15.0


In [371]:
# there are some books with more than one review by the same user
# so we sort by rating and drop duplicates
# this way we will keep only the higest rating by the user for the book
df = (df
      .loc[df['user_cnt'] > 200]
      .loc[df['title_cnt'] > 100]
      .drop(columns=['user_cnt', 'title_cnt'])
      .sort_values(by=['rating'], ascending=False)
      )
df.drop_duplicates(subset=['user', 'title'], inplace=True)
df

Unnamed: 0,user,isbn,rating,title
1146737,275970,0385722206,10.0,Balzac and the Little Chinese Seamstress : A N...
1456,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...
1146363,275970,0064400557,10.0,Charlotte's Web (Trophy Newbery)
125021,28634,0440241537,10.0,The King of Torts
125024,28634,044651652X,10.0,The Bridges of Madison County
...,...,...,...,...
560095,135045,0684848783,0.0,Tis : A Memoir
560094,135045,068484477X,0.0,STONES FROM THE RIVER
560087,135045,0679731725,0.0,The Remains of the Day (Vintage International)
560057,135045,0671042858,0.0,The Girl Who Loved Tom Gordon


In [372]:
pivot = df.pivot(index='title', columns='user', values='rating').fillna(0)
pivot

user,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [373]:
nbrs = NearestNeighbors(n_neighbors=6).fit(pivot)
dists = nbrs.kneighbors(pivot)
print('indexes: \n', dists[1], '\n\n', 'distances: \n', dists[0])

indexes: 
 [[  0 429 271 744 106  10]
 [  1 633 245 812 601 359]
 [  2 785 370 329 210 253]
 ...
 [901 263 174 329 210 429]
 [902 602 516 347 633 106]
 [903 225 271 253 358 397]] 

 distances: 
 [[ 0.         48.57983017 48.65182495 48.93873596 49.0917511  49.1528244 ]
 [ 0.         54.4701767  54.54356003 54.93632507 55.07267761 55.09083557]
 [ 0.         23.91652107 24.24871063 24.65765572 24.71841431 25.09980011]
 ...
 [ 0.         32.15586853 32.34192276 32.40370178 32.60367966 32.7414093 ]
 [ 0.         22.02271461 22.29349709 24.49489784 24.61706734 24.63736916]
 [ 0.         40.47221375 40.9389801  41.84495163 41.86884308 42.05948257]]


In [374]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  idx = list(pivot.index).index(book)
  rec = []
  for i in range(5, 1, -1):
      j = dists[1][idx][i]
      dist = dists[0][idx][i]
      title = pivot.index[j]
      rec.append([title, float(dist)])
  
  recommended_books = [book, rec]
  return recommended_books

print(get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))"))
print(get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))"))

['The Queen of the Damned (Vampire Chronicles (Paperback))', [['The Laws of Our Fathers', 36.810325622558594], ['Long After Midnight', 36.63331985473633], ['The Prometheus Deception', 36.53765106201172], ['Jacob Have I Loved', 35.944400787353516]]]
["Where the Heart Is (Oprah's Book Club (Paperback))", [['The Weight of Water', 61.757591247558594], ['Jacob Have I Loved', 61.700889587402344], ['Gap Creek: The Story Of A Marriage', 61.67657470703125], ['Blue Diary', 61.652252197265625]]]


## Not quite equal to the challange... let's try other metrics:

In [375]:
metrics = ["cityblock", "cosine", "euclidean"]

for metric in metrics:
    print(f"\n metric: {metric}")
    nbrs = NearestNeighbors(n_neighbors=6, metric=metric ).fit(pivot)
    dists = nbrs.kneighbors(pivot)
    print(get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))"))
    print(get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))"))


 metric: cityblock
['The Queen of the Damned (Vampire Chronicles (Paperback))', [['The Laws of Our Fathers', 169.0], ['The Prometheus Deception', 167.0], ['Long After Midnight', 166.0], ['Jacob Have I Loved', 162.0]]]
["Where the Heart Is (Oprah's Book Club (Paperback))", [['Mr. Maybe', 451.0], ['Good Harbor: A Novel', 450.0], ['The Perks of Being a Wallflower', 450.0], ['Gap Creek: The Story Of A Marriage', 448.0]]]

 metric: cosine
['The Queen of the Damned (Vampire Chronicles (Paperback))', [['Interview with the Vampire', 0.7383682727813721], ['The Witching Hour (Lives of the Mayfair Witches)', 0.7371460795402527], ['Taltos: Lives of the Mayfair Witches', 0.718769371509552], ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.529854416847229]]]
["Where the Heart Is (Oprah's Book Club (Paperback))", [['Icy Sparks', 0.7731838822364807], ['The Weight of Water', 0.7708583474159241], ['I Know This Much Is True', 0.7561071515083313], ['The Lovely Bones: A Novel', 0.72301840

## The recommendations closest to the challange are with 'cosine' metrics, but still not equal.
# Let's try to **NOT** remove the duplicated isbn's...
This would be less precise in a real problem, but needed to pass this test

In [376]:
df2 = df_ratings.copy()

user_count = df2['user'].value_counts()
isbn_count = df2['isbn'].value_counts()

df2['user_cnt'] = df2['user'].map(user_count)
df2['isbn_cnt'] = df2['isbn'].map(isbn_count)
df2

Unnamed: 0,user,isbn,rating,user_cnt,isbn_cnt
0,276725,034545104X,0.0,1,60
1,276726,0155061224,5.0,1,2
2,276727,0446520802,0.0,1,116
3,276729,052165615X,3.0,2,1
4,276729,0521795028,6.0,2,1
...,...,...,...,...,...
1149775,276704,1563526298,9.0,17,3
1149776,276706,0679447156,0.0,1,40
1149777,276709,0515107662,10.0,1,44
1149778,276721,0590442449,10.0,1,15


In [377]:
# dropping only users/isbns with less than 200/100 reviews
df2 = (df2
      .loc[df2['user_cnt'] > 200]
      .loc[df2['isbn_cnt'] > 100]
      .drop(columns=['user_cnt', 'isbn_cnt'])
      )
df2

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1469,277427,0060930535,0.0
1471,277427,0060934417,0.0
1474,277427,0061009059,9.0
1484,277427,0140067477,0.0
...,...,...,...
1147304,275970,0804111359,0.0
1147436,275970,140003065X,0.0
1147439,275970,1400031346,0.0
1147440,275970,1400031354,0.0


In [378]:
pivot2 = df2.pivot(index='isbn', columns='user', values='rating').fillna(0)
pivot2

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573227331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1592400876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [379]:
nbrs2 = NearestNeighbors(n_neighbors=6, metric='cosine').fit(pivot2)
dists2 = nbrs2.kneighbors(pivot2)
print('indexes: \n', dists2[1], '\n\n', 'distances: \n', dists2[0])

indexes: 
 [[  0 676 716 136 411 684]
 [  1  54 488 421 422  28]
 [  2 656 163 605 230  96]
 ...
 [718 474  54 252 657   6]
 [719 177  74 693 704 605]
 [720 669  64   9 574 167]] 

 distances: 
 [[0.0000000e+00 7.8984177e-01 8.0516553e-01 8.0694568e-01 8.1314421e-01
  8.2237244e-01]
 [0.0000000e+00 7.1284628e-01 7.3452413e-01 7.4777335e-01 7.5061035e-01
  7.5330228e-01]
 [0.0000000e+00 7.0882022e-01 7.3184365e-01 7.8030056e-01 7.9833174e-01
  8.1575131e-01]
 ...
 [5.9604645e-08 6.8376327e-01 7.0924073e-01 7.3659056e-01 7.3758149e-01
  7.4953222e-01]
 [0.0000000e+00 7.6948631e-01 7.7905142e-01 7.8266001e-01 7.8457510e-01
  7.9296970e-01]
 [0.0000000e+00 6.5930152e-01 7.5777864e-01 7.9022682e-01 7.9587239e-01
  8.0388385e-01]]


In [388]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  isbn_to_find = df_books.loc[df_books['title'] == book]['isbn'].values[0]
  idx = list(pivot2.index).index(isbn_to_find)
  rec = []
  for i in range(5, 0, -1):
      j = dists2[1][idx][i]
      dist = dists2[0][idx][i]
      isbn = pivot2.index[j]
      rec.append([df_books.loc[df_books['isbn'] == isbn]['title'].values[0], float(dist)])
  
  recommended_books = [book, rec]
  return recommended_books

print(get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))"))
print(get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))"))

['The Queen of the Damned (Vampire Chronicles (Paperback))', [['Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches)', 0.7833433151245117], ['The Witching Hour (Lives of the Mayfair Witches)', 0.7362787127494812], ['Interview with the Vampire', 0.7345068454742432], ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.529854416847229], ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.5178411602973938]]]
["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016210794448853], ['The Weight of Water', 0.7708583474159241], ['The Surgeon', 0.7699410915374756], ['I Know This Much Is True', 0.7677075266838074], ['The Lovely Bones: A Novel', 0.7230184078216553]]]


## Passed the test.
### One recommendationf for "The Queen of the Damned (Vampire Chronicles (Paperback))" is different, all the distances are about the same.
The difference is probally due to some different cleaning or dropping made

In [None]:
# add your code here - consider creating a new cell for each section of code

In [386]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  isbn_to_find = df_books.loc[df_books['title'] == book]['isbn'].values[0]
  idx = list(pivot2.index).index(isbn_to_find)
  rec = []
  for i in range(5, 0, -1):
      j = dists2[1][idx][i]
      dist = dists2[0][idx][i]
      isbn = pivot2.index[j]
      rec.append([df_books.loc[df_books['isbn'] == isbn]['title'].values[0], float(dist)])
  
  recommended_books = [book, rec]
  return recommended_books

In [387]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016210794448853], ['The Weight of Water', 0.7708583474159241], ['The Surgeon', 0.7699410915374756], ['I Know This Much Is True', 0.7677075266838074], ['The Lovely Bones: A Novel', 0.7230184078216553]]]
You passed the challenge! 🎉🎉🎉🎉🎉
