In [360]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [361]:
# get data files
#!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

#!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [362]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})
df_books

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger
271375,0525447644,From One to One Hundred,Teri Sloat
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271377,0192126040,Republic (World's Classics),Plato


In [363]:
# there are some books with more than one isbn.
# so we will use this 'title' column to identify the books
df_ratings['title'] = df_ratings['isbn'].map(df_books.set_index('isbn')['title'])
df_ratings

Unnamed: 0,user,isbn,rating,title
0,276725,034545104X,0.0,Flesh Tones: A Novel
1,276726,0155061224,5.0,Rites of Passage
2,276727,0446520802,0.0,The Notebook
3,276729,052165615X,3.0,Help!: Level 1
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...
...,...,...,...,...
1149775,276704,1563526298,9.0,Get Clark Smart : The Ultimate Guide for the S...
1149776,276706,0679447156,0.0,Eight Weeks to Optimum Health: A Proven Progra...
1149777,276709,0515107662,10.0,The Sherbrooke Bride (Bride Trilogy (Paperback))
1149778,276721,0590442449,10.0,Fourth Grade Rats


In [364]:
user_count = df_ratings['user'].value_counts()
title_count = df_ratings['title'].value_counts()

In [365]:
df_ratings['user_cnt'] = df_ratings['user'].map(user_count)
df_ratings['title_cnt'] = df_ratings['title'].map(title_count)
df_ratings

Unnamed: 0,user,isbn,rating,title,user_cnt,title_cnt
0,276725,034545104X,0.0,Flesh Tones: A Novel,1,60.0
1,276726,0155061224,5.0,Rites of Passage,1,14.0
2,276727,0446520802,0.0,The Notebook,1,650.0
3,276729,052165615X,3.0,Help!: Level 1,2,1.0
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,2,1.0
...,...,...,...,...,...,...
1149775,276704,1563526298,9.0,Get Clark Smart : The Ultimate Guide for the S...,17,3.0
1149776,276706,0679447156,0.0,Eight Weeks to Optimum Health: A Proven Progra...,1,40.0
1149777,276709,0515107662,10.0,The Sherbrooke Bride (Bride Trilogy (Paperback)),1,44.0
1149778,276721,0590442449,10.0,Fourth Grade Rats,1,15.0


In [366]:
# there are some books with more than one review by the same user
# so we sort by rating and drop duplicates
# this way we will keep only the higest rating by the user for the book
df = (df_ratings
      .loc[df_ratings['user_cnt'] > 200]
      .loc[df_ratings['title_cnt'] > 100]
      .drop(columns=['user_cnt', 'title_cnt'])
      .sort_values(by=['rating'], ascending=False)
      )
df.drop_duplicates(subset=['user', 'title'], inplace=True)
df

Unnamed: 0,user,isbn,rating,title
1146737,275970,0385722206,10.0,Balzac and the Little Chinese Seamstress : A N...
1456,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...
1146363,275970,0064400557,10.0,Charlotte's Web (Trophy Newbery)
125021,28634,0440241537,10.0,The King of Torts
125024,28634,044651652X,10.0,The Bridges of Madison County
...,...,...,...,...
560095,135045,0684848783,0.0,Tis : A Memoir
560094,135045,068484477X,0.0,STONES FROM THE RIVER
560087,135045,0679731725,0.0,The Remains of the Day (Vintage International)
560057,135045,0671042858,0.0,The Girl Who Loved Tom Gordon


In [391]:
pivot = df.pivot(index='title', columns='user', values='rating').fillna(0)
pivot

user,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [403]:
nbrs = NearestNeighbors(n_neighbors=6, metric='cosine',algorithm='brute').fit(pivot)
distances, indices = nbrs.kneighbors(pivot)
print(indices, '\n\n', distances)

[[  0  65 685 107 621 413]
 [  1  54 245 859 464 800]
 [  2 785 717 690 509 507]
 ...
 [901 477  11 318 688  88]
 [902 516 347 602 582 245]
 [903 481 310 277 207 115]] 

 [[0.         0.69383436 0.72125417 0.72143173 0.7496566  0.755864  ]
 [0.         0.67357194 0.7068114  0.7078737  0.7136514  0.7238433 ]
 [0.         0.6948402  0.7173296  0.7303128  0.73206174 0.73469394]
 ...
 [0.         0.64546585 0.7231893  0.7329485  0.73577476 0.745635  ]
 [0.         0.45760822 0.49539918 0.5255173  0.5484942  0.5655378 ]
 [0.         0.58654845 0.5870569  0.5936792  0.59496003 0.59861374]]


In [404]:
nbrs.kneighbors_graph(pivot).toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [415]:
# "The Queen of the Damned (Vampire Chronicles (Paperback))" isbn: 0345351525 / 0833563505
title_to_find = "The Queen of the Damned (Vampire Chronicles (Paperback))"
idx = list(pivot.index).index(title_to_find)
j = 0
for i in indices[idx]:
    print(i, distances[i][j], pivot.index[i])
    j += 1

759 0.0 The Queen of the Damned (Vampire Chronicles (Paperback))
818 0.50016034 The Vampire Lestat (Vampire Chronicles, Book II)
805 0.5298544 The Tale of the Body Thief (Vampire Chronicles (Paperback))
578 0.69574356 Taltos: Lives of the Mayfair Witches
827 0.69247717 The Witching Hour (Lives of the Mayfair Witches)
324 0.76642215 Interview with the Vampire


In [416]:
# "Where the Heart Is (Oprah's Book Club (Paperback))" isbn: 0345351525 / 0833563505
title_to_find = "Where the Heart Is (Oprah's Book Club (Paperback))"
idx = list(pivot.index).index(title_to_find)
j = 0
for i in indices[idx]:
    print(i, distances[i][j], pivot.index[i])
    j += 1

878 0.0 Where the Heart Is (Oprah's Book Club (Paperback))
103 0.661445 Blue Diary
725 0.75736374 The Lovely Bones: A Novel
311 0.7366455 I Know This Much Is True
824 0.70512605 The Weight of Water
316 0.74290776 Icy Sparks


# The recommendations are not totaly equal to the challange...
# Let's try to **NOT** remove the duplicated isbn's...

In [None]:
# add your code here - consider creating a new cell for each section of code

In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):


  return recommended_books

In [None]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()