In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
#!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

#!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [15]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [4]:
df_books

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger
271375,0525447644,From One to One Hundred,Teri Sloat
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271377,0192126040,Republic (World's Classics),Plato


In [5]:
df_ratings['rating'].value_counts()

0.0     716109
8.0     103736
10.0     78610
7.0      76457
9.0      67541
5.0      50974
6.0      36924
4.0       8904
3.0       5996
2.0       2759
1.0       1770
Name: rating, dtype: int64

In [16]:
df_ratings = df_ratings[df_ratings['rating']!=0]
df_ratings.reset_index(inplace=True, drop=True)
df_ratings['rating'].unique()

array([ 5.,  3.,  6.,  8.,  7., 10.,  9.,  4.,  1.,  2.], dtype=float32)

In [5]:
df_ratings = df_ratings[df_ratings['isbn'].isin(df_books['isbn'])]
df_ratings.reset_index(inplace=True, drop=True)
df_ratings

Unnamed: 0,user,isbn,rating
0,276726,0155061224,5.0
1,276729,052165615X,3.0
2,276729,0521795028,6.0
3,276744,038550120X,7.0
4,276747,0060517794,9.0
...,...,...,...
383847,276704,0743211383,7.0
383848,276704,0806917695,5.0
383849,276704,1563526298,9.0
383850,276709,0515107662,10.0


In [68]:
# add your code here - consider creating a new cell for each section of code
df_books[df_books['title']=="I'll Be Seeing You"]

Unnamed: 0,isbn,title,author
45,671888587,I'll Be Seeing You,Mary Higgins Clark
74047,553567187,I'll Be Seeing You,LURLENE MCDANIEL


In [67]:
df_ratings[df_ratings['isbn']=='0671888587']

Unnamed: 0,user,isbn,rating
2158,277478,0671888587,0.0
3644,278137,0671888587,0.0
9617,39,0671888587,7.0
10729,487,0671888587,6.0
33292,7904,0671888587,0.0
...,...,...,...
1117151,268032,0671888587,8.0
1129862,271195,0671888587,0.0
1131229,271284,0671888587,0.0
1137934,273979,0671888587,0.0


In [7]:
cont_user = df_ratings.user.value_counts()
cont_user

11676     6943
98391     5691
189835    1899
153662    1845
23902     1180
          ... 
114767       1
114771       1
114772       1
114777       1
276721       1
Name: user, Length: 68092, dtype: int64

In [8]:
cont_books = df_ratings.isbn.value_counts()
cont_books

0316666343    707
0971880107    581
0385504209    487
0312195516    383
0060928336    320
             ... 
8420431249      1
840804978X      1
8408020854      1
355175537X      1
0806917695      1
Name: isbn, Length: 149842, dtype: int64

In [9]:
#Obtener los valores que tienen al menos 200 y 100 instancias
user200 = cont_user[cont_user >= 200].index
bokks100 = cont_books[cont_books >= 100].index
user= df_ratings[df_ratings['user'].isin(user200)].copy()
bokks = df_ratings[df_ratings['isbn'].isin(bokks100)].copy()
df_ratings = df_ratings[df_ratings['user'].isin(user200) & df_ratings['isbn'].isin(bokks100)].copy()
df = df_ratings['rating']
df

7005      10.0
8619      10.0
8622       7.0
8630      10.0
8654       8.0
          ... 
376919    10.0
376920     8.0
376921     8.0
376924    10.0
380687    10.0
Name: rating, Length: 1178, dtype: float32

In [13]:
# df_rat =df_ratings['user'].isin(user200)
# df_ratings[df_rat]

In [14]:
#min(bokks.isbn.value_counts())

In [15]:
#min(df.user.value_counts())

In [16]:
#min(df.isbn.value_counts())

In [52]:
df_books[df_books['title']=="Where the Heart Is (Oprah's Book Club (Paperback))"]

Unnamed: 0,isbn,title,author
706,446672211,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts


In [53]:
df_ratings[df_ratings['isbn'] == '0140120831']

Unnamed: 0,user,isbn,rating
12099,7105,140120831,10.0
18651,11676,140120831,6.0
126649,76626,140120831,8.0
151069,92853,140120831,9.0
227920,138546,140120831,9.0
320353,200351,140120831,7.0
363535,229978,140120831,8.0


In [54]:
df_books[df_books['isbn'] == '0140120831']

Unnamed: 0,isbn,title,author


In [20]:
#df[df['isbn']=='0345351525']

In [21]:
#samples = df['rating'].to_list()
#samples.reshape(-1, 1)
#lista = np.array(samples).reshape(1, -1)
#len(samples)
#lista

In [22]:
# neigh = NearestNeighbors(n_neighbors=6, radius=0.4)
# neigh.fit(lista)

In [23]:
#print(neigh.kneighbors([[1., 1., 1.]]))

In [24]:
# isbn = df_books['isbn'][df_books['title']=="Where the Heart Is (Oprah's Book Club (Paperback))"].values
# isbn[0]

In [25]:
# book_rating = df[df['isbn']==isbn[0]]
# book_rating = book_rating['rating'].to_list()
# print(book_rating)
# lista = np.array(book_rating).reshape(-1, 1)
# #print(lista)
# neigh = NearestNeighbors(n_neighbors=5, radius=0.4)
# neigh.fit(lista)
# lista

In [26]:
#rng = neigh.radius_neighbors([[8.]])
#rng

In [27]:
#distances, neighbor_indices = neigh.kneighbors(lista)

In [28]:
# Obtener los libros recomendados basados en los vecinos más cercanos
# recommended_books = []
# listo = []
# for idx, distance in zip(neighbor_indices[0], distances[0]):
#         listo.append([df_books.loc[idx, 'title'], distance])

# recommended_books.append("Where the Heart Is (Oprah's Book Club (Paperback))")
# # for idx, distance in zip(neighbor_indices[0][1:], distances[0][1:]):
# #         listo.append((df_books.loc[idx, 'title'], distance))

# recommended_books = [recommended_books, listo]
# recommended_books


In [29]:
# book_rating = df[df['isbn']==isbn[0]]
# book_rating

In [88]:
df_ratings[df_ratings['rating']==0]

Unnamed: 0,user,isbn,rating


In [31]:
# def get_recommends(book = ""):
#     distances, indices = neigh.kneighbors(lista)

#     # Extract book titles and distances
#     recommended_books = [
#         df_books.loc[idx, 'title'] for idx in indices[0] if df_books.loc[idx, 'title'] != book
#     ]
#     distances = distances[0][1:]  # Exclude the distance to the book itself

#     # Combine titles and distances into a list of lists
#     result = [book, list(zip(recommended_books, distances))]

#     return result
#df_ratings[df_ratings['title']==np.nan]

In [17]:
# isbn = df_books['isbn'][df_books['title']=="Where the Heart Is (Oprah's Book Club (Paperback))"].values
# isbn[0]
# book_rating = df[df['isbn']==isbn[0]]
# book_rating = book_rating['rating']
# #book_rating
# lista = np.array(book_rating).reshape(-1, 1)
# lista
# books_rating = df_ratings['rating'].values
# books_rating = np.array(books_rating).reshape(-1, 1)
# len(books_rating)
correspondencias = dict(zip(df_books['isbn'], df_books['title']))
df_ratings['title'] = df_ratings['isbn'].map(correspondencias)
correspondencias = dict(zip(df_books['isbn'], df_books['author']))
df_ratings['author'] = df_ratings['isbn'].map(correspondencias)
#df_promedios = df_ratings.groupby(['title', 'author'])['rating'].mean().reset_index()
# df_ratings.columns = ['isbn', 'rating']
#df_ratings[df_ratings['rating'].duplicated()]
#df_ratings[df_ratings['isbn']=='Ô½crosoft']
#df_ratings['rating'].reset_index()


In [None]:
df_promedios = df_ratings.pivot_table(index='title', columns='user', values='rating').fillna(0)

  df_promedios = df_ratings.pivot_table(index='title', columns='user', values='rating').fillna(0)


In [None]:
df_promedios[df_promedios['title'].duplicated()]

In [72]:
df_promedios[df_promedios['title']=="Where the Heart Is (Oprah's Book Club (Paperback))"]

Unnamed: 0,title,author,rating
242139,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,4.105983


In [73]:
book_rating = df_promedios[df_promedios['title']=="Where the Heart Is (Oprah's Book Club (Paperback))"]
book_rating = book_rating['rating'].values
# lista = np.array(book_rating).reshape(-1, 1)
# lista
book_rating

array([4.105983], dtype=float32)

In [13]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  recommended_books = [book]
  #print(book)
  listo = []
  #isbn = df_books['isbn'][df_books['title']==book].values
  #book_rating = df_promedios[df_promedios['title']==book]
  #book_rating = book_rating['rating'].values
  #lista = np.array(book_rating).reshape(-1, 1)
  #suma_elementos = sum(lista)
  #promedio = suma_elementos / len(lista)
  #print(promedio[0])
  #todas las notas
  #books_rating = df_promedios['rating'].values
  #books_rating = np.array(books_rating).reshape(-1, 1)

  print('tamaño de los datos, : ', len(df_promedios))
  neigh = NearestNeighbors(n_neighbors=5, radius=1)
  neigh.fit(df_promedios)
  #A = neigh.kneighbors_graph([promedio])
  #A.toarray()
  #plt.plot(A)
  #print(A)
  distances, neighbor_indices=neigh.kneighbors(df_promedios.loc[[book]], 6)
  #distances, neighbor_indices = neigh.kneighbors(lista, 5)
  print("Recommended Books:")
  print("==================")
  for index, value in enumerate(df_promedios.index[neighbor_indices][0][1:]):
    print((index+1),". ",value)
    listo.append([value, distances[0][index+1]])
  del listo[-1]
  print(neighbor_indices)
  print(distances)
  # for value, distance in zip(df_promedios.index[neighbor_indices][0][1:], distances[0][1:]):
  #        listo.append([value, distance])
  recommended_books = [recommended_books, listo]
  #neigh.kneighbors([[1., 1., 1.]])
  return recommended_books

In [14]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

tamaño de los datos, :  134
Recommended Books:
1 .  Along Came a Spider (Alex Cross Novels)
2 .  The Bridges of Madison County
3 .  A Map of the World
4 .  She's Come Undone (Oprah's Book Club)
5 .  Timeline
[[128  13  85   6  70 122]]
[[ 0.         12.56980509 17.94435844 18.89444363 19.54482029 19.74841766]]
[["Where the Heart Is (Oprah's Book Club (Paperback))"], [['Along Came a Spider (Alex Cross Novels)', 12.569805089976535], ['The Bridges of Madison County', 17.944358444926362], ['A Map of the World', 18.894443627691185], ["She's Come Undone (Oprah's Book Club)", 19.544820285692065]]]
tamaño de los datos, :  134
Recommended Books:
1 .  Along Came a Spider (Alex Cross Novels)
2 .  The Bridges of Madison County
3 .  A Map of the World
4 .  She's Come Undone (Oprah's Book Club)
5 .  Timeline
[[128  13  85   6  70 122]]
[[ 0.         12.56980509 17.94435844 18.89444363 19.54482029 19.74841766]]
You haven't passed yet. Keep trying!


  for index, value in enumerate(df_promedios.index[neighbor_indices][0][1:]):
  for index, value in enumerate(df_promedios.index[neighbor_indices][0][1:]):


In [38]:
df_promedios[df_promedios['title']=='I Know This Much Is True']

Unnamed: 0,title,author,rating
51238,I Know This Much Is True,Wally Lamb,8.264151


In [39]:
df_books[df_books['isbn']=='0060987561']

Unnamed: 0,isbn,title,author
3022,60987561,I Know This Much Is True,Wally Lamb


In [40]:
df_promedios['rating'][df_promedios['title']=="Where the Heart Is (Oprah's Book Club (Paperback))"].values

array([8.142373], dtype=float32)

In [41]:
df_ratings['rating'].unique()

array([ 5.,  3.,  6.,  7.,  9.,  8., 10.,  1.,  4.,  2.], dtype=float32)

In [97]:
df_promedios[df_promedios['title']=="I'll Be Seeing You"]

Unnamed: 0,title,author,rating
51543,I'll Be Seeing You,LURLENE MCDANIEL,9.0
51544,I'll Be Seeing You,Mary Higgins Clark,7.555555


In [92]:
df_promedios[df_promedios['title']=="The Weight of Water"]

Unnamed: 0,title,author,rating
123800,The Weight of Water,Anita Shreve,7.695652


In [93]:
df_promedios[df_promedios['title']=='The Surgeon']

Unnamed: 0,title,author,rating
121481,The Surgeon,Francis Roe,4.0
121482,The Surgeon,TESS GERRITSEN,7.852459


In [95]:
df_promedios[df_promedios['title']=='I Know This Much Is True']

Unnamed: 0,title,author,rating
51238,I Know This Much Is True,Wally Lamb,8.264151


In [98]:
df_ratings[df_ratings['title']=="I'll Be Seeing You"]

Unnamed: 0,user,isbn,rating,title,author
2423,39,671888587,7.0,I'll Be Seeing You,Mary Higgins Clark
2890,487,671888587,6.0,I'll Be Seeing You,Mary Higgins Clark
17886,11601,671888587,9.0,I'll Be Seeing You,Mary Higgins Clark
22690,11676,671888587,8.0,I'll Be Seeing You,Mary Higgins Clark
28505,12982,671888587,8.0,I'll Be Seeing You,Mary Higgins Clark
28915,13273,671888587,8.0,I'll Be Seeing You,Mary Higgins Clark
41146,22045,671888587,7.0,I'll Be Seeing You,Mary Higgins Clark
42124,22912,671888587,9.0,I'll Be Seeing You,Mary Higgins Clark
48311,25981,671888587,5.0,I'll Be Seeing You,Mary Higgins Clark
53791,29259,671888587,9.0,I'll Be Seeing You,Mary Higgins Clark


In [99]:
df_ratings[df_ratings['isbn']=="0553567187"]

Unnamed: 0,user,isbn,rating,title,author
68890,37597,553567187,10.0,I'll Be Seeing You,LURLENE MCDANIEL
86080,50143,553567187,7.0,I'll Be Seeing You,LURLENE MCDANIEL
218453,132522,553567187,10.0,I'll Be Seeing You,LURLENE MCDANIEL


In [46]:
df_books[df_books['isbn']=='0140001409']

Unnamed: 0,isbn,title,author
9007,140001409,Cold Comfort Farm,Stella Gibbons


In [47]:
df_books[df_books['isbn']=='0140120831']

Unnamed: 0,isbn,title,author
