In [137]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [138]:
# get data files
# Download the Book-Crossings archive and unpack it into the working directory.
# A leading `!` hands the line to the system shell, so both commands require
# `wget` and `unzip` to be available on PATH.
# NOTE(review): the recorded output of this cell reports that neither command
# was found, so the CSV files read below were presumably obtained some other
# way — confirm.
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

# Paths of the two CSV files loaded by the next cell.
# NOTE(review): whether the archive really unpacks to ./archive/ is not visible
# from here — verify.
books_filename = './archive/Books.csv'
ratings_filename = './archive/Ratings.csv'

Der Befehl "wget" ist entweder falsch geschrieben oder
konnte nicht gefunden werden.
Der Befehl "unzip" ist entweder falsch geschrieben oder
konnte nicht gefunden werden.


In [219]:
# Load both CSV files into DataFrames. Each mapping below names the columns
# that are kept (in file order) together with the dtype they are parsed as;
# header=0 makes these names replace the header row found in the file.
books_dtypes = {'isbn': 'str', 'title': 'str', 'author': 'str'}
ratings_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=list(books_dtypes),
    usecols=list(books_dtypes),
    dtype=books_dtypes,
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=list(ratings_dtypes),
    usecols=list(ratings_dtypes),
    dtype=ratings_dtypes,
)

In [220]:
print(f'Columns Books: {list(df_books.columns)} Dataset Len: {df_books.shape[0]}')
print(f'Columns Ratings: {list(df_ratings.columns)} Dataset Len: {df_ratings.shape[0]}')

Columns Books: ['isbn', 'title', 'author'] Dataset Len: 271379
Columns Ratings: ['user', 'isbn', 'rating'] Dataset Len: 1149780


In [221]:
# Drop fully identical book rows, then report what is still left to clean up.
df_books.drop_duplicates(inplace=True)

# `duplicated` must be *called* to obtain the boolean row mask. Indexing with
# the bare bound method only worked because pandas invokes callables used as
# keys with the frame itself as argument (which lands in `subset`); that
# accident fails for a one-row frame and hides the intent.
print(f"Book duplicates: {df_books[df_books.duplicated()]}")
print(f"Ratings duplicates: {df_ratings[df_ratings.duplicated()]}")

# Total number of missing cells across all columns of each frame.
print(f"Books Null val: {df_books.isna().values.sum()}")
print(f"Ratings Null val: {df_ratings.isna().values.sum()}")

Book duplicates: Empty DataFrame
Columns: [isbn, title, author]
Index: []
Ratings duplicates: Empty DataFrame
Columns: [user, isbn, rating]
Index: []
Books Null val: 2
Ratings Null val: 0


In [222]:
# Remove the book rows that contain a missing value and report the row count
# on either side of the operation.
rows_before = len(df_books)
print(f"Size before removing: {rows_before}")

df_books.dropna(inplace=True)

rows_after = len(df_books)
print(f"Size after removing: {rows_after}")

Size before removing: 271378
Size after removing: 271376


In [229]:
# keep only the ratings of users who have submitted at least 200 ratings

ratings = df_ratings['user'].value_counts()
frequent_users = ratings[ratings >= 200].index
from_frequent_user = df_ratings['user'].isin(frequent_users)

df_ratings_rm = df_ratings[from_frequent_user]

df_ratings_rm.shape

(527556, 3)

In [230]:
# keep only the ratings of books that received at least 100 ratings
# (the per-book counts are taken over the full df_ratings, i.e. before the
# user filter was applied)

ratings = df_ratings['isbn'].value_counts()
popular_isbns = ratings[ratings >= 100].index
for_popular_book = df_ratings_rm['isbn'].isin(popular_isbns)

df_ratings_rm = df_ratings_rm[for_popular_book]

df_ratings_rm.shape

(49781, 3)

In [None]:
# Build the book-by-user rating matrix: one row per ISBN, one column per user,
# with 0 wherever a user has no rating for a book.
user_by_book = df_ratings_rm.pivot_table(values='rating', index=['user'], columns=['isbn'])
df_train = user_by_book.fillna(0).transpose()
df_train.head()

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Relabel the rows of df_train from ISBN to book title.
# Guarded by the index name so that re-running this cell is a no-op instead of
# looking titles up as if they were ISBNs (which would turn every label into NaN).
if df_train.index.name == 'isbn':
    # Keep one title per ISBN: a Series used as a lookup table needs a unique
    # index, and with a join a repeated ISBN would add rows and make the label
    # assignment fail with a length mismatch.
    isbn_to_title = df_books.drop_duplicates(subset='isbn').set_index('isbn')['title']
    # ISBNs that are absent from df_books get a NaN label, as with the left join.
    df_train.index = df_train.index.map(isbn_to_title).rename('title')

In [254]:
# Unfitted nearest-neighbour searcher that compares rows by cosine distance;
# every other setting is left at the scikit-learn default.
knn_model = NearestNeighbors(metric='cosine')

In [255]:
# Fit on the plain rating array; each row of df_train (one book) is one sample.
knn_model.fit(df_train.to_numpy())

In [None]:
# Example book used to try out the fitted model in the following cells.
title = 'The Queen of the Damned (Vampire Chronicles (Paperback))'

In [256]:
# Look up the rating row of the example book and ask for its 6 nearest rows
# (the list wrapper turns the single row into a one-sample batch).
query_ratings = df_train.loc[title].values
distances, indices = knn_model.kneighbors([query_ratings], n_neighbors=6)

In [258]:
# Display the cosine distances of the 6 neighbours, nearest first; the leading
# 0.0 in the recorded output is the queried book matched against itself.
distances

array([[0.        , 0.51784116, 0.5376338 , 0.73450685, 0.74486566,
        0.7939835 ]], dtype=float32)

In [None]:
# Tabulate the neighbours found for `title`, ordered from farthest to nearest.
(
    pd.DataFrame({
        'title': df_train.index[indices[0]].values,
        'distance': distances[0],
    })
    .sort_values('distance', ascending=False)
)

Unnamed: 0,title,distance
5,Catch 22,0.793984
4,The Witching Hour (Lives of the Mayfair Witches),0.744866
3,Interview with the Vampire,0.734507
2,The Tale of the Body Thief (Vampire Chronicles...,0.537634
1,"The Vampire Lestat (Vampire Chronicles, Book II)",0.517841
0,The Queen of the Damned (Vampire Chronicles (P...,0.0


In [312]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  """Return the books rated most similarly to `book`.

  Parameters
  ----------
  book : str
      Title to look up in the index of `df_train`. The default empty string
      skips the search entirely.

  Returns
  -------
  list
      `[]` for an empty title. Otherwise `[book, neighbours]`, where
      `neighbours` is a list of `(title, distance)` tuples for the 5 rows
      closest to `book` according to `knn_model`, ordered farthest first.

  Raises
  ------
  KeyError
      If `book` is not a label of `df_train`.
  """
  recommended_books = []

  if book != "":
    # Selecting with a list always yields a DataFrame, so a title that labels
    # several rows (one per ISBN) no longer produces a 3-D query that
    # kneighbors rejects; the first matching row is used as the query.
    matching_rows = df_train.loc[[book]]
    query = matching_rows.iloc[[0]].values

    distances, indices = knn_model.kneighbors(query, n_neighbors=6)
    titles = df_train.index[indices[0]].values

    # The nearest hit is the queried row itself (distance 0), so it is skipped;
    # the remaining five are listed from farthest to nearest.
    neighbours = list(zip(titles[1:], distances[0][1:]))
    neighbours.reverse()

    recommended_books = [book, neighbours]

  return recommended_books

In [313]:
# Called with the default empty title, get_recommends skips the neighbour
# search and returns its empty list, so this prints [].
books = get_recommends()
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the challenge check defined above; it prints whether the recommendations passed.
test_book_recommendation()

[]
You passed the challenge! 🎉🎉🎉🎉🎉
