<a href="https://colab.research.google.com/github/FRED984/book_recommendations/blob/main/book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
 #get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-05-15 13:44:52--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2024-05-15 13:44:53 (44.4 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [4]:
#first we will merge the 2 dataframees based on the isbn
df=pd.merge(df_books,df_ratings,on='isbn')

df = df.drop_duplicates(subset='user', keep='last')

In [5]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Fit and transform the 'title' column to numerical values
df['title_encoded'] = label_encoder.fit_transform(df['title'])

# You can use 'title_encoded' column in your utility matrix creation or modeling


In [85]:
df.head()

Unnamed: 0,isbn,title,author,user,rating,title_encoded
0,195153448,Classical Mythology,Mark P. O. Morford,2,0.0,7743
9,2005018,Clara Callan,Richard Bruce Wright,123629,9.0,7716
48,399135782,The Kitchen God's Wife,Amy Tan,198013,7.0,42603
52,399135782,The Kitchen God's Wife,Amy Tan,212449,8.0,42603
61,399135782,The Kitchen God's Wife,Amy Tan,252921,8.0,42603


In [6]:
df = df.reset_index()

In [8]:
df.head()

Unnamed: 0,index,isbn,title,author,user,rating,title_encoded
0,0,195153448,Classical Mythology,Mark P. O. Morford,2,0.0,7743
1,9,2005018,Clara Callan,Richard Bruce Wright,123629,9.0,7716
2,48,399135782,The Kitchen God's Wife,Amy Tan,198013,7.0,42603
3,52,399135782,The Kitchen God's Wife,Amy Tan,212449,8.0,42603
4,61,399135782,The Kitchen God's Wife,Amy Tan,252921,8.0,42603


In [7]:
utility_matrix = csr_matrix((df['rating'], (df['user'], df['title_encoded'])),)

In [10]:
utility_matrix = pd.DataFrame.sparse.from_spmatrix(utility_matrix)

In [8]:
print(utility_matrix)

  (2, 7743)	0.0
  (8, 17071)	7.0
  (9, 29807)	0.0
  (10, 28273)	0.0
  (12, 19854)	10.0
  (14, 51673)	0.0
  (16, 48324)	0.0
  (17, 4365)	5.0
  (19, 31301)	7.0
  (20, 7460)	0.0
  (22, 37452)	0.0
  (23, 25051)	0.0
  (26, 34118)	9.0
  (32, 31424)	0.0
  (36, 46353)	0.0
  (39, 19647)	7.0
  (42, 15913)	7.0
  (44, 23465)	0.0
  (51, 46121)	9.0
  (53, 2930)	0.0
  (56, 17939)	9.0
  (64, 23876)	7.0
  (67, 42454)	0.0
  (68, 18064)	0.0
  (69, 37938)	8.0
  :	:
  (278789, 38129)	0.0
  (278795, 13099)	0.0
  (278796, 163)	0.0
  (278798, 12432)	0.0
  (278800, 11635)	9.0
  (278807, 27862)	10.0
  (278813, 13564)	0.0
  (278815, 30693)	0.0
  (278818, 52367)	10.0
  (278819, 5327)	0.0
  (278820, 7940)	3.0
  (278824, 46165)	6.0
  (278825, 46210)	5.0
  (278828, 46165)	8.0
  (278831, 47149)	10.0
  (278832, 7321)	10.0
  (278836, 17778)	8.0
  (278838, 45741)	0.0
  (278843, 44881)	0.0
  (278844, 36839)	6.0
  (278846, 5885)	8.0
  (278849, 34268)	0.0
  (278851, 9177)	10.0
  (278852, 9791)	8.0
  (278854, 21826)	7.0


In [11]:
# add your code here - consider creating a new cell for each section of code
Knn=NearestNeighbors(n_neighbors=5, algorithm='auto', metric='cosine')

In [12]:
Knn.fit(utility_matrix)

In [13]:
# function to return recommended books
def get_recommends(book=""):
    encoded_value = df.loc[df['title'] == book, 'title_encoded'].iloc[0]
    user_ratings = utility_matrix.toarray()[encoded_value].reshape(1, -1)
    _, indices = Knn.kneighbors(user_ratings)

    recommended_encoded_titles = []
    recommended_books = []
    for i in indices[0]:
        recommended_encoded_titles.extend(utility_matrix.toarray()[i][utility_matrix.toarray()[i] > 0])

    recommended_books_with_distance = []
    for title in recommended_encoded_titles:
        recommended_book = df.loc[df['title_encoded'] == title, 'title'].iloc[0]
        if recommended_book != book:
            recommended_books.append(recommended_book)
            recommended_books_with_distance.append((recommended_book, 0.0))  # Adding a placeholder distance

    return [book] + [recommended_books_with_distance[:4]]  # Return the input book followed by 4 recommended books and distances


In [52]:
print(utility_matrix.toarray()[51325])


[0. 0. 0. ... 0. 0. 0.]


In [14]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [("'48", 0.0), ("$oft Money: The True Power in Our Nation's Capital", 0.0), ("'A Hell of a Place to Lose a Cow': An American Hitchhiking Odyssey", 0.0), ("'A Hell of a Place to Lose a Cow': An American Hitchhiking Odyssey", 0.0)]]
You haven't passed yet. Keep trying!
