<a href="https://colab.research.google.com/github/GMayumi/FreeCodeCamp/blob/main/fcc_book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [209]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [210]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-11-11 10:01:35--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.2’


2024-11-11 10:01:36 (202 MB/s) - ‘book-crossings.zip.2’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Y
  inflating: BX-Book-Ratings.csv     
replace BX-Books.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: BX-Books.csv            
replace BX-Users.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Y
  inflating: BX-Users.csv            


In [211]:
# Read the ';'-separated, ISO-8859-1 encoded CSV files into dataframes.
# header=0 combined with names= replaces the header row found in each file with
# the short column names listed here; only those columns are kept.
book_columns = ['isbn', 'title', 'author']
rating_columns = ['user', 'isbn', 'rating']

df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=book_columns,
    usecols=book_columns,
    dtype=dict.fromkeys(book_columns, 'str'),  # every book column is read as text
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=rating_columns,
    usecols=rating_columns,
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'},
)

In [220]:
# Quick look at the two raw dataframes.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
print("============================== df_books head ==============================")
print(df_books.head())
print("\n============================== df_books info ==============================")
df_books.info()
print("\n============================== df_ratings head ==============================")
print(df_ratings.head())
print("\n============================== df_ratings info ==============================")
df_ratings.info()

         isbn                                              title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 author  
0    Mark P. O. Morford  
1  Richard Bruce Wright  
2          Carlo D'Este  
3      Gina Bari Kolata  
4       E. J. W. Barber  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   isbn    271379 non-null  object
 1   title   271379 non-null  object
 2   author  271377 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB
None

     user        isbn  rating
0  276725  034545104X     0.0
1  276726  0155061224     5.0
2  

In [221]:
# To ensure statistical significance, keep only users with at least 200 ratings
# and books with at least 100 ratings.
MIN_RATINGS_PER_USER = 200
MIN_RATINGS_PER_BOOK = 100

# NOTE: despite its name this is an alias of df_ratings, not a copy. That is safe
# here because nothing below modifies it in place.
df_ratings_copy = df_ratings

# Both counts are taken over the full, unfiltered ratings table.
count_users = df_ratings_copy["user"].value_counts()
count_ratings = df_ratings_copy["isbn"].value_counts()

mask_users = df_ratings_copy["user"].isin(count_users[count_users >= MIN_RATINGS_PER_USER].index)
mask_ratings = df_ratings_copy["isbn"].isin(count_ratings[count_ratings >= MIN_RATINGS_PER_BOOK].index)

# Keep rows that satisfy both thresholds, with at most one row per (isbn, user) pair.
df_ratings_new = df_ratings_copy[mask_users & mask_ratings].drop_duplicates(["isbn", "user"])

print("\n============================== df_ratings_new head ==============================")
print(df_ratings_new.head())
print("\n============================== df_ratings_new info ==============================")
# info() prints its own report and returns None; print(info()) would add a stray "None".
df_ratings_new.info()
print("============================== df_ratings_new more info ==============================")
print("users = ", df_ratings_new["user"].nunique(), ", isbns = ", df_ratings_new["isbn"].nunique())


        user        isbn  rating
1456  277427  002542730X    10.0
1469  277427  0060930535     0.0
1471  277427  0060934417     0.0
1474  277427  0061009059     9.0
1484  277427  0140067477     0.0

<class 'pandas.core.frame.DataFrame'>
Index: 49781 entries, 1456 to 1147441
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    49781 non-null  int32  
 1   isbn    49781 non-null  object 
 2   rating  49781 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 1.1+ MB
None
users =  888 , isbns =  731


In [222]:
# Replace each rating's isbn with the corresponding book title.
# how="left" keeps every rating row; an isbn with no match in df_books ends up
# with a missing (NaN) title.
df_ratings_with_title = pd.merge(df_ratings_new, df_books, on="isbn", how="left")
df_ratings_with_title = df_ratings_with_title[["user", "rating", "title"]]

print("============================== df_ratings_with_title head ==============================")
print(df_ratings_with_title.head())
print("\n============================== df_ratings_with_title info ==============================")
# info() prints its own report and returns None; print(info()) would add a stray "None".
df_ratings_with_title.info()

     user  rating                                              title
0  277427    10.0  Politically Correct Bedtime Stories: Modern Ta...
1  277427     0.0                      The Poisonwood Bible: A Novel
2  277427     0.0                                 Bel Canto: A Novel
3  277427     9.0  One for the Money (Stephanie Plum Novels (Pape...
4  277427     0.0                                    The Tao of Pooh

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49781 entries, 0 to 49780
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    49781 non-null  int32  
 1   rating  49781 non-null  float32
 2   title   49517 non-null  object 
dtypes: float32(1), int32(1), object(1)
memory usage: 778.0+ KB
None


In [223]:
# Reshape for NearestNeighbors: one row per title, one column per user, the cell
# holding that user's rating of that title. (title, user) pairs without a rating
# are filled with 0.
df = df_ratings_with_title.pivot_table(index=['title'],columns=['user'],values='rating').fillna(0)

print("============================== df head ==============================")
print(df.head())
print("\n============================== df info ==============================")
# info() prints its own report and returns None; print(info()) would add a stray "None".
df.info()

user                                                254     2276    2766    \
title                                                                        
1984                                                   9.0     0.0     0.0   
1st to Die: A Novel                                    0.0     0.0     0.0   
2nd Chance                                             0.0    10.0     0.0   
4 Blondes                                              0.0     0.0     0.0   
A Beautiful Mind: The Life of Mathematical Geni...     0.0     0.0     0.0   

user                                                2977    3363    4017    \
title                                                                        
1984                                                   0.0     0.0     0.0   
1st to Die: A Novel                                    0.0     0.0     0.0   
2nd Chance                                             0.0     0.0     0.0   
4 Blondes                                              0.0     

In [225]:
# Brute-force nearest-neighbour search over the rows of df (one row per title),
# compared by cosine distance.
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(df.to_numpy())

In [235]:
# function to return recommended books - this will be tested
def get_recommends(book = "", n_recommendations = 5):
  """Return the books closest to `book` according to the fitted `model`.

  Relies on the notebook-level `df` (title x user rating table) and `model`
  (NearestNeighbors fitted on `df.values`, so neighbour indices are row
  positions in `df`).

  Args:
    book: title to look up; must be one of the row labels of `df`.
    n_recommendations: maximum number of recommendations to return (default 5).

  Returns:
    A two-element list `[book, neighbours]` where `neighbours` is a list of
    `[title, distance]` pairs ordered from the largest distance to the smallest.

  Raises:
    KeyError: if `book` is not a row label of `df`.
    ValueError: if `n_recommendations` is smaller than 1.
  """
  if n_recommendations < 1:
    raise ValueError("n_recommendations must be at least 1")
  if book not in df.index:
    raise KeyError(f"Unknown book title: {book!r}")

  # Ask for one extra neighbour because the queried row is normally returned as
  # its own nearest neighbour; never ask for more rows than were fitted.
  n_neighbors = min(n_recommendations + 1, len(df.index))
  distance, index = model.kneighbors([df.loc[book].values], n_neighbors=n_neighbors)

  recommended_books = [
    [df.index[row], dist]
    for row, dist in zip(index[0], distance[0])
    if df.index[row] != book
  ]
  # If the queried row was not among its own neighbours (e.g. other rows tie with
  # it), the filter above removed nothing, so trim to the requested size.
  recommended_books = recommended_books[:n_recommendations]

  # kneighbors returns nearest first; the result lists the farthest first.
  return [book, recommended_books[::-1]]
get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")

['The Queen of the Damned (Vampire Chronicles (Paperback))',
 [['Catch 22', 0.7939835],
  ['The Witching Hour (Lives of the Mayfair Witches)', 0.74486566],
  ['Interview with the Vampire', 0.73450685],
  ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.53763384],
  ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.51784116]]]

In [236]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """Check get_recommends() against the expected result and print the verdict.

  The first element of the result must be the queried title. Only the first
  two recommendations are inspected: each title must be one of the expected
  titles and each distance must be within 0.05 of the expected distance at the
  same position.
  """
  query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
  expected_titles = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  expected_dists = [0.8, 0.77, 0.77, 0.77]

  recommends = get_recommends(query_title)
  failed = recommends[0] != query_title
  for position in range(2):
    entry = recommends[1][position]
    if entry[0] not in expected_titles:
      failed = True
    if abs(entry[1] - expected_dists[position]) >= 0.05:
      failed = True

  if failed:
    print("You haven't passed yet. Keep trying!")
  else:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016211], ['The Weight of Water', 0.77085835], ['The Surgeon', 0.7699411], ['I Know This Much Is True', 0.7677075], ['The Lovely Bones: A Novel', 0.7234864]]]
You passed the challenge! 🎉🎉🎉🎉🎉
