**BOOK RECOMMENDATION**


## Libraries and Utilities

In [43]:
import os
import re
import nltk
import requests
import warnings
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download("punkt")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from PIL import Image
warnings.filterwarnings('ignore')
for dirname, _, filenames in os.walk('/kaggle/input'):
  for filename in filenames:
    print(os.path.join(dirname, filename))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load and Check Data

In [5]:
! pip install -q kaggle

In [6]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"aakaashkb","key":"293308a7ac55607d4423515f7959c976"}'}

In [7]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [8]:
! chmod 600 ~/.kaggle/kaggle.json

In [11]:
! kaggle datasets download "ruchi798/bookcrossing-dataset"

Downloading bookcrossing-dataset.zip to /content
 80% 61.0M/76.1M [00:00<00:00, 89.6MB/s]
100% 76.1M/76.1M [00:00<00:00, 109MB/s] 


In [12]:
! mkdir data
! unzip bookcrossing-dataset.zip -d data

Archive:  bookcrossing-dataset.zip
  inflating: data/Book reviews/Book reviews/BX-Book-Ratings.csv  
  inflating: data/Book reviews/Book reviews/BX-Users.csv  
  inflating: data/Book reviews/Book reviews/BX_Books.csv  
  inflating: data/Books Data with Category Language and Summary/Preprocessed_data.csv  


In [13]:
books = pd.read_csv('/content/data/Books Data with Category Language and Summary/Preprocessed_data.csv')
books.head(3)

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l,Summary,Language,Category,city,state,country
0,0,2,"stockton, california, usa",18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,Provides an introduction to classical myths pl...,en,['Social Science'],stockton,california,usa
1,1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
2,2,11400,"ottawa, ontario, canada",49.0,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],ottawa,ontario,canada


## Preprocessing

In [14]:
df = books.copy()
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.drop(columns = ['Unnamed: 0','location','isbn','img_s','img_m','city','age','state','Language','country',
                  'year_of_publication'],axis=1,inplace = True) #remove useless cols
df.drop(index=df[df['Category'] == '9'].index, inplace=True) #remove 9 in category
df.drop(index=df[df['rating'] == 0].index, inplace=True) #remove 0 in rating
df['Category'] = df['Category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())
df.head(2)

Unnamed: 0,user_id,rating,book_title,book_author,publisher,img_l,Summary,Category
1,8,5,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",Actresses
4,67544,8,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",Actresses


## Item-Based Collaborative Filtering

In [34]:
def item_based_recommender(book_title):

  book_title = str(book_title)

  if book_title in df['book_title'].values:
    rating_counts = pd.DataFrame(df['book_title'].value_counts())
    rare_books = rating_counts[rating_counts['book_title'] <= 180].index
    common_books = df[~df['book_title'].isin(rare_books)]

    if book_title in rare_books:
      random = pd.Series(common_books['book_title'].unique()).sample(2).values
      print('There are no recommendations for this book')
      print('Try: \n')
      print('{}'.format(random[0]),'\n')
      print('{}'.format(random[1]),'\n')
    else:
      user_book_df = common_books.pivot_table(index=['user_id'],
                                    columns=['book_title'],
                                    values='rating')

    book = user_book_df[book_title]
    recom_data = pd.DataFrame(user_book_df.corrwith(book).sort_values(ascending=False)).reset_index(drop=False)

    if book_title in [book for book in recom_data['book_title']]:
      recom_data = recom_data.drop(recom_data[recom_data['book_title'] == book_title].index[0])

    low_rating = []
    for i in recom_data['book_title']:
      if df[df['book_title'] == i]['rating'].mean() < 5:
        low_rating.append(i)

    if recom_data.shape[0] - len(low_rating) > 5:
      recom_data = recom_data[~recom_data['book_title'].isin(low_rating)]

    recom_data = recom_data[0:5]
    recom_data.columns = ['book_title','corr']
    print("Suggested Books: ")
    for i in range(len(recom_data['book_title'].tolist())):
        rating = round(df[df['book_title'] == recom_data['book_title'].tolist()[i]]['rating'].mean(),1)
        print(f"--> {recom_data['book_title'].tolist()[i]} (Rating :{rating})")

  else:
    print('Cant find book in dataset, please check spelling')



## Content-Based Collaborative Filtering (Based on Title, Author, Publisher, Category)

In [36]:
def content_based_recommender(book_title):
  book_title = str(book_title)

  if book_title in df['book_title'].values:
    rating_counts = pd.DataFrame(df['book_title'].value_counts())
    rare_books = rating_counts[rating_counts['book_title'] <= 100].index
    common_books = df[~df['book_title'].isin(rare_books)]

    if book_title in rare_books:
      random = pd.Series(common_books['book_title'].unique()).sample(2).values
      print('There are no recommendations for this book')
      print('Try: \n')
      print('{}'.format(random[0]),'\n')
      print('{}'.format(random[1]),'\n')

    else:
      common_books = common_books.drop_duplicates(subset=['book_title'])
      common_books.reset_index(inplace= True)
      common_books['index'] = [i for i in range(common_books.shape[0])]
      target_cols = ['book_title','book_author','publisher','Category']
      common_books['combined_features'] = [''.join(common_books[target_cols].iloc[i,].values) for i in range(common_books[target_cols].shape[0])]
      cv = CountVectorizer()
      count_matrix = cv.fit_transform(common_books['combined_features'])

      cosine_sim = cosine_similarity(count_matrix)
      index = common_books[common_books['book_title'] == book_title]['index'].values[0]
      sim_books = list(enumerate(cosine_sim[index]))
      sorted_sim_books = sorted(sim_books,key=lambda x:x[1],reverse=True)[1:6]

      books = []
      for i in range(len(sorted_sim_books)):
        books.append(common_books[common_books['index'] == sorted_sim_books[i][0]]['book_title'].item())

      print("Suggested Books: ")
      for i in range(len(books)):
        rating = round(df[df['book_title'] == books[i]]['rating'].mean(),1)
        print(f"--> {books[i]} (Rating :{rating})")

  else:
    print('Cant find book in dataset, please check spelling')

## Content-Based Collaborative Filtering (Based on Summary)

In [46]:
def content_based_recommender_summary(book_title):
  book_title = str(book_title)

  if book_title in df['book_title'].values:
    rating_counts = pd.DataFrame(df['book_title'].value_counts())
    rare_books = rating_counts[rating_counts['book_title'] <= 100].index
    common_books = df[~df['book_title'].isin(rare_books)]

    if book_title in rare_books:
      random = pd.Series(common_books['book_title'].unique()).sample(2).values
      print('There are no recommendations for this book')
      print('Try: \n')
      print('{}'.format(random[0]),'\n')
      print('{}'.format(random[1]),'\n')

    else:
      common_books = common_books.drop_duplicates(subset=['book_title'])
      common_books.reset_index(inplace= True)
      common_books['index'] = [i for i in range(common_books.shape[0])]

      summary_filtered = []
      for i in common_books['Summary']:
        i = re.sub("[^a-zA-Z]"," ",i).lower()
        i = nltk.word_tokenize(i)
        i = [word for word in i if not word in set(stopwords.words("english"))]
        i = " ".join(i)
        summary_filtered.append(i)

      common_books['Summary'] = summary_filtered
      cv = CountVectorizer()
      count_matrix = cv.fit_transform(common_books['Summary'])
      cosine_sim = cosine_similarity(count_matrix)
      index = common_books[common_books['book_title'] == book_title]['index'].values[0]
      sim_books = list(enumerate(cosine_sim[index]))
      sorted_sim_books = sorted(sim_books,key=lambda x:x[1],reverse=True)[1:6]

      books = []
      for i in range(len(sorted_sim_books)):
        books.append(common_books[common_books['index'] == sorted_sim_books[i][0]]['book_title'].item())

      print("Suggested Books: ")
      for i in range(len(books)):
        rating = round(df[df['book_title'] == books[i]]['rating'].mean(),1)
        print(f"--> {books[i]} (Rating :{rating})")


  else:
    print('Cant find book in dataset, please check spelling')

## Custom Recommender

In [40]:
def custom_recommender(book_title):

  book_title = str(book_title)
  if book_title in df['book_title'].values:
    rating_counts = pd.DataFrame(df['book_title'].value_counts())
    rare_books = rating_counts[rating_counts['book_title'] <= 180].index
    common_books = df[~df['book_title'].isin(rare_books)]

    if book_title in rare_books:
      random = pd.Series(common_books['book_title'].unique()).sample(2).values
      print('There are no recommendations for this book')
      print('Try: \n')
      print('{}'.format(random[0]),'\n')
      print('{}'.format(random[1]),'\n')

    else:
      #ITEM-BASED
      user_book_df = common_books.pivot_table(index=['user_id'],columns=['book_title'], values='rating')
      book = user_book_df[book_title]
      recom_data = pd.DataFrame(user_book_df.corrwith(book).sort_values(ascending=False)).reset_index(drop=False)

      if book_title in [book for book in recom_data['book_title']]:
        recom_data = recom_data.drop(recom_data[recom_data['book_title'] == book_title].index[0])

      low_rating = []
      for i in recom_data['book_title']:
        if df[df['book_title'] == i]['rating'].mean() < 5:
          low_rating.append(i)

      if recom_data.shape[0] - len(low_rating) > 5:
        recom_data = recom_data[~recom_data['book_title'].isin(low_rating)]

      recom_data = recom_data[0:1]
      recom_data.columns = ['book_title','corr']
      recommended_books = []
      for i in recom_data['book_title']:
        recommended_books.append(i)

      df_new = df[~df['book_title'].isin(recommended_books)]

      #CONTENT-BASED (Title, Author, Publisher, Category)
      rating_counts = pd.DataFrame(df_new['book_title'].value_counts())
      rare_books = rating_counts[rating_counts['book_title'] <= 100].index

      common_books = df_new[~df_new['book_title'].isin(rare_books)]
      common_books = common_books.drop_duplicates(subset=['book_title'])
      common_books.reset_index(inplace= True)
      common_books['index'] = [i for i in range(common_books.shape[0])]
      target_cols = ['book_title','book_author','publisher','Category']
      common_books['combined_features'] = [''.join(common_books[target_cols].iloc[i,].values) for i in range(common_books[target_cols].shape[0])]
      cv = CountVectorizer()
      count_matrix = cv.fit_transform(common_books['combined_features'])
      cosine_sim = cosine_similarity(count_matrix)
      index = common_books[common_books['book_title'] == book_title]['index'].values[0]
      sim_books = list(enumerate(cosine_sim[index]))
      sorted_sim_books = sorted(sim_books,key=lambda x:x[1],reverse=True)[1:2]

      books = []
      for i in range(len(sorted_sim_books)):
        books.append(common_books[common_books['index'] == sorted_sim_books[i][0]]['book_title'].item())

      for i in books:
        recommended_books.append(i)

      df_new = df_new[~df_new['book_title'].isin(recommended_books)]

      #CONTENT-BASED (Summary)

      rating_counts = pd.DataFrame(df_new['book_title'].value_counts())
      rare_books = rating_counts[rating_counts['book_title'] <= 100].index
      common_books = df_new[~df_new['book_title'].isin(rare_books)]

      common_books = common_books.drop_duplicates(subset=['book_title'])
      common_books.reset_index(inplace= True)
      common_books['index'] = [i for i in range(common_books.shape[0])]

      summary_filtered = []
      for i in common_books['Summary']:
        i = re.sub("[^a-zA-Z]"," ",i).lower()
        i = nltk.word_tokenize(i)
        i = [word for word in i if not word in set(stopwords.words("english"))]
        i = " ".join(i)
        summary_filtered.append(i)

      common_books['Summary'] = summary_filtered
      cv = CountVectorizer()
      count_matrix = cv.fit_transform(common_books['Summary'])
      cosine_sim = cosine_similarity(count_matrix)
      index = common_books[common_books['book_title'] == book_title]['index'].values[0]
      sim_books = list(enumerate(cosine_sim[index]))
      sorted_sim_books2 = sorted(sim_books,key=lambda x:x[1],reverse=True)[1:4]
      sorted_sim_books = sorted_sim_books2[:2]

      summary_books = []
      for i in range(len(sorted_sim_books)):
        summary_books.append(common_books[common_books['index'] == sorted_sim_books[i][0]]['book_title'].item())
      for i in summary_books:
        recommended_books.append(i)

      df_new = df_new[~df_new['book_title'].isin(recommended_books)]

      #TOP RATED OF CATEGORY

      category = common_books[common_books['book_title'] ==
      book_title]['Category'].values[0]
      top_rated = common_books[common_books['Category'] ==
      category].groupby('book_title').agg({'rating':'mean'}).reset_index()
      if top_rated.shape[0] == 1:
        recommended_books.append(common_books[common_books['index'] ==
        sorted_sim_books2[2][0]]['book_title'].item())

      else:
        top_rated.drop(top_rated[top_rated['book_title'] == book_title].index[0],inplace=True)
        top_rated = top_rated.sort_values('rating',ascending=False).iloc[:1]['book_title'].values[0]
        recommended_books.append(top_rated)

      print("Suggested Books: ")
      for i in range(len(recommended_books)):
        rating = round(df[df['book_title'] == recommended_books[i]]['rating'].mean(),1)
        print(f"--> {recommended_books[i]} (Rating :{rating})")


  else:
    print('Cant find book in dataset, please check spelling')

## Recommendations

In [35]:
item_based_recommender("Girl with a Pearl Earring")

Suggested Books: 
--> Timeline (Rating :7.8)
--> The Street Lawyer (Rating :7.6)
--> A Time to Kill (Rating :8.0)
--> The Testament (Rating :7.5)
--> The Fellowship of the Ring (The Lord of the Rings, Part 1) (Rating :8.9)


In [37]:
content_based_recommender("Girl with a Pearl Earring")

Suggested Books: 
--> Interview with the Vampire (Rating :7.8)
--> Airframe (Rating :7.5)
--> Timeline (Rating :7.8)
--> Chocolat (Rating :8.0)
--> Sphere (Rating :7.4)


In [47]:
content_based_recommender_summary("Girl with a Pearl Earring")

Suggested Books: 
--> The Book of Ruth (Oprah's Book Club (Paperback)) (Rating :7.5)
--> The Girls' Guide to Hunting and Fishing (Rating :6.9)
--> The God of Small Things (Rating :7.7)
--> SHIPPING NEWS (Rating :7.7)
--> Empire Falls (Rating :7.6)


In [45]:
custom_recommender("Girl with a Pearl Earring")

Suggested Books: 
--> Timeline (Rating :7.8)
--> Interview with the Vampire (Rating :7.8)
--> The Book of Ruth (Oprah's Book Club (Paperback)) (Rating :7.5)
--> The Girls' Guide to Hunting and Fishing (Rating :6.9)
--> Suzanne's Diary for Nicholas (Rating :7.6)
