## Recommender System

In [213]:
import warnings
warnings.filterwarnings('ignore')
import os 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [214]:
file= 'books.csv'
# Load the dataset
df = pd.read_csv(file, on_bad_lines='skip')
df.head(3)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic


In [215]:
df.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11123.0,11123.0,11123.0,11123.0,11123.0,11123.0
mean,21310.86,3.93,9759880247639.15,336.41,17942.85,542.05
std,13094.73,0.35,442975846058.35,241.15,112499.15,2576.62
min,1.0,0.0,8987059752.0,0.0,0.0,0.0
25%,10277.5,3.77,9780345453803.5,192.0,104.0,9.0
50%,20287.0,3.96,9780582461536.0,299.0,745.0,47.0
75%,32104.5,4.14,9780872208045.5,416.0,5000.5,238.0
max,45641.0,5.0,9790007672386.0,6576.0,4597666.0,94265.0


####  Popularity-based Recommender - Create a function named Popularity Recommender and use it to recommend books based on popularity

In [216]:
def popularity_recommender(df, percentile = 0.9, top_n = 10):
    C = df['average_rating'].mean() # mean rating across all books
    m = df['average_rating'].quantile(percentile) # minimum number of votes to be considered
    qualified = df[df['ratings_count'] >= m].copy() # filter the qualifying books
 
    #compute weighted rating
    def weighted_rating(x, m=m, C=C):
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)
 
    qualified['score'] = qualified.apply(weighted_rating, axis=1) 
    qualified = qualified.sort_values('score', ascending=False) # sort books
    return qualified[['title','authors','average_rating','ratings_count','score']].head(top_n)

recommended_top_books = popularity_recommender(df, percentile=0.9, top_n=10)
print(recommended_top_books)

                                                   title                      authors  average_rating  ratings_count  score
6587                      The Complete Calvin and Hobbes               Bill Watterson            4.82          32213   4.82
4      Harry Potter Boxed Set  Books 1-5 (Harry Potte...   J.K. Rowling/Mary GrandPré            4.78          41428   4.78
6589       It's a Magical World (Calvin and Hobbes  #11)               Bill Watterson            4.76          23875   4.76
6           Harry Potter Collection (Harry Potter  #1-6)                 J.K. Rowling            4.73          28242   4.73
6590   Homicidal Psycho Jungle Cat (Calvin and Hobbes...               Bill Watterson            4.72          15365   4.72
6593   Calvin and Hobbes: Sunday Pages 1985-1995: An ...               Bill Watterson            4.71           3613   4.71
1848                                         Early Color  Saul Leiter/Martin Harrison            4.73            144   4.71
5614    

####  Content-based Recommender -Create a function named Content-based Recommender and use it to recommend books based on content.


In [217]:
#Instantiate a new Vectorizer object 
tfidf = TfidfVectorizer(stop_words='english') #stop_words='english' removes common words that do not contribute to the meaning of the text

#Replace NaN with an empty string and transform the titles of books in our dataset into the matrix
df['authors'] = df['authors'].fillna('')
tfidf_matrix = tfidf.fit_transform(df['authors'])

In [218]:
tfidf_matrix.shape 

(11123, 8448)

In [219]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [220]:
# Re-create the indices of our list of books by removing any duplicates if required
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [221]:
tfidf_matrix.size

38650

In [222]:
def ContentBasedRecommender(title, indices, distance_matrix):

    if title not in indices:
        raise ValueError(f"Title '{title}' not found in the dataset, please check the title and try again.")
    
    
    id_ = indices[title] #Fetch the index of the movie we will enter
    
    distances = list(enumerate(distance_matrix[id_])) 
    
    #sort by the distance function, which is in column[1]
    distances = sorted(distances, key=lambda x: x[1], reverse = True) 
    
    distances = distances[1:6] # Get the 5 best scores , not including itself
    print(distances)
    
    # get the indices of the top 5
    recommendations = [distance[0] for distance in distances] 
    
    # return those recommendation names by pulling title from the given 5 indices
    return df['title'].iloc[recommendations] 

In [223]:
ContentBasedRecommender('Study Bible: NIV', indices, cosine_sim)

[(924, np.float64(0.6365698852036981)), (925, np.float64(0.6365698852036981)), (5352, np.float64(0.6365698852036981)), (5353, np.float64(0.6365698852036981)), (926, np.float64(0.6202036099361221))]


924                                  Alcoholics Anonymous
925                                  Alcoholics Anonymous
5352                                The Epic of Gilgamesh
5353                                The Epic of Gilgamesh
926     The Twelve Steps & Twelve Traditions of Overea...
Name: title, dtype: object

In [224]:
ContentBasedRecommender('Rising from the Plains', indices, cosine_sim)

[(46, np.float64(1.0000000000000002)), (47, np.float64(1.0000000000000002)), (48, np.float64(1.0000000000000002)), (49, np.float64(1.0000000000000002)), (50, np.float64(1.0000000000000002))]


46         Heirs of General Practice
47             The Control of Nature
48        Annals of the Former World
49           Coming Into the Country
50    La Place de la Concorde Suisse
Name: title, dtype: object