<a href="https://colab.research.google.com/github/Bambani2003/Data_Science_Projects/blob/main/Recommender_Systems/Book_Recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **USING RECOMMENDER SYSTEMS TO SUGGEST BOOKS TO USERS.**

In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3156236 sha256=215e033437637018990c8cdd29669b9fd152a33efa560b923765ad1c514b70d8
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [2]:
# Import modules

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from google.colab import drive
drive.mount('/content/drive')
import warnings
warnings.filterwarnings('ignore')
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

Mounted at /content/drive


In [3]:
# Load the datasets

books = pd.read_csv('/content/drive/My Drive/Colab Notebooks/DataSets/goodbooks-10k-master/books.csv')
ratings = pd.read_csv('/content/drive/My Drive/Colab Notebooks/DataSets/goodbooks-10k-master/ratings.csv')
book_tags = pd.read_csv('/content/drive/My Drive/Colab Notebooks/DataSets/goodbooks-10k-master/book_tags.csv')
tags = pd.read_csv('/content/drive/My Drive/Colab Notebooks/DataSets/goodbooks-10k-master/tags.csv')

**EDA**

In [4]:
# Replacing NAN values and converting float to int

books['original_publication_year'] = books['original_publication_year'].fillna(-1).apply(lambda x: int(x) if x != -1 else -1)

In [5]:
# Removing duplicates and users with less than 3 ratings

ratings_rmv_duplicates = ratings.drop_duplicates()
unwanted_users = ratings_rmv_duplicates.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 3]
unwanted_ratings = ratings_rmv_duplicates[ratings_rmv_duplicates.user_id.isin(unwanted_users.index)]
new_ratings = ratings_rmv_duplicates.drop(unwanted_ratings.index)

In [6]:
# Add the title of the book to the new_ratings data set

new_ratings['title'] = books.set_index('book_id').title.loc[new_ratings.book_id].values
new_ratings.head(10)

Unnamed: 0,user_id,book_id,rating,title
0,1,258,5,The Shadow of the Wind (The Cemetery of Forgot...
1,2,4081,4,I am Charlotte Simmons
2,2,260,5,How to Win Friends and Influence People
3,2,9296,5,The Drama of the Gifted Child: The Search for ...
4,2,2318,3,The Millionaire Next Door: The Surprising Secr...
5,2,26,4,"The Da Vinci Code (Robert Langdon, #2)"
6,2,315,3,Who Moved My Cheese?
7,2,33,4,Memoirs of a Geisha
8,2,301,5,Heart of Darkness
9,2,2686,5,Blue Ocean Strategy: How To Create Uncontested...


**Simple Recommneder using Weighted Ratings**

In [7]:
# Weighted Rating (WR) =  (v/(v+m)*R)+(m/(v+m)*C)

v = books['ratings_count']
m = books['ratings_count'].quantile(0.95)
R = books['average_rating']
C = books['average_rating'].mean()
W = (R*v + C*m) / (v + m)
books['weighted_rating'] = W
qualified  = books.sort_values('weighted_rating', ascending=False).head(250)

In [8]:
# The highest rated books

qualified[['title', 'authors', 'average_rating', 'weighted_rating']].head(15)

Unnamed: 0,title,authors,average_rating,weighted_rating
24,Harry Potter and the Deathly Hallows (Harry Po...,"J.K. Rowling, Mary GrandPré",4.61,4.555956
26,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré",4.54,4.490428
17,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck",4.53,4.48509
23,Harry Potter and the Goblet of Fire (Harry Pot...,"J.K. Rowling, Mary GrandPré",4.53,4.483227
1,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",4.44,4.424365
20,Harry Potter and the Order of the Phoenix (Har...,"J.K. Rowling, Mary GrandPré",4.46,4.419054
30,The Help,Kathryn Stockett,4.45,4.405158
38,"A Game of Thrones (A Song of Ice and Fire, #1)",George R.R. Martin,4.45,4.398759
134,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,4.54,4.396645
421,"Harry Potter Boxset (Harry Potter, #1-7)",J.K. Rowling,4.74,4.391147


**Inference:**
* J.K. Rowling's Harry Potter Books are very highly rated.
* The chart also indicates a strong preference of users towards particular genres (adventure, fantasy, history) and authors (JK Rowling, Tolkien, RR Martin).

**Content Based Filtering and Recommender**

In [9]:
# Removing the spaces and converting authors to lowercase to avoid confusuion
books['authors'] = books['authors'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
books

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,weighted_rating
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,"[s, u, z, a, n, n, e, , c, o, l, l, i, n, s]",2008,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,4.328370
1,2,3,3,4640799,491,439554934,9.780440e+12,"[j, ., k, ., , r, o, w, l, i, n, g, ,, , m, a,...",1997,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,4.424365
2,3,41865,41865,3212258,226,316015849,9.780316e+12,"[s, t, e, p, h, e, n, i, e, , m, e, y, e, r]",2005,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,3.588247
3,4,2657,2657,3275794,487,61120081,9.780061e+12,"[h, a, r, p, e, r, , l, e, e]",1960,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,4.237463
4,5,4671,4671,245494,1356,743273567,9.780743e+12,"[f, ., , s, c, o, t, t, , f, i, t, z, g, e, r,...",1925,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,3.896700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,7130616,7130616,7392860,19,441019455,9.780441e+12,"[i, l, o, n, a, , a, n, d, r, e, w, s]",2010,Bayou Moon,...,18856,1180,105,575,3538,7860,6778,https://images.gr-assets.com/books/1307445460m...,https://images.gr-assets.com/books/1307445460s...,4.010241
9996,9997,208324,208324,1084709,19,067973371X,9.780680e+12,"[r, o, b, e, r, t, , a, ., , c, a, r, o]",1990,Means of Ascent,...,12952,395,303,551,1737,3389,6972,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,4.019225
9997,9998,77431,77431,2393986,60,039330762X,9.780393e+12,"[p, a, t, r, i, c, k, , o, ', b, r, i, a, n]",1977,The Mauritius Command,...,10733,374,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,4.020407
9998,9999,8565083,8565083,13433613,7,61711527,9.780062e+12,"[p, e, g, g, y, , o, r, e, n, s, t, e, i, n]",2011,Cinderella Ate My Daughter: Dispatches from th...,...,11994,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m...,https://images.gr-assets.com/books/1279214118s...,3.980333


In [10]:
# Combining books with their corresponding genres

def get_genres(x):
    t = book_tags[book_tags.goodreads_book_id==x]
    return [i.lower().replace(" ", "") for i in tags.tag_name.loc[t.tag_id].values]

books['genres'] = books.book_id.apply(get_genres)
books['soup'] = books.apply(lambda x: ' '.join([x['title']] + x['authors'] + x['genres']), axis=1)
books.soup.head()

0    The Hunger Games (The Hunger Games, #1) s u z ...
1    Harry Potter and the Sorcerer's Stone (Harry P...
2    Twilight (Twilight, #1) s t e p h e n i e  m e...
3             To Kill a Mockingbird h a r p e r  l e e
4    The Great Gatsby f .  s c o t t  f i t z g e r...
Name: soup, dtype: object

In [11]:
# Count Vectorizer to create count matrix.

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(books['soup'])

In [12]:
# using Cosine Similarity to calculate the similarity between two books
# cosine(x,y)=(x.y⊺)/(||x||.||y||)

cosine_sim = cosine_similarity(count_matrix, count_matrix)
indices = pd.Series(books.index, index=books['title'])
titles = books['title']
def get_recommendations(title, n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    book_indices = [i[0] for i in sim_scores]
    return list(titles.iloc[book_indices].values)[:n]
get_recommendations("The E-Myth Revisited: Why Most Small Businesses Don't Work and What to Do About It")

["Don't You Cry",
 "Don't Go",
 'The Last Don',
 'Another Fine Myth (Myth Adventures, #1)',
 'The Power of Myth',
 'Small Island',
 "Don't Look Back",
 'The Beauty Myth',
 "Don't Let Me Go",
 'A Work in Progress']

**Till now, our recommender system included only the ratings. Now we shall implement recommendations based on popularity and good critical responses**

In [13]:
# Take the top 30 books based on similarity scores and calculate the vote of the 60th percentile book.
# Use this as the value of m, to calculate the weighted rating of each book using IMDB's formula

def improved_recommendations(title, n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    book_indices = [i[0] for i in sim_scores]
    df = books.iloc[book_indices][['title', 'ratings_count', 'average_rating', 'weighted_rating']]

    v = df['ratings_count']
    m = df['ratings_count'].quantile(0.60)
    R = df['average_rating']
    C = df['average_rating'].mean()
    df['weighted_rating'] = (R*v + C*m) / (v + m)

    qualified = df[df['ratings_count'] >= m]
    qualified = qualified.sort_values('weighted_rating', ascending=False)
    return qualified.head(n)

In [14]:
improved_recommendations("The E-Myth Revisited: Why Most Small Businesses Don't Work and What to Do About It")

Unnamed: 0,title,ratings_count,average_rating,weighted_rating
1498,Small Great Things,73745,4.35,4.245367
993,All Creatures Great and Small (All Creatures G...,64779,4.31,4.208545
2902,The Power of Myth,32337,4.3,4.151534
3662,"Squire (Protector of the Small, #3)",37110,4.27,4.145598
3496,"First Test (Protector of the Small, #1)",40036,4.24,4.133825
4193,"Page (Protector of the Small, #2)",33405,4.22,4.112543
284,"The Rosie Project (Don Tillman, #1)",251703,4.01,4.008572
1329,Don't Sweat the Small Stuff ... and it's all s...,63324,3.95,3.965469
1755,M.C. Escher: The Graphic Work,50047,3.93,3.955661
1448,Notes from a Small Island,66947,3.91,3.937576


**Collaborative Filtering and Recommendations**

In [15]:
# Creating the model for the dataset

reader = Reader()
data = Dataset.load_from_df(new_ratings[['user_id', 'book_id', 'rating']], reader)
svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset);

In [16]:
# Ratings given by user 10

new_ratings[new_ratings['user_id'] == 10]

Unnamed: 0,user_id,book_id,rating,title
3734,10,11,3,The Kite Runner
3735,10,5084,2,The Sheltering Sky
3736,10,883,4,The Blind Assassin
3737,10,217,4,"The Devil in the White City: Murder, Magic, an..."
3738,10,103,5,The Count of Monte Cristo
...,...,...,...,...
5162625,10,638,4,A Visit from the Goon Squad
5162627,10,58,3,The Adventures of Huckleberry Finn
5520325,10,141,4,The Martian
5665508,10,2848,4,A God in Ruins


In [17]:
# Estimated rating of user 10 for book 1506

rating = svd.predict(10, 1206)
book_name=books['title'][books["book_id"]==1206].values[0]
print(f"User 10's approximate rating for the book '{book_name}' with id 1506 is {rating[3]:.1f}/5")

User 10's approximate rating for the book 'By the River Piedra I Sat Down and Wept' with id 1506 is 3.0/5


**Mixed Recommender**

In [18]:
# Returns 10 most similar books sorted by expected ratings for a particular user

def hybrid(user_id, title, n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    book_indices = [i[0] for i in sim_scores]

    df = books.iloc[book_indices][['book_id', 'title', 'original_publication_year', 'ratings_count', 'average_rating']]
    df['est'] = df['book_id'].apply(lambda x: svd.predict(user_id, x).est)
    df = df.sort_values('est', ascending=False)
    return df.head(n)

In [19]:
hybrid(4, 'Eat, Pray, Love', 5)

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,est
4593,4594,Love Warrior,2016,20094,4.1,4.169578
3705,3706,"The Opportunist (Love Me with Lies, #1)",2011,38511,4.22,4.060433
503,504,Redeeming Love,1991,157506,4.48,4.036928
4959,4960,Love Is a Dog from Hell,1977,19471,4.16,4.002153
8136,8137,F*ck Love,2015,12203,4.13,3.982231


In [20]:
hybrid(10, 'The Help', 5)

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,est
6595,6596,"Murder of Crows (The Others, #2)",2014,22994,4.31,4.435315
2742,2743,Nimona,2015,44825,4.2,4.347711
5448,5449,The Happiest Baby on the Block: The New Way to...,2002,13035,3.92,4.306917
5751,5752,"Preacher, Volume 9: Alamo",2001,17930,4.33,4.242499
2611,2612,"A Suitable Boy (A Suitable Boy, #1)",1993,33973,4.1,4.196108


In [21]:
# Including content based recommendations in our hybrid

def improved_hybrid(user_id, title, n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    book_indices = [i[0] for i in sim_scores]

    df = books.iloc[book_indices][['book_id', 'title', 'ratings_count', 'average_rating', 'original_publication_year']]
    v = df['ratings_count']
    m = df['ratings_count'].quantile(0.60)
    R = df['average_rating']
    C = df['average_rating'].mean()
    df['weighted_rating'] = (R*v + C*m) / (v + m)

    df['est'] = df['book_id'].apply(lambda x: svd.predict(user_id, x).est)

    df['score'] = (df['est'] + df['weighted_rating']) / 2
    df = df.sort_values('score', ascending=False)
    return df[['book_id', 'title', 'original_publication_year', 'ratings_count', 'average_rating', 'score']].head(n)

In [22]:
improved_hybrid(4, 'Eat, Pray, Love', 5)

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,score
503,504,Redeeming Love,1991,157506,4.48,4.211355
4593,4594,Love Warrior,2016,20094,4.1,4.087676
3705,3706,"The Opportunist (Love Me with Lies, #1)",2011,38511,4.22,4.076894
862,863,Guess How Much I Love You,1988,104690,4.36,4.074843
1129,1130,Ugly Love,2014,107583,4.3,4.01738


In [23]:
improved_hybrid(10, 'The Help', 5)

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,score
6595,6596,"Murder of Crows (The Others, #2)",2014,22994,4.31,4.269445
2742,2743,Nimona,2015,44825,4.2,4.223179
5751,5752,"Preacher, Volume 9: Alamo",2001,17930,4.33,4.167238
5448,5449,The Happiest Baby on the Block: The New Way to...,2002,13035,3.92,4.139527
2611,2612,"A Suitable Boy (A Suitable Boy, #1)",1993,33973,4.1,4.117499
