In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

# Ignore futurewarnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/PostBooksEDA.csv', index_col=0)
df_encoded = pd.read_csv('../data/PostEncodedBooksEDA.csv', index_col=0)

# Content Based Recommendations

These recommendation engines are built around the idea if a user likes some item (or a particular basket of items) then they will like similar items based on the item content/description. If I watched the Avengers, then I probably would want to watch other superhero-themed movies. If we look at a few books along with their descriptions:

In [3]:
df_descriptions = df['book_title'].value_counts()
df_descriptions.head(10)

book_title
Wild Animus                                        2365
The Lovely Bones: A Novel                          1202
The Da Vinci Code                                   868
A Painted House                                     793
The Nanny Diaries: A Novel                          787
Bridget Jones's Diary                               772
The Secret Life of Bees                             740
Divine Secrets of the Ya-Ya Sisterhood: A Novel     714
The Red Tent (Bestselling Backlist)                 684
Angels & Demons                                     654
Name: count, dtype: int64

As our dataset have 1M rows and many unique titles we are going to use a sample of our dataframse

In [4]:
df = df.sample(n=320000, random_state=42)  # random_state for reproducibility

Because each one of our rows represents a review with not an unique book title we are going to group the dataframe by book_title to use the vectorizer.

In [5]:
df_filtered = df[df['rating'] > 0]

unique_titles = df_filtered.groupby('book_title').agg(
                            review_count=('rating', 'count'),
                            avg_review_score=('rating', 'mean')).reset_index()

In [6]:
vectorizer = TfidfVectorizer(stop_words = "english", min_df=2)
TF_IDF_matrix = vectorizer.fit_transform(unique_titles['book_title'])

In [7]:
TF_IDF_matrix.shape

(59261, 16665)

In [8]:
TF_IDF_matrix

<59261x16665 sparse matrix of type '<class 'numpy.float64'>'
	with 223396 stored elements in Compressed Sparse Row format>

In [9]:
unique_titles[unique_titles['book_title'].str.contains('Harry Potter', na=False)]

Unnamed: 0,book_title,review_count,avg_review_score
17953,Garri Potter i uznik Azkabana (Harry Potter an...,1,10.0
19682,Harry Potter E il Calice Di Fuoco / Harry Pott...,1,8.0
19683,Harry Potter E il Prigioniero D'Azkaban,1,8.0
19684,Harry Potter E la Camera Dei Segreti,2,6.0
19685,Harry Potter Et LA Coupe De Feu,3,7.0
19686,Harry Potter Schoolbooks Box Set: Two Classic ...,1,10.0
19687,Harry Potter Schoolbooks: Quidditch Through th...,3,7.0
19688,Harry Potter Und Der Feuerkelch,6,9.666667
19689,Harry Potter Y LA Camara Secreta,1,9.0
19690,Harry Potter and the Chamber of Secrets,1,10.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

book_1 = TF_IDF_matrix[ (unique_titles['book_title'] == 'Harry Potter and the Chamber of Secrets (Book 2)').values,  ]
book_2 = TF_IDF_matrix[ (unique_titles['book_title'] == 'Harry Potter and the Prisoner of Azkaban (Book 3)').values,  ]

print("Similarity:", cosine_similarity(book_1, book_2)) # Notice the result is a 2D 1X1 array, so to grab
                                                          # the number we will need to index   
                                                  

Similarity: [[0.46655868]]


Not only can we use the sklearn.metrics.pairwise.cosine_similiarity function to compute that between two different vectors, we can pass the entire tf-idf matrix into the function as a single argument and it will compute the similarity between each column and every other column, giving back a square matrix, where the entry at is the similarity between movie and (like a correlation matrix for features).

In [11]:
similarities = cosine_similarity(TF_IDF_matrix, dense_output=False)

In [12]:
# Check the shape
# rows and columns should be equal, and the number of movies we started with (rows)
similarities.shape

(59261, 59261)

Now that we can directly compare two movies and we can make recommendations of the form: if you like movie $a$ then you will also like movies $b$, $c$, $d$, $etc$.

We can do this just picking a candidate film and taking its column in the similarity matrix, and then finding those rows where the similarities are highest:

In [13]:
# Test with a sample movie
unique_titles[unique_titles['book_title'] == 'Harry Potter and the Chamber of Secrets (Book 2)']

Unnamed: 0,book_title,review_count,avg_review_score
19691,Harry Potter and the Chamber of Secrets (Book 2),98,8.72449


In [14]:
# Get the column based upon the index
unique_title_index = unique_titles[unique_titles['book_title'] == 'Harry Potter and the Chamber of Secrets (Book 2)'].index

# Create a dataframe with the movie titles
sim_df = pd.DataFrame({'book': unique_titles ['book_title'],
                       'similarity': np.array(similarities[unique_title_index, :].todense()).squeeze()})

In [15]:
# Return the top 10 most similar movies
sim_df.sort_values(by='similarity', ascending=True).head(10)

Unnamed: 0,book,similarity
0,A Light in the Storm: The Civil War Diary of ...,0.0
38715,Shutterbug Follies (Doubleday Graphic Novels),0.0
38716,Shuttlecock,0.0
38717,Shy Boy : The Horse That Came in From The WIld,0.0
38718,Shy Charlene and Sharyl,0.0
38719,Shy Little Kitten's Secret Place,0.0
38720,Shylock's Daughter,0.0
38721,Shyness : A Bold New Approach,0.0
38722,Si Hubiera UN Manana/If Tomorrow Comes,0.0
38723,Si Le Grain Ne Meurt: Memoires (Folio Series: ...,0.0


In [16]:
def content_recommender(title, books, similarities, vote_threshold=10) :

    # Get the movie by the title
    book_index = books[books['book_title'] == title].index

    # Create a dataframe with the movie titles
    sim_df = pd.DataFrame(
        {'book': books['book_title'],
         'similarity': np.array(similarities[book_index, :].todense()).squeeze(),
         'Number of reviews': books['review_count'],
         'Avg Rating': books['avg_review_score']
        })

    # Get the top 10 movies with > 10 votes
    top_books = sim_df[sim_df['Number of reviews'] > vote_threshold].sort_values(by='similarity', ascending=False).head(10)

    return top_books

In [19]:
# Test the recommender
similar_movies = content_recommender("Harry Potter and the Chamber of Secrets (Book 2)", unique_titles, similarities, vote_threshold=10)
similar_movies.head(10)

Unnamed: 0,book,similarity,Number of reviews,Avg Rating
19691,Harry Potter and the Chamber of Secrets (Book 2),1.0,98,8.72449
43852,The Chamber,0.584344,71,7.323944
19708,Harry Potter and the Sorcerer's Stone (Harry P...,0.570851,85,8.917647
19696,Harry Potter and the Goblet of Fire (Book 4),0.524118,73,9.287671
19706,Harry Potter and the Sorcerer's Stone (Book 1),0.497518,60,8.916667
19699,Harry Potter and the Order of the Phoenix (Boo...,0.496432,64,9.0625
19703,Harry Potter and the Prisoner of Azkaban (Book 3),0.466559,99,8.89899
17542,From Potter's Field,0.340878,30,7.933333
48839,The Murder Book,0.136553,32,7.90625
43337,The Book of Ruth (Oprah's Book Club (Paperback)),0.116924,39,7.153846
