In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel


In [2]:
# read csv file

df = pd.read_csv("data/cleaned_books.csv")

In [3]:
# display dataframe

df.head()

Unnamed: 0.1,Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,ratings_count,text_reviews_count,publication_date,publisher,categories,thumbnail,description,published_year,num_pages
0,0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780440000000.0,eng,2095690,27591,9/16/2006,Scholastic Inc.,Juvenile Fiction,http://books.google.com/books/content?id=QzI0B...,When Harry Potter and the Half-Blood Prince op...,2015.0,652.0
1,1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780440000000.0,eng,2153167,29221,9/1/2004,Scholastic Inc.,Juvenile Fiction,http://books.google.com/books/content?id=OIJ5B...,"In Harry Potter and the Order of the Phoenix, ...",2015.0,870.0
2,2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780440000000.0,eng,6333,244,11/1/2003,Scholastic,Juvenile Fiction,http://books.google.com/books/content?id=h2Y-P...,When the Chamber of Secrets is opened again at...,2003.0,352.0
3,3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780440000000.0,eng,2339585,36325,5/1/2004,Scholastic Inc.,Juvenile Fiction,http://books.google.com/books/content?id=IZN5B...,"For twelve long years, the dread fortress of A...",2015.0,435.0
4,4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780440000000.0,eng,41428,164,9/13/2004,Scholastic,Juvenile Fiction,http://books.google.com/books/content?id=DAAAA...,The first five years of Harry Potter magic are...,2004.0,2690.0


In [16]:
# combine title, authors, & publisher columns into one

df['all'] = df['title'] + df['authors'] + df['publisher'] + df['categories']

In [17]:
# set vectorizer as TFIDvectorizer from sklearn
# this will convert our collection of title/author/publisher documents into a matrix of features using the TF-IDF formula
# set to analyze words 

vectorizer = TfidfVectorizer(analyzer='word')

In [18]:
# fits & transforms using the column of title/author/publisher and the TFIDvectorizer
# returns a matrix of the documents & TF-IDF calculations

tfidf_all_content = vectorizer.fit_transform(df['all'])

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [10]:
tfidf_all_content

<5680x18102 sparse matrix of type '<class 'numpy.float64'>'
	with 52904 stored elements in Compressed Sparse Row format>

In [11]:
# comping cosine similarity matrix using linear_kernal of sklearn
cosine_similarity_all_content = linear_kernel(tfidf_all_content, tfidf_all_content)

In [12]:
books = df.reset_index(drop=True)

In [13]:
indices = pd.Series(books['title'].index)

In [20]:
#Function to get the most similar books
def recommend(index, method):
    id = indices[index]
    similarity_scores = list(enumerate(method[id]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:11]
    
    #Get the books index
    books_index = [i[0] for i in similarity_scores]
    
    #Return the top 10 most similar books
    return books['title'].iloc[books_index]

In [21]:
recommend(125, cosine_similarity_all_content)

54                                          Anna Karenina
2577                                        War and Peace
2575                                        War and Peace
2506                  Collected Shorter Fiction: Volume I
55                                 Tolstoy: Anna Karenina
1648                       The Gardens of Emily Dickinson
2578    War and Peace and War: The Rise and Fall of Em...
2042                                        Anna Karenina
3307                                         Paris Spleen
2936                The Forever War (The Forever War  #1)
Name: title, dtype: object