In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel


In [2]:
# read csv file

df = pd.read_csv("data/final_cleaned.csv")

In [3]:
# display dataframe

df.head()

Unnamed: 0.1,Unnamed: 0,title,authors,isbn,publisher,categories,thumbnail
0,0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,0439785960,Scholastic Inc.,Juvenile Fiction,http://books.google.com/books/content?id=QzI0B...
1,1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,0439358078,Scholastic Inc.,Juvenile Fiction,http://books.google.com/books/content?id=OIJ5B...
2,2,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,0439554896,Scholastic,Juvenile Fiction,http://books.google.com/books/content?id=h2Y-P...
3,3,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,043965548X,Scholastic Inc.,Juvenile Fiction,http://books.google.com/books/content?id=IZN5B...
4,4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,0439682584,Scholastic,Juvenile Fiction,http://books.google.com/books/content?id=DAAAA...


In [4]:
# combine title, authors, & publisher columns into one

df['all'] = df['title'] + df['authors'] + df['publisher'] + df['categories']

In [5]:
# set vectorizer as TFIDvectorizer from sklearn
# this will convert our collection of title/author/publisher documents into a matrix of features using the TF-IDF formula
# set to analyze words 

vectorizer = TfidfVectorizer(analyzer='word')

In [6]:
# fits & transforms using the column of title/author/publisher and the TFIDvectorizer
# returns a matrix of the documents & TF-IDF calculations

tfidf_all_content = vectorizer.fit_transform(df['all'])

In [7]:
tfidf_all_content

<5290x18488 sparse matrix of type '<class 'numpy.float64'>'
	with 52031 stored elements in Compressed Sparse Row format>

In [8]:
# comping cosine similarity matrix using linear_kernal of sklearn

cosine_similarity_all_content = linear_kernel(tfidf_all_content, tfidf_all_content)

In [9]:
# create new dataframe with reset index

books = df.reset_index(drop=True)

In [10]:
# create a series of the indexes

indices = pd.Series(books['title'].index)

In [14]:
input_title = "War and Peace"

input_array = books[books['title'] == input_title].index.values

input_index = input_array[0]

input_index

120

In [34]:
# Function to get the most similar books
def recommend(index, method):
    id = indices[index]
    similarity_scores = list(enumerate(method[id]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:11]
    
    #Get the books index
    books_index = [i[0] for i in similarity_scores]
    
    titles = books['title'].iloc[books_index]
    authors = books['authors'].iloc[books_index]
    
    #a_zip = zip(titles, authors)
    #data = list(a_zip)
    
    data = pd.DataFrame(list(zip(titles, authors)), 
               columns =['Title', 'Authors'])
  
    return data
    
    #Return the top 10 most similar books
    #return books['title'].iloc[books_index]

In [36]:
# pass the book index & the cosine similiarities

recommended_list = recommend(input_index, cosine_similarity_all_content)
recommended_list

Unnamed: 0,Title,Authors
0,Collected Shorter Fiction: Volume I,Leo Tolstoy/Aylmer Maude/Nigel J. Cooper
1,War and Peace and War: The Rise and Fall of Em...,Peter Turchin
2,Tolstoy: Anna Karenina,Anthony Thorlby
3,The Gardens of Emily Dickinson,Judith Farr/Louise Carter
4,The Last Wife of Henry VIII,Carolly Erickson
5,Paris Spleen,Charles Baudelaire/Louise Varèse
6,Sexus (The Rosy Crucifixion #1),Henry Miller
7,The Forever War (The Forever War #1),Joe Haldeman
8,Pride and Prejudice,Jane Austen
9,When I Feel Angry,Nancy Cote/Cornelia Maude Spelman


In [17]:
# print the title of the book passed to the recommender

print(books['title'].iloc[input_index])

War and Peace
