In [28]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 100)

In [29]:
books = pd.read_csv('data/books_data.csv')

books['id'] = books.index
# books = books[['id', 'Title', 'description', 'authors', 'image', 'publisher', 'categories', 'ratingsCount']]
# books.columns = ['id', 'title', 'description', 'author', 'image', 'publisher', 'genre', 'ratings_count']

display(books.head())
display(books.shape)

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,id
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],,0
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],,1
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],,2
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],,3
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,,4


(212404, 11)

In [30]:
books.isnull().sum()

Title                 1
description       68442
authors           31413
image             52075
previewLink       23836
publisher         75886
publishedDate     25305
infoLink          23836
categories        41199
ratingsCount     162652
id                    0
dtype: int64

In [31]:
books.dropna(inplace=True)
books.reset_index()
books.shape

(40635, 11)

In [32]:
def clean_authors(authors):
    authors = re.sub(r'\[|\]', '', authors)
    authors = authors.split(',')
    authors = [re.sub(r'\'|\"', '', author) for author in authors]

    return authors[0]

def clean_genre(genre):
    genre = re.sub(r'\[|\]', '', genre)
    genre = genre.split(',')
    genre = [re.sub(r'\'|\"', '', g) for g in genre]
    return genre[0]

In [33]:
books['author'] = books['author'].apply(clean_authors)
books['genre'] = books['genre'].apply(clean_genre)
books.head()

KeyError: 'author'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [None]:
def clean_title(title):
    title = re.sub('[^A-Za-z0-9]+' , ' ', title)
    title = title.lower()
    return title

In [None]:
# clean the title by lowercasing and removing characters that are not letters or numbers
books['clean_title'] = books['title'].apply(clean_title)
books['clean_description'] = books['description'].apply(clean_title)
books.head()

Unnamed: 0,id,title,description,author,image,publisher,genre,ratings_count,clean_title,clean_description
0,0,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,Everett Ferguson,http://books.google.com/books/content?id=kVqRa...,Wm. B. Eerdmans Publishing,Religion,5.0,the church of christ a biblical ecclesiology f...,in the church of christ a biblical ecclesiolog...
1,1,Voices from the Farm: Adventures in Community ...,"Twenty-five years ago, at the height of the co...",Rupert Fike,http://books.google.com/books/content?id=IjTAB...,Book Publishing Company,Biography & Autobiography,1.0,voices from the farm adventures in community l...,twenty five years ago at the height of the cou...
2,2,The Battleship Bismarck,The Bismarck is perhaps the most famous – and ...,Stefan Draminski,http://books.google.com/books/content?id=nxttD...,Bloomsbury Publishing,History,1.0,the battleship bismarck,the bismarck is perhaps the most famous and no...
3,3,Tess and the Highlander,"In 1543, on a windswept isle off of Scotland, ...",May Mcgoldrick,http://books.google.com/books/content?id=VmCRS...,Harper Collins,Juvenile Fiction,2.0,tess and the highlander,in 1543 on a windswept isle off of scotland se...
4,4,Beginner's Yoruba (Hippocrene Beginner's Series),"""Beginner's Yoruba"" is now available with two ...",Kayode J. Fakinlede,http://books.google.com/books/content?id=xLe4n...,Hippocrene Books,Foreign Language Study,1.0,beginner s yoruba hippocrene beginner s series,beginner s yoruba is now available with two a...


In [None]:
books.to_csv('data/books_clean.csv', index=False)

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2)
tfidf_matrix = vectorizer.fit_transform(books['clean_title'])

vectorizer_desc = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2)
desc_tfidf_matrix = vectorizer_desc.fit_transform(books['clean_description'])

In [None]:
display(pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()).head())

MemoryError: Unable to allocate 7.76 GiB for an array with shape (40681, 25614) and data type float64

In [None]:
pickle.dump(tfidf_matrix, open('models/tfidf_matrix.pickle', 'wb'))
pickle.dump(desc_tfidf_matrix, open('models/desc_tfidf_matrix.pickle', 'wb'))

In [None]:
title = 'The Alchemist'

# get recommendations based on title and rating
def get_recommendations(title, tfidf_matrix, books):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
    indices = similarity.argsort()[::-1]
    books = books.iloc[indices]
    return books

get_recommendations(title, tfidf_matrix, books).head(10)

Unnamed: 0,id,title,description,author,image,publisher,genre,ratings_count,clean_title,clean_description
26834,26834,The Alchemist,"""My heart is afraid that it will have to suffe...",Paulo Coelho,http://books.google.com/books/content?id=pTr44...,Harper Collins,Fiction,4895.0,the alchemist,my heart is afraid that it will have to suffe...
108225,108225,The Last Alchemist,"Spinifex, the nineteenth alchemist, must disco...",Colin Thompson,http://books.google.com/books/content?id=PhnOf...,Random House Australia,Alchemists,4.0,the last alchemist,spinifex the nineteenth alchemist must discove...
21523,21523,The Alchemist's Door,"Fleeing to Prague to escape a demon attack, si...",Lisa Goldstein,http://books.google.com/books/content?id=WrS1F...,Macmillan,Fiction,5.0,the alchemist s door,fleeing to prague to escape a demon attack six...
169400,169400,The Alchemist's Daughter: A Novel,Based on some of literature’s horror and scien...,Theodora Goss,http://books.google.com/books/content?id=LicvD...,Simon and Schuster,Fiction,37.0,the alchemist s daughter a novel,based on some of literature s horror and scien...
152507,152507,The Alchemist's Handbook: Manual for Practical...,"Formerly handed down under oath of secrecy, th...",Frater Albertus,http://books.google.com/books/content?id=-EGcb...,Weiser Books,Religion,1.0,the alchemist s handbook manual for practical ...,formerly handed down under oath of secrecy thi...
79469,79469,The Alchemist to Mercury: An Alternate Opus: U...,"Since 1970, many readers have waited for anoth...",Douglas Messerli,http://books.google.com/books/content?id=drBZA...,Los Angeles : Sun & Moon Press,Poetry,1.0,the alchemist to mercury an alternate opus unc...,since 1970 many readers have waited for anothe...
65711,65711,Shipwrecks,"Living in a remote, desperately poor fishing v...",Akira Yoshimura,http://books.google.com/books/content?id=k0fQb...,Houghton Mifflin Harcourt,Fiction,17.0,shipwrecks,living in a remote desperately poor fishing vi...
65742,65742,".Hack: //Legend of the Twilight, Vol. 1","The CC Corporation thinks that Shugo, Rena and...",Rei Idumi,http://books.google.com/books/content?id=tFAUR...,TokyoPop,Comics & Graphic Novels,5.0,hack legend of the twilight vol 1,the cc corporation thinks that shugo rena and ...
65732,65732,Charcuterie and French Pork Cookery,This timeless classic of French cuisine brings...,Jane Grigson,http://books.google.com/books/content?id=HC0Sv...,Grub Street Cookery,Cooking,1.0,charcuterie and french pork cookery,this timeless classic of french cuisine brings...
65728,65728,"Narrative of the Life of Frederick Douglass, A...",Narrative of the Life of Frederick Douglass Fi...,FREDERICK DOUGLASS,http://books.google.com/books/content?id=UP4yE...,BEYOND BOOKS HUB,Biography & Autobiography,51.0,narrative of the life of frederick douglass an...,narrative of the life of frederick douglass fi...


In [None]:
the_alchemist = books[books['title'] == 'The Alchemist']

def get_recommendations_by_description(description, desc_tfidf_matrix, books):
    description = clean_title(description)
    query_vec = vectorizer_desc.transform([description])
    similarity = cosine_similarity(query_vec, desc_tfidf_matrix).flatten()
    indices = similarity.argsort()[::-1]
    books = books.iloc[indices]
    return books

get_recommendations_by_description(the_alchemist['description'].values[0], desc_tfidf_matrix, books).head(10)

Unnamed: 0,id,title,description,author,image,publisher,genre,ratings_count,clean_title,clean_description
26834,26834,The Alchemist,"""My heart is afraid that it will have to suffe...",Paulo Coelho,http://books.google.com/books/content?id=pTr44...,Harper Collins,Fiction,4895.0,the alchemist,my heart is afraid that it will have to suffe...
68200,68200,The Valkyries,A classic masterwork of spiritual tension and ...,Paulo Coelho,http://books.google.com/books/content?id=FdhkG...,Harper Collins,Fiction,16.0,the valkyries,a classic masterwork of spiritual tension and ...
44478,44478,The Sorceress & the Savage,While armies of the Shadowrealms gather and Ma...,Michael Scott,http://books.google.com/books/content?id=DnsNa...,Ember,Juvenile Fiction,47.0,the sorceress the savage,while armies of the shadowrealms gather and ma...
190773,190773,An Enemy at Green Knowe,The inhabitants of Green Knowe become involved...,Lucy Maria Boston,http://books.google.com/books/content?id=JiV8C...,Houghton Mifflin Harcourt,Juvenile Fiction,2.0,an enemy at green knowe,the inhabitants of green knowe become involved...
169689,169689,The Red Lion & The Elixir of Eternal Life,The harrowing adventures of a 16th century alc...,Maria Szepes,http://books.google.com/books/content?id=fvKKG...,Horus Pub Incorporated,Fiction,1.0,the red lion the elixir of eternal life,the harrowing adventures of a 16th century alc...
148791,148791,Adultery,I want to change. I need to change. I'm gradua...,Paulo Coelho,http://books.google.com/books/content?id=cxUcA...,Vintage,Fiction,2987.0,adultery,i want to change i need to change i m graduall...
161204,161204,Adultery & Other Choices,I want to change. I need to change. I'm gradua...,Paulo Coelho,http://books.google.com/books/content?id=cxUcA...,Vintage,Fiction,2987.0,adultery other choices,i want to change i need to change i m graduall...
147946,147946,Lexicon of Alchemy,Martinus Rulandus was a German physician and a...,Martin Rulandus,http://books.google.com/books/content?id=vWmjA...,Jazzybee Verlag,Body,1.0,lexicon of alchemy,martinus rulandus was a german physician and a...
3700,3700,The Abyss,The story of the fate of two cousins in sixtee...,Marguerite Yourcenar,http://books.google.com/books/content?id=z248v...,Macmillan,Fiction,1.0,the abyss,the story of the fate of two cousins in sixtee...
65266,65266,The Pilgrimage: A Contemporary Quest for Ancie...,"Previously published as ""The Diary of a Magus""...",Paulo Coelho,http://books.google.com/books/content?id=0NtNz...,HarperCollins UK,Christian pilgrims and pilgrimages,1.0,the pilgrimage a contemporary quest for ancien...,previously published as the diary of a magus t...
