In [6]:
# imports
import os

import numpy as np
import tensorflow_hub as hub
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from numpy.linalg import norm

stop_words = set(stopwords.words('english'))
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")



In [17]:
#smoke test: load the first NPZ file and print the array stored under "arr_0"
# NOTE(review): allow_pickle = True lets np.load unpickle arbitrary objects - only use it on trusted files
loaded_array1 = np.load(
    '../../data/NPZ/titleAndEncodedSummariesStrings1.npz',
    allow_pickle = True,
)
print(loaded_array1["arr_0"])


In [1]:
#helper functions
def preProcessString(string: str) -> str:
    """Normalise one piece of text: drop stopwords, lower-case it, then stem every token.

    Args:
        string: the raw text to normalise.

    Returns:
        The surviving tokens, lower-cased and Porter-stemmed, joined by single spaces.
    """
    # remove stopwords (the membership test uses the lower-cased token, so
    # capitalised stopwords such as "The" are dropped as well)
    keptTokens = [w for w in word_tokenize(string) if w.lower() not in stop_words]

    # all lower case
    loweredText = " ".join(keptTokens).lower()

    # spell check: skipped since it takes too long
    # spell = Speller(lang='en')
    # string = (" ".join([spell(w) for w in word_tokens]))

    # stemming (the text is re-tokenised here, exactly as before; the extra
    # tokenisation whose result was never used has been removed)
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in word_tokenize(loweredText))

def preProcessListOfStrings(strings : list):
    """Apply preProcessString to every entry of `strings`; returns a new list in the same order."""
    processedStrings = []
    for rawString in strings:
        processedStrings.append(preProcessString(rawString))
    return processedStrings



In [10]:

#for all summaries in each file, store preprocessed and vectorized summaries in new numpy files

def saveVectorizedSummaries(numFiles: int = 13,
                            inputDir: str = "../../data/NPZ/",
                            outputDir: str = "data"):
    """Save all book titles, isbns and vectorized summaries to numpy files.

    Reads `titleAndEncodedSummariesStrings{i}.npz` for i = 1..numFiles from
    `inputDir`, preprocesses and embeds column 2 of each file's "arr_0" array,
    and writes three files into `outputDir`:
    allBookTitles.npy (column 0), allBookIsbns.npy (column 1) and
    allVectorizedBookSummaries.npy (one embedding row per book, same order).

    Args:
        numFiles: how many numbered input files to read (default: the 13 files).
        inputDir: directory containing the input .npz files.
        outputDir: directory the three .npy files are written to.
    """
    embeddingDim = 512  # width of the saved matrix when there is nothing to embed

    allBookTitles = []
    allBookIsbns = []
    # one embedding matrix per input file; joined once after the loop instead of
    # re-copying the growing result on every iteration
    vectorChunks = []

    for i in range(1, numFiles + 1):
        filename = f"titleAndEncodedSummariesStrings{i}.npz"
        # NOTE(review): allow_pickle = True unpickles arbitrary objects - only load trusted files
        loaded_arrays = np.load(os.path.join(inputDir, filename), allow_pickle = True)["arr_0"]

        # add book title + isbn so the lists stay flattened
        allBookTitles.extend(loaded_arrays[:, 0])
        allBookIsbns.extend(loaded_arrays[:, 1])

        preprocessedSummaries = preProcessListOfStrings(loaded_arrays[:, 2])
        vectorChunks.append(np.asarray(embed(preprocessedSummaries)))

    if vectorChunks:
        # stored as float64, the dtype the previous zero-row concatenation produced,
        # so the saved file keeps its format
        allVectorizedSummaries = np.concatenate(vectorChunks, axis = 0).astype(np.float64)
    else:
        allVectorizedSummaries = np.zeros((0, embeddingDim))

    np.save(os.path.join(outputDir, "allBookTitles.npy"), allBookTitles)
    np.save(os.path.join(outputDir, "allBookIsbns.npy"), allBookIsbns)
    np.save(os.path.join(outputDir, "allVectorizedBookSummaries.npy"), allVectorizedSummaries)

# saveVectorizedSummaries()

In [51]:
#tried saving the summaries as .npy instead of .npz
#NOTE:
#-> saving as .npz takes the same amount of disk space as saving as .npy


In [8]:
#compare similarities between all books
def compareSimilarities(dataDir: str = "data"):
    """Compute the cosine similarity between all books and store it as a matrix.

    Loads `allVectorizedBookSummaries.npy` from `dataDir` (one summary vector
    per row) and writes `similarities.npy` to the same directory. Entry [i, j]
    is the cosine similarity between rows i and j. Book titles are not looked
    at here; they can be inferred from the row order.

    Args:
        dataDir: directory holding the input file and receiving the output file.
    """
    allSummaries = np.load(os.path.join(dataDir, "allVectorizedBookSummaries.npy"))

    # cosine similarity needs the L2 norm of each row; norm() without `axis`
    # would return a single Frobenius norm for the whole matrix
    rowNorms = norm(allSummaries, axis = 1, keepdims = True)
    # an all-zero vector has no direction: leave it at zero (similarity 0) instead of dividing by 0
    rowNorms[rowNorms == 0] = 1.0
    unitSummaries = allSummaries / rowNorms

    similarities = np.dot(unitSummaries, unitSummaries.T)
    np.save(os.path.join(dataDir, "similarities.npy"), similarities)

# NOTE(review): this call is live, so running the cell recomputes and overwrites the saved similarity matrix.
# With the 27514 summaries reported further down that is a 27514 x 27514 matrix - expect it to be slow and large.
compareSimilarities()

In [9]:
#sanity check: titles, summary vectors and isbns should all have the same number of rows
# similarities = np.load("data/similarities.npy")
titles, summaries, isbns = (
    np.load(savedPath)
    for savedPath in (
        "data/allBookTitles.npy",
        "data/allVectorizedBookSummaries.npy",
        "data/allBookIsbns.npy",
    )
)
for savedArray in (titles, summaries, isbns):
    print(savedArray.shape)

(27514,)
(27514, 512)
(27514,)


In [11]:
#make a function to input an isbn, which then checks which book it is, and returns the top n most similar books
def _topNSimilarIdentifiers(identifiers, identifier, n: int, dataDir: str):
    """Shared lookup: rank every book against the one named `identifier` and return the best n."""
    matches = np.flatnonzero(identifiers == identifier)
    if matches.size == 0:
        # same exception type the bare [0][0] lookup raised, now with a message
        raise IndexError(f"no book found for {identifier!r}")
    bookIndex = matches[0]

    if n <= 0:
        return []

    # memory-map the matrix so only this book's row is read, not the whole file
    similarities = np.load(os.path.join(dataDir, "similarities.npy"), mmap_mode = "r")
    bookSimilarities = np.asarray(similarities[bookIndex]) #all similarities for this book

    # argsort, reverse order so most similar elements come first
    rankedIndices = np.argsort(bookSimilarities)[::-1]

    recommended = []
    for idx in rankedIndices:
        # skip the book itself (and any duplicate rows carrying the same identifier)
        if identifiers[idx] == identifier:
            continue
        recommended.append(identifiers[idx])
        if len(recommended) == n:
            break
    return recommended


def returnTopNSimilarBooksWithIsbn(isbn: str, n: int, dataDir: str = "data"):
    """Input an isbn, check which book it is, and return the top n most similar books (their isbns).

    Args:
        isbn: isbn of the book to find recommendations for.
        n: maximum number of recommendations; n <= 0 gives an empty list.
        dataDir: directory holding allBookIsbns.npy and similarities.npy.

    Returns:
        Up to n isbns, most similar first, never including `isbn` itself.

    Raises:
        IndexError: if `isbn` is not present in allBookIsbns.npy.
    """
    isbns = np.load(os.path.join(dataDir, "allBookIsbns.npy"))
    return _topNSimilarIdentifiers(isbns, isbn, n, dataDir)


def returnTopNSimilarBooksWithTitle(title: str, n: int, dataDir: str = "data"):
    """Input a title, check which book it is, and return the top n most similar books (their titles).

    Args:
        title: title of the book to find recommendations for.
        n: maximum number of recommendations; n <= 0 gives an empty list.
        dataDir: directory holding allBookTitles.npy and similarities.npy.

    Returns:
        Up to n titles, most similar first, never including `title` itself.

    Raises:
        IndexError: if `title` is not present in allBookTitles.npy.
    """
    titles = np.load(os.path.join(dataDir, "allBookTitles.npy"))
    return _topNSimilarIdentifiers(titles, title, n, dataDir)


# returnTopNSimilarBooksWithIsbn("9780820312132", 10) #num 1 (first)
# returnTopNSimilarBooksWithIsbn("9781504318402", 10) #num 5
returnTopNSimilarBooksWithTitle("Dune (novel)", 10) 

['Dune: House Harkonnen',
 'Hunters of Dune',
 'Kings of the Wyld',
 'Yavana Rani',
 'Marvel 1602',
 'Line of Delirium',
 'Hidden Warrior',
 'Cymbeline',
 'New Worlds (comics)',
 'Darkhouse']