In [1]:
import numpy as np
import pandas as pd

# To process embeddings
import tensorflow as tf
import tensorflow_hub as hub

# To silence warnings from TensorFlow
import os
import logging
import warnings;
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

# To load saved embeddings
import joblib

# To create webapp
import psutil


#######################################################################################
                            # Load functions
#######################################################################################

def load_embeddings():
    # Path to USE
    embed = hub.load('/media/einhard/Seagate Expansion Drive/3380_data/data/tensorflow_hub/universal-sentence-encoder_4')

    # Load pre-trained sentence arrays
    sentence_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/reviewEmbeddings.pkl')
    descriptions_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionEmbeddings.pkl')
    return embed, sentence_array, descriptions_array

def data_loader(datapath, books_file, reviews_file):
    books = pd.read_csv(datapath + books_file).drop('Unnamed: 0', axis=1)
    reviews = pd.read_csv(datapath + reviews_file).drop('Unnamed: 0', axis=1)
    return books, reviews


                    # Return recommendations based on reviews
def find_reviews(query,reviews, n_results=5):
    # Create vector from query and compare with global embedding
    sentence = [query]
    sentence_vector = np.array(embed(sentence))
    inner_product = np.inner(sentence_vector, sentence_array)[0]

    # Find sentences with highest inner products
    top_n_sentences = pd.Series(inner_product).nlargest(n_results+1)
    top_n_indices = top_n_sentences.index.tolist()
    top_n_list = list(reviews.review_text.iloc[top_n_indices][1:])

    #print(f'Input sentence: "{query}"\n')
    #print(f'{n_results} most semantically similar reviews: \n\n')
    #print(*top_n_list, sep='\n\n')
    #print(top_n_indices)
    return top_n_indices

def find_books(query, reviews, books, n_results=5):
    top_n_indices = find_reviews(query, reviews, n_results)
    return books[books.book_id.isin(reviews.iloc[top_n_indices].book_id.tolist())][['title', 'name','description', 'weighted_score', 'book_id']].fillna('').reset_index().drop('index', axis=1)


                    # Return recommendations based on descriptions

def find_description(query, books, n_results=10):
    # Create vector from query and compare with global embedding
    sentence = [query]
    sentence_vector = np.array(embed(sentence))
    inner_product = np.inner(sentence_vector, descriptions_array)[0]

    # Find sentences with highest inner products
    top_n_sentences = pd.Series(inner_product).nlargest(n_results)
    top_n_indices = top_n_sentences.index.tolist()
    top_n_list = list(books.description.iloc[top_n_indices][1:])

    #print(f'Input sentence: "{query}"\n')
    #print(f'{n_results} most semantically similar book descriptions: \n\n')
    #print(*top_n_list, sep='\n\n')
    return top_n_indices

def find_books_description(query, reviews, books):
    top_n_indices = find_description(query)
    return books[books.book_id.isin(books.iloc[top_n_indices].book_id.tolist())][['title', 'name','description', 'weighted_score', 'book_id']].fillna('')

def show_recommendations(query, reviews, books, n_results=5):
    top_n_indices = find_reviews(query, reviews, n_results)
    book_recommends = find_books(query, reviews, books, n_results)
    book_recommends['for_url'] = book_recommends['book_id'].astype(str) + '.' + book_recommends['title'].replace(r'\(.*$', '', regex = True)

    for idx, i in enumerate(reviews.iloc[top_n_indices].index):
        print(idx)
        print(i)
        print('Book title:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
        print('Author:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].name.tolist()[0])
        print('Weighted Score:', books[books.book_id.isin(reviews[reviews.index == i].book_id.tolist())].weighted_score.tolist()[0])
        print('Similar review:', reviews[reviews.index == i].review_text.tolist()[0])
        print('Goodreads Link:', 'https://www.goodreads.com/book/show/' + book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].for_url.tolist()[0])
        print('\n\n')



#######################################################################################
                            # Load variables and data
#######################################################################################

# Path to books and reviews DataFrames
datapath = '/media/einhard/Seagate Expansion Drive/3380_data/data/Filtered books/'

# Books and reviews file names and loading
books_file = 'filtered_books.csv'
reviews_file = 'filtered_reviews.csv'

books, reviews = data_loader(datapath, books_file, reviews_file)
embed, sentence_array, descriptions_array = load_embeddings()


#######################################################################################
                                # Web App
#######################################################################################

In [2]:
show_recommendations('I love hard scifi', reviews, books, 15)

0
327691
Book title: Ancillary Justice (Imperial Radch, #1)
Author: Ann Leckie
Weighted Score: 3.9699322483318085
Similar review: #wickedawesome I love me some epic sci-fi
Goodreads Link: https://www.goodreads.com/book/show/17333324.Ancillary Justice 



1
210875
Book title: Ready Player One
Author: Ernest Cline
Weighted Score: 4.3094569028261365
Similar review: I love SciFi but could not get into this
Goodreads Link: https://www.goodreads.com/book/show/20603758.Ready Player One



2
248678
Book title: The Iron King (The Iron Fey, #1)
Author: Julie Kagawa
Weighted Score: 3.9199984214184576
Similar review: i like a little sci fi with my fantasy
Goodreads Link: https://www.goodreads.com/book/show/6644117.The Iron King 



3
107553
Book title: The Door Into Summer
Author: Robert A. Heinlein
Weighted Score: 3.99972628465949
Similar review: One of my all time favorite sci-fi books and I'm not a sci-fi fan.
Goodreads Link: https://www.goodreads.com/book/show/348.The Door Into Summer



4
164

In [93]:
res = find_books('I love hard scifi', reviews, books, 15)

In [103]:
top_n_indices = find_reviews('I love hard scifi', reviews, 15)
book_recommends = find_books('I love hard scifi', reviews, books, 15)
book_recommends['for_url'] = book_recommends['book_id'].astype(str) + '.' + book_recommends['title'].replace(r'\(.*$', '', regex = True)

for idx, i in enumerate(reviews.iloc[top_n_indices].index):
    print(idx)
    print(i)
    print('Book title:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
    print('Author:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].name.tolist()[0])
    print('Weighted Score:', books[books.book_id.isin(reviews[reviews.index == i].book_id.tolist())].weighted_score.tolist()[0])
    print('Similar review:', reviews[reviews.index == i].review_text.tolist()[0])
    print('Goodreads Link:', 'https://www.goodreads.com/book/show/' + book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].for_url.tolist()[0])
    print('\n\n')

0
327691
Book title: Ancillary Justice (Imperial Radch, #1)
Author: Ann Leckie
Weighted Score: 3.9699322483318085
Similar review: #wickedawesome I love me some epic sci-fi
Goodreads Link: https://www.goodreads.com/book/show/17333324.Ancillary Justice 



1
210875
Book title: Ready Player One
Author: Ernest Cline
Weighted Score: 4.3094569028261365
Similar review: I love SciFi but could not get into this
Goodreads Link: https://www.goodreads.com/book/show/20603758.Ready Player One



2
248678
Book title: The Iron King (The Iron Fey, #1)
Author: Julie Kagawa
Weighted Score: 3.9199984214184576
Similar review: i like a little sci fi with my fantasy
Goodreads Link: https://www.goodreads.com/book/show/6644117.The Iron King 



3
107553
Book title: The Door Into Summer
Author: Robert A. Heinlein
Weighted Score: 3.99972628465949
Similar review: One of my all time favorite sci-fi books and I'm not a sci-fi fan.
Goodreads Link: https://www.goodreads.com/book/show/348.The Door Into Summer



4
164

In [61]:
def find_url(query, revires, books, n_results=5):
    book_data = find_books(query, reviews, books, n_results)[['title', 'book_id']]
    book_data['for_url'] = book_data['book_id'].astype(str) + '.' + book_data['title'].replace(r'\(.*$', '', regex = True)
    return book_data
url =  find_url('I love hard scifi', reviews, books, 10)

In [108]:
def show_recommendations(query, reviews, books, n_results=5):
    top_n_indices = find_reviews(query, reviews, 15)
    book_recommends = find_books(query, reviews, books, 15)
    book_recommends['for_url'] = book_recommends['book_id'].astype(str) + '.' + book_recommends['title'].replace(r'\(.*$', '', regex = True)

    for idx, i in enumerate(reviews.iloc[top_n_indices].index):
        print(idx)
        print(i)
        print('Book title:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
        print('Author:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].name.tolist()[0])
        print('Weighted Score:', books[books.book_id.isin(reviews[reviews.index == i].book_id.tolist())].weighted_score.tolist()[0])
        print('Similar review:', reviews[reviews.index == i].review_text.tolist()[0])
        print('Goodreads Link:', 'https://www.goodreads.com/book/show/' + book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].for_url.tolist()[0])
        print('\n\n')

In [110]:
show_recommendations('I loved the politics and violence', reviews, books, 15)

0
137783
Book title: Scent of Magic (Healer, #2)
Author: Maria V. Snyder
Weighted Score: 4.12936146177244
Similar review: I like more of the politics and less of the actual war.
Goodreads Link: https://www.goodreads.com/book/show/12027429.Scent of Magic 



1
147476
Book title: Cinder (The Lunar Chronicles, #1)
Author: Marissa Meyer
Weighted Score: 4.149973453558986
Similar review: loved the politics and ethical issues
Goodreads Link: https://www.goodreads.com/book/show/11235712.Cinder 



2
278930
Book title: The Blade Itself (The First Law, #1)
Author: Joe Abercrombie
Weighted Score: 4.139888082439458
Similar review: I really loved it, the characters are so dark, violent.
Goodreads Link: https://www.goodreads.com/book/show/944073.The Blade Itself 



3
281692
Book title: The Story of the Lost Child (The Neapolitan Novels, #4)
Author: Elena Ferrante
Weighted Score: 4.388992427495838
Similar review: I enjoyed the series but the violence can be disturbing.
Goodreads Link: https://www.go

In [64]:
top_n_indices = find_reviews('I love hard scifi', reviews, 10)
url = find_url('I love hard scifi', reviews, books, 10)

for i in reviews.iloc[top_n_indices].index:
    print('Book title:', res[res.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
    print('Author:', res[res.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].name.tolist()[0])
    print('Weighted Score:', books[books.book_id.isin(reviews[reviews.index == i].book_id.tolist())].weighted_score.tolist()[0])
    print('Similar review:', reviews[reviews.index == i].review_text.tolist()[0])
    print('Goodreads Link:', 'https://www.goodreads.com/book/show/' + url[url.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].for_url.tolist()[0])
    print('\n\n')

Book title: Ancillary Justice (Imperial Radch, #1)
Author: Ann Leckie
Weighted Score: 3.9699322483318085
Similar review: #wickedawesome I love me some epic sci-fi
Goodreads Link: https://www.goodreads.com/book/show/17333324.Ancillary Justice 



Book title: Ready Player One
Author: Ernest Cline
Weighted Score: 4.3094569028261365
Similar review: I love SciFi but could not get into this
Goodreads Link: https://www.goodreads.com/book/show/20603758.Ready Player One



Book title: The Iron King (The Iron Fey, #1)
Author: Julie Kagawa
Weighted Score: 3.9199984214184576
Similar review: i like a little sci fi with my fantasy
Goodreads Link: https://www.goodreads.com/book/show/6644117.The Iron King 



Book title: The Door Into Summer
Author: Robert A. Heinlein
Weighted Score: 3.99972628465949
Similar review: One of my all time favorite sci-fi books and I'm not a sci-fi fan.
Goodreads Link: https://www.goodreads.com/book/show/348.The Door Into Summer



Book title: Ender's Game (Ender's Saga, #

In [40]:
res['for_url'] = res['book_id'].astype(str) + '.' + res['title'].replace(r'\(.*$', '', regex = True)

In [41]:
res

Unnamed: 0,title,name,description,weighted_score,book_id,for_url
0,Ready Player One,Ernest Cline,A world at stake.\nA quest for the ultimate pr...,4.309457,20603758,20603758.Ready Player One
1,"Ender's Game (Ender's Saga, #1)",Orson Scott Card,"Andrew ""Ender"" Wiggin thinks he is playing com...",4.299977,375802,375802.Ender's Game
2,"Wool Omnibus (Silo, #1)",Hugh Howey,This Omnibus Edition collects the five Wool bo...,4.239805,13453029,13453029.Wool Omnibus
3,"Obsidian (Lux, #1)",Jennifer L. Armentrout,Starting over sucks.\nWhen we moved to West Vi...,4.229906,12578077,12578077.Obsidian
4,"Hyperion (Hyperion Cantos, #1)",Dan Simmons,"On the world called Hyperion, beyond the law o...",4.209882,77566,77566.Hyperion
5,The Dispossessed,Ursula K. Le Guin,"Shevek, a brilliant physicist, decides to take...",4.179725,13651,13651.The Dispossessed
6,"Spin (Spin, #1)",Robert Charles Wilson,One night in October when he was ten years old...,4.009821,910863,910863.Spin
7,The Door Into Summer,Robert A. Heinlein,"It is 1970, and electronics engineer Dan Davis...",3.999726,348,348.The Door Into Summer
8,"Ancillary Justice (Imperial Radch, #1)",Ann Leckie,"On a remote, icy planet, the soldier known as ...",3.969932,17333324,17333324.Ancillary Justice
9,"The Iron King (The Iron Fey, #1)",Julie Kagawa,Meghan Chase has a secret destiny; one she cou...,3.919998,6644117,6644117.The Iron King


In [9]:
for idx, i in enumerate(res.book_id):
    print(res[res.book_id==i][['title','name']])

              title          name
0  Ready Player One  Ernest Cline
                             title              name
1  Ender's Game (Ender's Saga, #1)  Orson Scott Card
                     title        name
2  Wool Omnibus (Silo, #1)  Hugh Howey
                title                    name
3  Obsidian (Lux, #1)  Jennifer L. Armentrout
                            title         name
4  Hyperion (Hyperion Cantos, #1)  Dan Simmons
              title               name
5  The Dispossessed  Ursula K. Le Guin
             title                   name
6  Spin (Spin, #1)  Robert Charles Wilson
                  title                name
7  The Door Into Summer  Robert A. Heinlein
                                    title        name
8  Ancillary Justice (Imperial Radch, #1)  Ann Leckie
                              title          name
9  The Iron King (The Iron Fey, #1)  Julie Kagawa


import pandas as pd
test = pd.read_json('/media/einhard/Seagate Expansion Drive/3380_data/data/Filtered books/20603758.Ready Player One.json')

# Creating cluster function

In [1]:
books[books.title.isin(['Children of Time'])].book_id.tolist()

NameError: name 'books' is not defined

In [9]:
from sklearn.cluster import KMeans

In [3]:
titles_list = ['Children of Time']
input_books = reviews[reviews.book_id.isin(books[books.title.isin(titles_list)].book_id.tolist())]

NameError: name 'reviews' is not defined

In [2]:
input_books

NameError: name 'input_books' is not defined

In [29]:
def load_sentences(review_texts):
    '''
    Function to load and embed a book's sentences
    '''
    # Copy sentence column to new variable
    sentences = input_books['review_text']

    # Vectorize sentences
    sentence_vectors = embed(sentences)
    
    return sentences, sentence_vectors

def get_clusters(sentences, sentence_vectors, k, n):
    '''
    Function to extract the n most representative sentences from k clusters
    '''

    # Instantiate the model
    kmeans_model = KMeans(n_clusters=k)

    # Fit the model
    kmeans_model.fit(sentence_array)
    
    # Loop through number of clusters
    for i in range(k):

        # Define cluster centre
        centre = kmeans_model.cluster_centers_[i]

        # Calculate inner product of cluster centre and sentence vectors
        ips = np.inner(centre, sentence_vectors)

        # Find the sentences with the highest inner products
        top_indices = pd.Series(ips).nlargest(n).index
        top_sentences = list(sentences.iloc[top_indices])

        print(f'Cluster #{i+1} sentences:\n')
        print(*top_sentences, sep='\n\n')
        print('\n')

In [28]:
book_sentences, book_review_vectors = load_sentences(input_books)

In [31]:
get_clusters(book_sentences, book_review_vectors, k=5, n=8)

Cluster #1 sentences:

Really liked this one. Great world building

Very enjoyable book. Unexpected ending. Setting the stage for more.

Brilliant! I wish there were sequels!

Pretty much blew my mind and got me thinking about everything I thought I knew...

A very unique and well written story.

A great story, a very quick read

I was surprised at how good this was. 
 That is not something I experience very often.

Excellent! Can't wait to read more books by this author!!


Cluster #2 sentences:

Excellent! Can't wait to read more books by this author!!

Brilliant! I wish there were sequels!

Good read, reminded me of Vernor Vinge and his Zones of thought series.

Very enjoyable book. Unexpected ending. Setting the stage for more.

Really liked this one. Great world building

Excellent novel, interesting concepts that make you think.

A very unique and well written story.

A great story, a very quick read


Cluster #3 sentences:

Excellent! Can't wait to read more books by this author