In [1]:
import numpy as np
import pandas as pd

# To process embeddings
import tensorflow as tf
import tensorflow_hub as hub

# To silence warnings from TensorFlow
import os
import logging
import warnings;
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

# To load saved embeddings
import joblib

# To create webapp
import psutil


#######################################################################################
                            # Load functions
#######################################################################################

def load_embeddings():
    # Path to USE
    embed = hub.load('/media/einhard/Seagate Expansion Drive/3380_data/data/tensorflow_hub/universal-sentence-encoder_4')

    # Load pre-trained sentence arrays
    sentence_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/reviewEmbeddings.pkl')
    descriptions_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionEmbeddings.pkl')
    return embed, sentence_array, descriptions_array

def data_loader(datapath, books_file, reviews_file):
    books = pd.read_csv(datapath + books_file).drop('Unnamed: 0', axis=1)
    reviews = pd.read_csv(datapath + reviews_file).drop('Unnamed: 0', axis=1)
    return books, reviews


                    # Return recommendations based on reviews
def find_reviews(query,reviews, n_results=5):
    # Create vector from query and compare with global embedding
    sentence = [query]
    sentence_vector = np.array(embed(sentence))
    inner_product = np.inner(sentence_vector, sentence_array)[0]

    # Find sentences with highest inner products
    top_n_sentences = pd.Series(inner_product).nlargest(n_results+1)
    top_n_indices = top_n_sentences.index.tolist()
    top_n_list = list(reviews.review_text.iloc[top_n_indices][1:])

    #print(f'Input sentence: "{query}"\n')
    #print(f'{n_results} most semantically similar reviews: \n\n')
    #print(*top_n_list, sep='\n\n')
    #print(top_n_indices)
    return top_n_indices

def find_books(query, reviews, books, n_results=5):
    top_n_indices = find_reviews(query, reviews, n_results)
    return books[books.book_id.isin(reviews.iloc[top_n_indices].book_id.tolist())][['title', 'name','description', 'weighted_score', 'book_id']].fillna('').reset_index().drop('index', axis=1)


                    # Return recommendations based on descriptions

def find_description(query, books, n_results=10):
    # Create vector from query and compare with global embedding
    sentence = [query]
    sentence_vector = np.array(embed(sentence))
    inner_product = np.inner(sentence_vector, descriptions_array)[0]

    # Find sentences with highest inner products
    top_n_sentences = pd.Series(inner_product).nlargest(n_results)
    top_n_indices = top_n_sentences.index.tolist()
    top_n_list = list(books.description.iloc[top_n_indices][1:])

    #print(f'Input sentence: "{query}"\n')
    #print(f'{n_results} most semantically similar book descriptions: \n\n')
    #print(*top_n_list, sep='\n\n')
    return top_n_indices

def find_books_description(query, reviews, books):
    top_n_indices = find_description(query)
    return books[books.book_id.isin(books.iloc[top_n_indices].book_id.tolist())][['title', 'name','description', 'weighted_score', 'book_id']].fillna('')

def show_recommendations(query, reviews, books, n_results=5):
    top_n_indices = find_reviews(query, reviews, n_results)
    book_recommends = find_books(query, reviews, books, n_results)
    book_recommends['for_url'] = book_recommends['book_id'].astype(str) + '.' + book_recommends['title'].replace(r'\(.*$', '', regex = True)
    return top_n_indices, book_recommends



#######################################################################################
                            # Load variables and data
#######################################################################################

# Path to books and reviews DataFrames
datapath = '/media/einhard/Seagate Expansion Drive/3380_data/data/Filtered books/'

# Books and reviews file names and loading
books_file = 'filtered_books.csv'
reviews_file = 'filtered_reviews.csv'

books, reviews = data_loader(datapath, books_file, reviews_file)
embed, sentence_array, descriptions_array = load_embeddings()


#######################################################################################
                                # Web App
#######################################################################################

In [7]:
top_n_indices, book_recommends = show_recommendations('cat', reviews, books, 50)

In [8]:
book_recommends

Unnamed: 0,title,name,description,weighted_score,book_id,for_url
0,"Saga, Vol. 2 (Saga, #2)",Brian K. Vaughan,From award-winning writer Brian K. Vaughan (Pr...,4.559449,17131869,"17131869.Saga, Vol. 2"
1,"The Darkest Hour (Warriors, #6)",Erin Hunter,"ThunderClan's darkest hour is upon them, as Ti...",4.518915,49006,49006.The Darkest Hour
2,Pete the Cat: Rocking in My School Shoes,Eric Litwin,Time to head back to school with this bestsell...,4.317648,9466024,9466024.Pete the Cat: Rocking in My School Shoes
3,"Into the Wild (Warriors, #1)",Erin Hunter,"For generations, four Clans of wild cats have ...",4.289629,111332,111332.Into the Wild
4,"This Side of the Grave (Night Huntress, #5)",Jeaniene Frost,Danger waits on both sides of the grave.\nWith...,4.279609,6871617,6871617.This Side of the Grave
5,"Saga, Vol. 1 (Saga, #1)",Brian K. Vaughan,When two soldiers from opposite sides of a nev...,4.239886,15704307,"15704307.Saga, Vol. 1"
6,Calling on Dragons (Enchanted Forest Chronicle...,Patricia C. Wrede,Those wicked wizards are back--and they've bec...,4.189605,169879,169879.Calling on Dragons
7,Where is the Green Sheep?,Mem Fox,"Here is the blue sheep, and here is the red sh...",4.187512,378164,378164.Where is the Green Sheep?
8,"Halfway to the Grave (Night Huntress, #1)",Jeaniene Frost,Flirting with the Grave...Half-vampire Catheri...,4.179877,1421990,1421990.Halfway to the Grave
9,Homer's Odyssey,Gwen Cooper,"Once in nine lives, something extraordinary ha...",4.178691,6261477,6261477.Homer's Odyssey


In [13]:
books[books.book_id.isin(reviews[reviews.index.isin(top_n_indices)].book_id.tolist())]

Unnamed: 0,title,author,average_rating,ratings_count,text_reviews_count,description,shelf_1,book_id,weighted_score,name
71,"Saga, Vol. 2 (Saga, #2)",24514.0,4.56,58474.0,4591.0,From award-winning writer Brian K. Vaughan (Pr...,,17131869,4.559449,Brian K. Vaughan
135,"The Darkest Hour (Warriors, #6)",27498.0,4.52,27856.0,891.0,"ThunderClan's darkest hour is upon them, as Ti...",warriors,49006,4.518915,Erin Hunter
1239,Pete the Cat: Rocking in My School Shoes,1644249.0,4.32,8567.0,540.0,Time to head back to school with this bestsell...,picture-books,9466024,4.317648,Eric Litwin
1548,"Into the Wild (Warriors, #1)",27498.0,4.29,50490.0,3954.0,"For generations, four Clans of wild cats have ...",currently-reading,111332,4.289629,Erin Hunter
1690,"This Side of the Grave (Night Huntress, #5)",669810.0,4.28,46590.0,1978.0,Danger waits on both sides of the grave.\nWith...,vampires,6871617,4.279609,Jeaniene Frost
2269,"Saga, Vol. 1 (Saga, #1)",24514.0,4.24,142640.0,8700.0,When two soldiers from opposite sides of a nev...,,15704307,4.239886,Brian K. Vaughan
3264,Calling on Dragons (Enchanted Forest Chronicle...,36122.0,4.19,34766.0,538.0,Those wicked wizards are back--and they've bec...,fantasy,169879,4.189605,Patricia C. Wrede
3376,Where is the Green Sheep?,2131.0,4.19,5483.0,326.0,"Here is the blue sheep, and here is the red sh...",picture-book,378164,4.187512,Mem Fox
3432,"Halfway to the Grave (Night Huntress, #1)",669810.0,4.18,108349.0,5496.0,Flirting with the Grave...Half-vampire Catheri...,,1421990,4.179877,Jeaniene Frost
3530,Homer's Odyssey,179790.0,4.18,10086.0,1207.0,"Once in nine lives, something extraordinary ha...",non-fiction,6261477,4.178691,Gwen Cooper


In [15]:
rec_df = pd.DataFrame()

for idx, i in enumerate(reviews.iloc[top_n_indices].index):
    rec_df.append(book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
    rec_df.append(book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].name.tolist()[0])
    rec_df.append(round(books[books.book_id.isin(reviews[reviews.index == i].book_id.tolist())].weighted_score.tolist()[0], 2))
    rec_df.append(reviews[reviews.index == i].review_text.tolist()[0])

TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid

In [93]:
res = find_books('I love hard scifi', reviews, books, 15)

In [103]:
top_n_indices = find_reviews('I love hard scifi', reviews, 15)
book_recommends = find_books('I love hard scifi', reviews, books, 15)
book_recommends['for_url'] = book_recommends['book_id'].astype(str) + '.' + book_recommends['title'].replace(r'\(.*$', '', regex = True)

for idx, i in enumerate(reviews.iloc[top_n_indices].index):
    print(idx)
    print(i)
    print('Book title:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
    print('Author:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].name.tolist()[0])
    print('Weighted Score:', books[books.book_id.isin(reviews[reviews.index == i].book_id.tolist())].weighted_score.tolist()[0])
    print('Similar review:', reviews[reviews.index == i].review_text.tolist()[0])
    print('Goodreads Link:', 'https://www.goodreads.com/book/show/' + book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].for_url.tolist()[0])
    print('\n\n')

0
327691
Book title: Ancillary Justice (Imperial Radch, #1)
Author: Ann Leckie
Weighted Score: 3.9699322483318085
Similar review: #wickedawesome I love me some epic sci-fi
Goodreads Link: https://www.goodreads.com/book/show/17333324.Ancillary Justice 



1
210875
Book title: Ready Player One
Author: Ernest Cline
Weighted Score: 4.3094569028261365
Similar review: I love SciFi but could not get into this
Goodreads Link: https://www.goodreads.com/book/show/20603758.Ready Player One



2
248678
Book title: The Iron King (The Iron Fey, #1)
Author: Julie Kagawa
Weighted Score: 3.9199984214184576
Similar review: i like a little sci fi with my fantasy
Goodreads Link: https://www.goodreads.com/book/show/6644117.The Iron King 



3
107553
Book title: The Door Into Summer
Author: Robert A. Heinlein
Weighted Score: 3.99972628465949
Similar review: One of my all time favorite sci-fi books and I'm not a sci-fi fan.
Goodreads Link: https://www.goodreads.com/book/show/348.The Door Into Summer



4
164

In [61]:
def find_url(query, revires, books, n_results=5):
    book_data = find_books(query, reviews, books, n_results)[['title', 'book_id']]
    book_data['for_url'] = book_data['book_id'].astype(str) + '.' + book_data['title'].replace(r'\(.*$', '', regex = True)
    return book_data
url =  find_url('I love hard scifi', reviews, books, 10)

In [18]:
def show_recommendations(query, reviews, books, n_results=5):
    top_n_indices = find_reviews(query, reviews, 15)
    book_recommends = find_books(query, reviews, books, 15)
    book_recommends['for_url'] = book_recommends['book_id'].astype(str) + '.' + book_recommends['title'].replace(r'\(.*$', '', regex = True)
    common_titles = []
    for idx, i in enumerate(reviews.iloc[top_n_indices].index):
        common_titles.append(book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
        print(idx)
        print(i)
        print('Book title:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
        print('Author:', book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].name.tolist()[0])
        print('Weighted Score:', books[books.book_id.isin(reviews[reviews.index == i].book_id.tolist())].weighted_score.tolist()[0])
        print('Similar review:', reviews[reviews.index == i].review_text.tolist()[0])
        print('Goodreads Link:', 'https://www.goodreads.com/book/show/' + book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].for_url.tolist()[0])
        print('\n\n')
    return common_titles

In [21]:
common_titles = show_recommendations('cat', reviews, books, 50)

0
386512
Book title: This Side of the Grave (Night Huntress, #5)
Author: Jeaniene Frost
Weighted Score: 4.279608603987263
Similar review: Cat is particularly badass in this one.
Goodreads Link: https://www.goodreads.com/book/show/6871617.This Side of the Grave 



1
282434
Book title: Into the Wild (Warriors, #1)
Author: Erin Hunter
Weighted Score: 4.289628682013339
Similar review: It was about cats. I love cats.
Goodreads Link: https://www.goodreads.com/book/show/111332.Into the Wild 



2
412833
Book title: How to Tell If Your Cat Is Plotting to Kill You
Author: Matthew Inman
Weighted Score: 3.9499246100014016
Similar review: Cats are awesome, especially Bob Cats!
Goodreads Link: https://www.goodreads.com/book/show/15799936.How to Tell If Your Cat Is Plotting to Kill You



3
310544
Book title: Where is the Green Sheep?
Author: Mem Fox
Weighted Score: 4.187512349310434
Similar review: Cat introduced me to the green sheep! Thanks Cat!
Goodreads Link: https://www.goodreads.com/book/sho

In [31]:
most_common = Counter(common_titles).most_common(15)
for title, rank in most_common:
    if rank > 1:
        print(f'Title: {title}, \nAppeared: {rank} times')
    

Title: How to Tell If Your Cat Is Plotting to Kill You, 
Appeared: 3 times
Title: Into the Wild (Warriors, #1), 
Appeared: 2 times
Title: I Could Pee on This: And Other Poems by Cats, 
Appeared: 2 times


In [44]:
sidetable = pd.DataFrame(most_common, index=[x + 1 for x in range(len(most_common))]).rename(columns={0:'Title', 1:'No. of appearences'})
sidetable.iloc[0]['No. of appearences']

3

In [None]:
def common_results(most_common):
    for title, rank in most_common:
        if rank > 1:
            a

In [17]:
top_n_indices = find_reviews('I love hard scifi', reviews, 10)
url = find_url('I love hard scifi', reviews, books, 10)
common_titles = []
for i in reviews.iloc[top_n_indices].index:
    common_titles.append(book_recommends[book_recommends.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])a
    print('Book title:', res[res.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].title.tolist()[0])
    print('Author:', res[res.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].name.tolist()[0])
    print('Weighted Score:', books[books.book_id.isin(reviews[reviews.index == i].book_id.tolist())].weighted_score.tolist()[0])
    print('Similar review:', reviews[reviews.index == i].review_text.tolist()[0])
    print('Goodreads Link:', 'https://www.goodreads.com/book/show/' + url[url.book_id == (reviews[reviews.index == i].book_id.tolist()[0])].for_url.tolist()[0])
    print('\n\n')

NameError: name 'find_url' is not defined

In [40]:
res['for_url'] = res['book_id'].astype(str) + '.' + res['title'].replace(r'\(.*$', '', regex = True)

In [41]:
res

Unnamed: 0,title,name,description,weighted_score,book_id,for_url
0,Ready Player One,Ernest Cline,A world at stake.\nA quest for the ultimate pr...,4.309457,20603758,20603758.Ready Player One
1,"Ender's Game (Ender's Saga, #1)",Orson Scott Card,"Andrew ""Ender"" Wiggin thinks he is playing com...",4.299977,375802,375802.Ender's Game
2,"Wool Omnibus (Silo, #1)",Hugh Howey,This Omnibus Edition collects the five Wool bo...,4.239805,13453029,13453029.Wool Omnibus
3,"Obsidian (Lux, #1)",Jennifer L. Armentrout,Starting over sucks.\nWhen we moved to West Vi...,4.229906,12578077,12578077.Obsidian
4,"Hyperion (Hyperion Cantos, #1)",Dan Simmons,"On the world called Hyperion, beyond the law o...",4.209882,77566,77566.Hyperion
5,The Dispossessed,Ursula K. Le Guin,"Shevek, a brilliant physicist, decides to take...",4.179725,13651,13651.The Dispossessed
6,"Spin (Spin, #1)",Robert Charles Wilson,One night in October when he was ten years old...,4.009821,910863,910863.Spin
7,The Door Into Summer,Robert A. Heinlein,"It is 1970, and electronics engineer Dan Davis...",3.999726,348,348.The Door Into Summer
8,"Ancillary Justice (Imperial Radch, #1)",Ann Leckie,"On a remote, icy planet, the soldier known as ...",3.969932,17333324,17333324.Ancillary Justice
9,"The Iron King (The Iron Fey, #1)",Julie Kagawa,Meghan Chase has a secret destiny; one she cou...,3.919998,6644117,6644117.The Iron King


In [9]:
for idx, i in enumerate(res.book_id):
    print(res[res.book_id==i][['title','name']])

              title          name
0  Ready Player One  Ernest Cline
                             title              name
1  Ender's Game (Ender's Saga, #1)  Orson Scott Card
                     title        name
2  Wool Omnibus (Silo, #1)  Hugh Howey
                title                    name
3  Obsidian (Lux, #1)  Jennifer L. Armentrout
                            title         name
4  Hyperion (Hyperion Cantos, #1)  Dan Simmons
              title               name
5  The Dispossessed  Ursula K. Le Guin
             title                   name
6  Spin (Spin, #1)  Robert Charles Wilson
                  title                name
7  The Door Into Summer  Robert A. Heinlein
                                    title        name
8  Ancillary Justice (Imperial Radch, #1)  Ann Leckie
                              title          name
9  The Iron King (The Iron Fey, #1)  Julie Kagawa


import pandas as pd
test = pd.read_json('/media/einhard/Seagate Expansion Drive/3380_data/data/Filtered books/20603758.Ready Player One.json')

In [16]:
from collections import Counter

# Creating cluster function

In [1]:
books[books.title.isin(['Children of Time'])].book_id.tolist()

NameError: name 'books' is not defined

In [9]:
from sklearn.cluster import KMeans

In [3]:
titles_list = ['Children of Time']
input_books = reviews[reviews.book_id.isin(books[books.title.isin(titles_list)].book_id.tolist())]

NameError: name 'reviews' is not defined

In [2]:
input_books

NameError: name 'input_books' is not defined

In [47]:
reviews

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,831635,549bc8e5f48d65e981cf2abb3ff7ee6f,0,Recommended by our pediatrician,Mon May 16 12:29:47 -0700 2011,0,12
1,8842281e1d1347389f2ab93d60773d4d,27539,b45353478a136de0a3e93077d47b1d6b,0,6 of my smartest friends rated this 4 or highe...,Sun Nov 15 20:28:29 -0800 2009,0,2
2,8842281e1d1347389f2ab93d60773d4d,259028,fb4acc8a30bac6bf1414a03303d43c26,0,"If steve recommends it, it must be good!",Thu Jan 18 11:09:48 -0800 2007,2,2
3,8842281e1d1347389f2ab93d60773d4d,5558,3c01b327de615dc96910130797da1224,0,elizabeth got this for free at work... anyone ...,Thu Dec 07 16:43:16 -0800 2006,0,0
4,8842281e1d1347389f2ab93d60773d4d,1202,8458284a7dcf85d90b3c55bd6b4523d4,4,I loved the abortion cured crime theory.,Mon Nov 13 12:31:32 -0800 2006,6,2
...,...,...,...,...,...,...,...,...
429621,349751201c57bc3b652590c6c90d894c,12067,cf4da327f4a32c6325eca4f8c52ff11c,4,Lots of fun. I'll have to try some more Terry ...,Wed Sep 10 20:54:42 -0700 2008,0,0
429622,349751201c57bc3b652590c6c90d894c,5907,ac0e9d4b8b2ab0144402c22d90ba1df3,5,What can I say about Tolkien? Just read his wo...,Wed Jul 30 18:50:48 -0700 2008,0,0
429623,621c5857e423b0f2283a22604d1796a0,47956,50b83b9ea55af86f2561fa35ba1ff483,5,This book is an exhilarating book of magic and...,Wed Jul 30 20:59:21 -0700 2008,0,0
429624,2c2733058db98a841851789c132e0334,5139,200307bd0e38a59982a77a89157771d9,4,"Really enjoyed this book. Crazy, fast paced fa...",Mon Mar 03 04:22:30 -0800 2014,0,0


In [50]:
book_title = 'Children of Time'
review_sentences = reviews[reviews.book_id.isin(books[books.title.isin([book_title])].book_id.tolist())]

In [51]:
review_sentences

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,n_votes,n_comments
25743,e4521937c11019458c255a2b0e2b4478,25499718,5d905af5b0c38ce50b774a27201b16bb,5,Really liked this one. Great world building,Fri May 26 14:16:55 -0700 2017,0,0
30878,dea045d73d8f99ce94b9a553db883adc,25499718,a90dcd0aab1ec39a8510eda6e3599222,5,Fantastic and original science fiction.,Mon Aug 07 14:29:40 -0700 2017,2,0
31678,8338c8c8200fe91ea3003b266178e5e6,25499718,ca84f1ba114f8a8e80c4498065db5962,5,Very enjoyable book. Unexpected ending. Settin...,Fri Aug 25 14:37:04 -0700 2017,0,0
76169,47fae68ca65e21b07f87d41011959a38,25499718,d5e7e630b91becc8a7ec4d4c7d5f9bd6,4,I was surprised at how good this was. \n That ...,Sun Jun 14 03:19:04 -0700 2015,0,0
79342,2a3861c918a64eebd8df90d73b15b1d1,25499718,0e3d1a17405c1fc41881b4abf637ec85,4,"Excellent novel, interesting concepts that mak...",Thu Aug 13 07:19:29 -0700 2015,0,0
92377,d8b2e3c0dda7aff9960c3152d0736866,25499718,0efb2d7f53bbf0035305e88226bc0c05,5,A very unique and well written story.,Mon May 08 04:11:52 -0700 2017,0,0
117367,25eb1bd04399b96a2fd34ceed7415136,25499718,7a194c2ad2e070181b702eb8c9141bde,4,Very imaginative. Well crafted and original. H...,Sat Jan 07 07:28:10 -0800 2017,0,0
132593,73fa6167cab90e3a5a9b9264ff79a54b,25499718,2654f9623501fa19b95f0812f27752ad,4,"Good read, reminded me of Vernor Vinge and his...",Sun Sep 17 03:53:44 -0700 2017,0,0
139035,360cbcc37573e09cb6662da5c1837d4f,25499718,a5d0faac62ec0fa50cc86a7be3c45531,4,Spiders are better than people. I love it.,Wed Jun 28 14:21:03 -0700 2017,0,0
150883,d7b8e12475810415abe85902c56d8a14,25499718,6b888359ce288026a2fa4d07783351d6,4,** spoiler alert ** \n Meet Porcia \n Spider W...,Mon Dec 19 02:59:02 -0800 2016,0,0


In [66]:
def load_sentences(book_title):
    '''
    Function to load and embed a book's sentences
    '''
    # Copy sentence column to new variable
    sentences = reviews[reviews.book_id.isin(books[books.title.isin([book_title])].book_id.tolist())]['review_text']

    # Vectorize sentences
    sentence_vectors = embed(sentences)
    
    return sentences, sentence_vectors

def get_clusters(sentences, sentence_vectors, k, n):
    '''
    Function to extract the n most representative sentences from k clusters
    '''

    # Instantiate the model
    kmeans_model = KMeans(n_clusters=k)

    # Fit the model
    kmeans_model.fit(sentence_array)
    
    # Loop through number of clusters
    clusters = pd.DataFrame()
    for i in range(k):

        # Define cluster centre
        centre = kmeans_model.cluster_centers_[i]

        # Calculate inner product of cluster centre and sentence vectors
        ips = np.inner(centre, sentence_vectors)

        # Find the sentences with the highest inner products
        top_indices = pd.Series(ips).nlargest(n).index
        top_sentences = list(sentences.iloc[top_indices])

        to_append = pd.Series([top_sentences, k])
        print(to_append)
        clusters.append(to_append, ignore_index=True)
    return clusters

In [67]:
from sklearn.cluster import KMeans

In [68]:
sentences, sentence_vectors = load_sentences(book_title)

In [74]:
sentences

25743           Really liked this one. Great world building
30878               Fantastic and original science fiction.
31678     Very enjoyable book. Unexpected ending. Settin...
76169     I was surprised at how good this was. \n That ...
79342     Excellent novel, interesting concepts that mak...
92377                 A very unique and well written story.
117367    Very imaginative. Well crafted and original. H...
132593    Good read, reminded me of Vernor Vinge and his...
139035           Spiders are better than people. I love it.
150883    ** spoiler alert ** \n Meet Porcia \n Spider W...
157642    Lovely theme of a different evolutionary outco...
186069    Excellent! Can't wait to read more books by th...
194830    Wheeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee...
206205              Stunning. Hugo award winner in waiting.
220905                Brilliant! I wish there were sequels!
299113                     A great story, a very quick read
344076    Pretty much blew my mind and g

In [73]:
data = get_clusters(sentences, sentence_vectors, k=2, n=1)

In [72]:
data

In [45]:
def load_sentences(book_title):
    '''
    Function to load and embed a book's sentences
    '''
    # Copy sentence column to new variable
    sentences = input_books['review_text']

    # Vectorize sentences
    sentence_vectors = embed(sentences)
    
    return sentences, sentence_vectors

def get_clusters(sentences, sentence_vectors, k, n):
    '''
    Function to extract the n most representative sentences from k clusters
    '''

    # Instantiate the model
    kmeans_model = KMeans(n_clusters=k)

    # Fit the model
    kmeans_model.fit(sentence_array)
    
    # Loop through number of clusters
    for i in range(k):

        # Define cluster centre
        centre = kmeans_model.cluster_centers_[i]

        # Calculate inner product of cluster centre and sentence vectors
        ips = np.inner(centre, sentence_vectors)

        # Find the sentences with the highest inner products
        top_indices = pd.Series(ips).nlargest(n).index
        top_sentences = list(sentences.iloc[top_indices])

In [28]:
book_sentences, book_review_vectors = load_sentences(input_books)

In [31]:
get_clusters(book_sentences, book_review_vectors, k=5, n=8)

Cluster #1 sentences:

Really liked this one. Great world building

Very enjoyable book. Unexpected ending. Setting the stage for more.

Brilliant! I wish there were sequels!

Pretty much blew my mind and got me thinking about everything I thought I knew...

A very unique and well written story.

A great story, a very quick read

I was surprised at how good this was. 
 That is not something I experience very often.

Excellent! Can't wait to read more books by this author!!


Cluster #2 sentences:

Excellent! Can't wait to read more books by this author!!

Brilliant! I wish there were sequels!

Good read, reminded me of Vernor Vinge and his Zones of thought series.

Very enjoyable book. Unexpected ending. Setting the stage for more.

Really liked this one. Great world building

Excellent novel, interesting concepts that make you think.

A very unique and well written story.

A great story, a very quick read


Cluster #3 sentences:

Excellent! Can't wait to read more books by this author