In [1]:
import numpy as np
import pandas as pd

# To process embeddings
import tensorflow as tf
import tensorflow_hub as hub

# To silence warnings from TensorFlow
import os
import logging
import warnings;
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

# To load saved embeddings
import joblib

# To create webapp
import psutil


#######################################################################################
                            # Load functions
#######################################################################################

def load_embeddings():
    # Path to USE
    embed = hub.load('/media/einhard/Seagate Expansion Drive/3380_data/data/tensorflow_hub/universal-sentence-encoder_4')

    # Load pre-trained sentence arrays
    sentence_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/reviewEmbeddings.pkl')
    descriptions_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionEmbeddings.pkl')
    return embed, sentence_array, descriptions_array

def data_loader(datapath, books_file, reviews_file):
    books = pd.read_csv(datapath + books_file).drop('Unnamed: 0', axis=1)
    reviews = pd.read_csv(datapath + reviews_file).drop('Unnamed: 0', axis=1)
    return books, reviews


                    # Return recommendations based on reviews
def find_reviews(query,reviews, n_results=5):
    # Create vector from query and compare with global embedding
    sentence = [query]
    sentence_vector = np.array(embed(sentence))
    inner_product = np.inner(sentence_vector, sentence_array)[0]

    # Find sentences with highest inner products
    top_n_sentences = pd.Series(inner_product).nlargest(n_results+1)
    top_n_indices = top_n_sentences.index.tolist()
    top_n_list = list(reviews.review_text.iloc[top_n_indices][1:])

    #print(f'Input sentence: "{query}"\n')
    #print(f'{n_results} most semantically similar reviews: \n\n')
    #print(*top_n_list, sep='\n\n')
    #print(top_n_indices)
    return top_n_indices

def find_books(query, reviews, books, n_results=5):
    top_n_indices = find_reviews(query, reviews, n_results)
    return books[books.book_id.isin(reviews.iloc[top_n_indices].book_id.tolist())][['title', 'name','description', 'weighted_score', 'book_id']].fillna('').reset_index().drop('index', axis=1)

                    # Return recommendations based on descriptions

def find_description(query, books, n_results=10):
    # Create vector from query and compare with global embedding
    sentence = [query]
    sentence_vector = np.array(embed(sentence))
    inner_product = np.inner(sentence_vector, descriptions_array)[0]

    # Find sentences with highest inner products
    top_n_sentences = pd.Series(inner_product).nlargest(n_results)
    top_n_indices = top_n_sentences.index.tolist()
    top_n_list = list(books.description.iloc[top_n_indices][1:])

    #print(f'Input sentence: "{query}"\n')
    #print(f'{n_results} most semantically similar book descriptions: \n\n')
    #print(*top_n_list, sep='\n\n')
    return top_n_indices

def find_books_description(query, reviews, books):
    top_n_indices = find_description(query)
    return books[books.book_id.isin(books.iloc[top_n_indices].book_id.tolist())][['title', 'name','description', 'weighted_score', 'book_id']].fillna('')



#######################################################################################
                            # Load variables and data
#######################################################################################

# Path to books and reviews DataFrames
datapath = '/media/einhard/Seagate Expansion Drive/3380_data/data/Filtered books/'

# Books and reviews file names and loading
books_file = 'filtered_books.csv'
reviews_file = 'filtered_reviews.csv'

books, reviews = data_loader(datapath, books_file, reviews_file)
embed, sentence_array, descriptions_array = load_embeddings()


#######################################################################################
                                # Web App
#######################################################################################

In [25]:
res = find_books('I love hard scifi', reviews, books, 10)

In [26]:
res

Unnamed: 0,title,name,description,weighted_score,book_id
0,Ready Player One,Ernest Cline,A world at stake.\nA quest for the ultimate pr...,4.309457,20603758
1,"Ender's Game (Ender's Saga, #1)",Orson Scott Card,"Andrew ""Ender"" Wiggin thinks he is playing com...",4.299977,375802
2,"Wool Omnibus (Silo, #1)",Hugh Howey,This Omnibus Edition collects the five Wool bo...,4.239805,13453029
3,"Obsidian (Lux, #1)",Jennifer L. Armentrout,Starting over sucks.\nWhen we moved to West Vi...,4.229906,12578077
4,"Hyperion (Hyperion Cantos, #1)",Dan Simmons,"On the world called Hyperion, beyond the law o...",4.209882,77566
5,The Dispossessed,Ursula K. Le Guin,"Shevek, a brilliant physicist, decides to take...",4.179725,13651
6,"Spin (Spin, #1)",Robert Charles Wilson,One night in October when he was ten years old...,4.009821,910863
7,The Door Into Summer,Robert A. Heinlein,"It is 1970, and electronics engineer Dan Davis...",3.999726,348
8,"Ancillary Justice (Imperial Radch, #1)",Ann Leckie,"On a remote, icy planet, the soldier known as ...",3.969932,17333324
9,"The Iron King (The Iron Fey, #1)",Julie Kagawa,Meghan Chase has a secret destiny; one she cou...,3.919998,6644117


# Getting book_id.title format for scrapper

In [27]:
res['for_url'] = res['book_id'].astype('str') + '.' + res['title'].str.replace(r"\(.*$", '')

In [32]:
res.for_url.to_csv(datapath + 'book_ids', header=None, index=None, mode='a')

In [None]:
!python get_reviews.py --book_ids_path goodreads_classics_sample.txt \
--output_directory_path classic_book_reviews --sort_order default --browser chrome

In [7]:
def filter_scraped_reviews(reviews_df):
    reviews_df.text.dropna(inplace=True)
    # reviews_df = reviews_df[(reviews_df.text.str.len() < max_review_length)]

    # Remove reviews that specify rating in body of text
    reviews_df = reviews_df[~reviews_df.text.str.contains(r'\d\sstar', case=False, regex=True, na=False)]
    reviews_df = reviews_df[~reviews_df.text.str.contains(r'\d\/\d', case=False, regex=True, na=False)]
    reviews_df = reviews_df[~reviews_df.text.str.contains(r'\d\.\d', case=False, regex=True, na=False)]
    reviews_df = reviews_df[~reviews_df.text.str.contains(r'\d\S+\D', case=False, regex=True, na=False)]

    # Remove reviews that contain link to other websites
    reviews_df = reviews_df[~reviews_df.text.str.contains(r'http\S+', case=False, regex=True, na=False)]
    reviews_df = reviews_df[~reviews_df.text.str.contains(r'www\S+', case=False, regex=True, na=False)]
    return reviews_df.text

In [8]:
import pandas as pd
test = pd.read_json('/media/einhard/Seagate Expansion Drive/3380_data/data/Filtered books/20603758.Ready Player One.json')

In [9]:
test

Unnamed: 0,book_id_title,book_id,book_title,review_url,review_id,date,rating,user_name,user_url,text,num_likes,sort_order,shelves
0,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/200552364,200552364,2011-08-20,1,Kemper,/user/show/405390-kemper,I originally gave this book 3 stars as harmles...,2560,default,"[2011, cyber, sci-fi, future-is-now, humor, dy..."
1,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/1464451121,1464451121,2015-12-09,4,"Khanh, first of her name, mother of bunnies",/user/show/4527753-khanh-first-of-her-name-mot...,This book is a geek fantasy. A nerd utopia. Sp...,764,default,[]
2,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/205484356,205484356,2011-09-05,2,William Cline,/user/show/5006423-william-cline,"For most of the first half of this book, I was...",2054,default,[]
3,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/264022073,264022073,2012-01-18,1,Melissa McShane,/user/show/1329059-melissa-mcshane,ETA: At the risk of getting this more attentio...,775,default,"[science-fiction, did-not-finish, dystopian, f..."
4,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/171002486,171002486,2011-05-25,5,Patrick,/user/show/922495-patrick,"I got to read an ARC of this, and it appealed ...",1389,default,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/2169363535,2169363535,2017-11-03,5,Dennis,/user/show/67496521-dennis,5/5 stars!!!\nReady Player One\n was the ride ...,62,default,"[sci-fi, dystopian, 2017-read]"
146,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/1089443319,1089443319,2014-10-26,5,Kaora,/user/show/7983564-kaora,My husband and I are HUGE nerds. We have a nin...,62,default,[]
147,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/3107648613,3107648613,2019-12-30,1,Zitong Ren,/user/show/77983193-zitong-ren,What a sad way to end the year and decade.This...,59,default,"[diverse, dystopian, action, young-adult, 2019..."
148,20603758.Ready Player One,20603758,Ready Player One,https://www.goodreads.com/review/show/1348220177,1348220177,2015-07-28,4,Glenn Sumi,/user/show/28854579-glenn-sumi,"UPDATE: March 2018, after watching Steven Spie...",59,default,[]


In [10]:
filter_scraped_reviews(test)

4      I got to read an ARC of this, and it appealed ...
8      ladies and gentlemen, from this day this book ...
15     This reached into the gamer part of my heart a...
17     I just kinda wanna cry right now. I'll have a ...
29     THIS IS POSSIBLY THE BEST BOOK IN EXISTENCE. L...
33     I had the feeling while reading this book that...
38     THAT WAS SO TOTALLY AWESOME, WOW! I have not e...
40     Loved this! I know many people also loved this...
43                    This book was so much fun to read.
45     \n\nit's like Ernest Cline decided to do somet...
46     *had to bump this up a star after watching the...
48     I loved this book, but mostly because I read W...
49     I enjoyed this book, although with all the hyp...
61     This book. I approve of this book.So I found o...
71     I think I would have liked this a lot more if ...
75     Such a refreshingly unique dystopian novel. Lo...
76     Oh. My. God. This. Book. Let me be clear by sa...
81     I wish I could give this

# Creating cluster function

In [8]:
books[books.title.isin(['Children of Time'])].book_id.tolist()

[25499718]

In [9]:
from sklearn.cluster import KMeans

In [3]:
titles_list = ['Children of Time']
input_books = reviews[reviews.book_id.isin(books[books.title.isin(titles_list)].book_id.tolist())]

NameError: name 'reviews' is not defined

In [2]:
input_books

NameError: name 'input_books' is not defined

In [29]:
def load_sentences(review_texts):
    '''
    Function to load and embed a book's sentences
    '''
    # Copy sentence column to new variable
    sentences = input_books['review_text']

    # Vectorize sentences
    sentence_vectors = embed(sentences)
    
    return sentences, sentence_vectors

def get_clusters(sentences, sentence_vectors, k, n):
    '''
    Function to extract the n most representative sentences from k clusters
    '''

    # Instantiate the model
    kmeans_model = KMeans(n_clusters=k)

    # Fit the model
    kmeans_model.fit(sentence_array)
    
    # Loop through number of clusters
    for i in range(k):

        # Define cluster centre
        centre = kmeans_model.cluster_centers_[i]

        # Calculate inner product of cluster centre and sentence vectors
        ips = np.inner(centre, sentence_vectors)

        # Find the sentences with the highest inner products
        top_indices = pd.Series(ips).nlargest(n).index
        top_sentences = list(sentences.iloc[top_indices])

        print(f'Cluster #{i+1} sentences:\n')
        print(*top_sentences, sep='\n\n')
        print('\n')

In [28]:
book_sentences, book_review_vectors = load_sentences(input_books)

In [31]:
get_clusters(book_sentences, book_review_vectors, k=5, n=8)

Cluster #1 sentences:

Really liked this one. Great world building

Very enjoyable book. Unexpected ending. Setting the stage for more.

Brilliant! I wish there were sequels!

Pretty much blew my mind and got me thinking about everything I thought I knew...

A very unique and well written story.

A great story, a very quick read

I was surprised at how good this was. 
 That is not something I experience very often.

Excellent! Can't wait to read more books by this author!!


Cluster #2 sentences:

Excellent! Can't wait to read more books by this author!!

Brilliant! I wish there were sequels!

Good read, reminded me of Vernor Vinge and his Zones of thought series.

Very enjoyable book. Unexpected ending. Setting the stage for more.

Really liked this one. Great world building

Excellent novel, interesting concepts that make you think.

A very unique and well written story.

A great story, a very quick read


Cluster #3 sentences:

Excellent! Can't wait to read more books by this author