In [25]:
import numpy as np
import pandas as pd

# To process embeddings
import tensorflow_hub as hub

# To create sentence clusters
from sklearn.cluster import KMeans

# To load saved embeddings
import joblib

# To match strings
import re

# To sort final recommendation list
from collections import Counter

def dataLoader(datapath, books_file, reviews_file, reviewsAll_file):
    '''
    Loads DataFrames with books and review information
    '''
    books = pd.read_csv(datapath + books_file).drop('Unnamed: 0', axis=1).fillna('')
    reviews = pd.read_csv(datapath + reviews_file).drop('Unnamed: 0', axis=1)
    reviewsAll = pd.read_csv(datapath + reviewsAll_file).drop('Unnamed: 0', axis=1)
    return books, reviews, reviewsAll

def loadEmbeddings():
    '''
    Loads pre-trained sentence and review arrays
    '''
    # Path to USE
    embed = hub.load('/media/einhard/Seagate Expansion Drive/3380_data/data/tensorflow_hub/universal-sentence-encoder_4')

    # Load pre-trained sentence arrays
    ## Reviews array is a set of embeddings trained on review lengths of < 90 characters
    reviews_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/reviewEmbeddings.pkl')
    ## Descriptions array is a set of embeddings trained on all book descriptions
    descriptions_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionEmbeddings.pkl')

    return embed, reviews_array, descriptions_array

def embedInputs(books_df, review_df, search_param, review_max_len, searchTitle=True):
    '''
    Converts input reviews into USE arrays. Returns vectorized reviews for the book that was
    passed as bookTitle.

    Args:
        search_param = List of book titles or author names whose reviews we want to embed. For authors, see 'searchTitle' argument
        books_df = DataFrame with book_id information
        review_df = DataFrame with book_id and review text
        searchTitle = If True, will search for book_id based on title of a book. If False, it will look for author names to find book_id.
    '''
    if searchTitle:
        #Finds book_id from title of book
        input_book_id = books_df[books_df.title.isin([search_param])].book_id.tolist()
    else:
        # Finds book_id from author name
        input_book_id = books_df[books_df.name.isin([search_param])].book_id.tolist()

    # Finds reviews for specified book
    input_sentences = review_df[review_df.book_id.isin(input_book_id)].review_text

    # Filters review length
    input_sentences = input_sentences[input_sentences.str.len() <= review_max_len]

    # Converts reviews into 512-dimensional arrays
    input_vectors = embed(input_sentences)

    # Returns reviews and vectorized reviews for a particular book
    return input_sentences, input_vectors

def getClusters(input_vectors, n_clusters):
    '''
    Creates KMeans instance and fits model.

    The for nested for loop is used to display the returned sentences on Streamlit.

    Args:
        input_sentences =  Sentences to compare
        n_clusters = How many clusters to generate
    '''
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, algorithm='full')
    return kmeans.fit(input_vectors)


def showClusters(input_sentences, input_vectors, authorTitle, n_clusters, n_results, model, searchTitle=True):
    '''
    This function will find theme clusters in the reviews of a particular book or set of sentences.
    Uses cluster centers to find semantically similar sentences to the input vectors.

    The nested for loop is used to display the returned sentences on Streamlit.

    Args:
        input_sentences =  Sentences to compare
        input_vectors = USE Array generated by embedding input sentences
        authorTitle = Title of book in question, or name of author --> Used to display header only
        n_clusters = How many clusters to generate
        n_results = How many sentences to display per n_cluster
        model = The model used to create the clusters.
    '''
    if searchTitle:
        # Displays which book's reviews are being clustered
        print(f'Opinion clusters about *{authorTitle}*')
    else:
        print(f'Opinion clusters about {authorTitle}\'s books')

    # Iterates through centroids to and computes inner products to find nlargest
    for i in range(n_clusters):
        centre = model.cluster_centers_[i]
        inner_product = np.inner(centre, input_vectors)
        indices = pd.Series(inner_product).nlargest(n_results).index
        clusteredInputs = list(input_sentences.iloc[indices])

        # Prints reviews that are closest to centroid
    
        print(f'**Cluster #{i+1}**')
        for sentence in clusteredInputs:
            print(sentence)



In [26]:
# Paths to books and reviews DataFrames
datapath = '/media/einhard/Seagate Expansion Drive/3380_data/data/Filtered books/'
#books_file = 'clean_filtered_books.csv'
#reviews_file = 'clean_filtered_reviews.csv'
#reviewsAll_file = 'reviews_for_cluster.csv'#

## Loading DataFrames
#books, reviews, reviewsAll = dataLoader(datapath, books_file, reviews_file, reviewsAll_file)

# Loadding pre-trained embeddings and embedder for input sentences
embed, reviews_array, descriptions_array = loadEmbeddings()

# Tokenizing reviews into sentences

In [5]:
booksdescription	

Unnamed: 0,title,description,book_id,weighted_score,name
0,Poet Of The Wrong Generation,"""It's not that I don't love you, and my tears ...",31675691,4.805233,Lonnie Ostrow
1,"Words of Radiance (The Stormlight Archive, #2)",From #1 New York Times bestselling author Bran...,17332218,4.769454,Brandon Sanderson
2,Mark of the Lion Trilogy,,95602,4.755426,Francine Rivers
3,The Jesus Storybook Bible: Every Story Whisper...,The Moonbeam Award Gold Medal Winner in the re...,165068,4.743188,Sally Lloyd-Jones
4,"Lodestar (Keeper of the Lost Cities, #5)",Dark schemes unfold--and Sophie's loyalty is p...,27272698,4.742152,Shannon Messenger
...,...,...,...,...,...
12357,The Birthing House,,6017367,2.476325,Christopher Ransom
12358,One Night at the Call Center,Press 1 for technical support.\nPress 2 for br...,105578,2.471741,Chetan Bhagat
12359,"Cleaving: A Story of Marriage, Meat, and Obses...",Julie Powell thought cooking her way through J...,6072179,2.440292,Julie Powell
12360,Citizen Girl,Another biting satire from Emma McLaughlin and...,33993,2.404365,Emma McLaughlin


In [6]:
from nltk.tokenize import sent_tokenize

In [7]:
def make_sentences(reviews_df):
    '''
    Copyright (c) 2020 Willie Costello
    '''
    # Initialize dataframe to store review sentences, and counter
    sentences_df = pd.DataFrame()
    ctr = 0

    print(f'Starting tokenization')

    # Loop through each review
    for i in range(len(reviews_df)):

        # Save row and review to variables
        row = reviews_df.iloc[i]
        review = row.loc['description']

        # Tokenize review into sentences
        sentences = sent_tokenize(review)

        # Loop through each sentence in list of tokenized sentences
        for sentence in sentences:
            # Add row for sentence to sentences dataframe
            new_row = row.copy()
            new_row.at['description'] = sentence
            sentences_df = sentences_df.append(new_row, ignore_index=True)

        ctr += 1
        if (ctr % 500 == 0):
            print(f'{ctr} reviews tokenized')

    print(f'Tokenization complete: {len(sentences_df)} sentences tokenized\n')

    # Rename review column
    sentences_df.rename(columns={'description':'description_tokenized'}, inplace=True)

    return sentences_df

In [8]:
descriptionsTokenized = make_sentences(books)

Starting tokenization
500 reviews tokenized
1000 reviews tokenized
1500 reviews tokenized
2000 reviews tokenized
2500 reviews tokenized
3000 reviews tokenized
3500 reviews tokenized
4000 reviews tokenized
4500 reviews tokenized
5000 reviews tokenized
5500 reviews tokenized
6000 reviews tokenized
6500 reviews tokenized
7000 reviews tokenized
7500 reviews tokenized
8000 reviews tokenized
8500 reviews tokenized
9000 reviews tokenized
9500 reviews tokenized
10000 reviews tokenized
10500 reviews tokenized
11000 reviews tokenized
11500 reviews tokenized
12000 reviews tokenized
Tokenization complete: 105923 sentences tokenized



In [10]:
descriptionsTokenized.to_csv('/media/einhard/Seagate Expansion Drive/3380_data/data/tokenized sentences/descriptionsTokenized.csv')

In [2]:
import pandas as pd
descriptionsTokenized = pd.read_csv('/media/einhard/Seagate Expansion Drive/3380_data/data/tokenized sentences/descriptionsTokenized.csv').drop('Unnamed: 0', axis=1)

In [24]:
descriptionsTokenized[(descriptionsTokenized.description_tokenized.str.len() > 10)]

Unnamed: 0,book_id,description_tokenized,name,title,weighted_score
0,31675691.0,"""It's not that I don't love you, and my tears ...",Lonnie Ostrow,Poet Of The Wrong Generation,4.805233
1,31675691.0,But you can't go back and forth forever and we...,Lonnie Ostrow,Poet Of The Wrong Generation,4.805233
2,31675691.0,"Through these words, a young poet unearths his...",Lonnie Ostrow,Poet Of The Wrong Generation,4.805233
3,31675691.0,"Unknowingly, in writing this ballad of liberat...",Lonnie Ostrow,Poet Of The Wrong Generation,4.805233
4,31675691.0,"The year is 1991; the place, New York City.",Lonnie Ostrow,Poet Of The Wrong Generation,4.805233
...,...,...,...,...,...
105917,17819467.0,"OK. A present, passons aux choses serieuses.Sa...",Sasha Grey,"The Juliette Society (The Juliette Society, #1)",2.351404
105918,17819467.0,Apres avoir quitte l'univers des films pour ad...,Sasha Grey,"The Juliette Society (The Juliette Society, #1)",2.351404
105919,17819467.0,Juliette Societyest son premier roman.,Sasha Grey,"The Juliette Society (The Juliette Society, #1)",2.351404
105920,17819467.0,Un feminisme moderne pousse a l'extreme.,Sasha Grey,"The Juliette Society (The Juliette Society, #1)",2.351404


In [142]:
input_text = embed(['Poet of the Wrong Generation tells the symmetrical story of a lovable underdog and his meteoric rise to stardom, his humiliating downfall and his unprecedented attempt to reclaim his place as the unlikely musical spokesman for his generation.'])

In [149]:
test_embed = embed(descriptionsTokenized[:20].description_tokenized.reset_index().drop('index', axis=1).description_tokenized)

In [150]:
import numpy as np

In [145]:
inner_product = descriptionsTokenized[:20].description_tokenized.reset_index().drop('index', axis=1).description_tokenized.apply(lambda row: np.inner(test_embed, input_text)

In [146]:
pd.DataFrame(inner_product)

Unnamed: 0,0
0,0.990335


In [139]:
pd.DataFrame(inner_product).nlargest(5, columns=0).index

Int64Index([0], dtype='int64')

In [140]:
indices = pd.DataFrame(inner_product).nlargest(5, columns=0).index

In [141]:
descriptionsTokenized.description_tokenized[indices]

0    "It's not that I don't love you, and my tears ...
Name: description_tokenized, dtype: object

In [133]:
descriptionsTokenized.description_tokenized[indices].tolist()

['Through these words, a young poet unearths his musical soul while severing ties with the woman he loves after her stunning betrayal.',
 'Expected by his enemies to die the miserable death of a military slave, Kaladin survived to be given command of the royal bodyguards, a controversial first for a low-status "darkeyes."',
 'From Noah to Moses to the great King David--every story points to him.',
 'The year is 1991; the place, New York City.',
 'At the heart of Poet is a tale of star-crossed lovers and their struggle with unforeseen success and disillusionment, in an attempt to rediscover lasting harmony.']

In [64]:
descriptionsTokenized[20:50].description_tokenized

20    The secrets she needs can be found at the Shat...
21    Meanwhile, at the heart of the Shattered Plain...
22    Hard pressed by years of Alethi attacks, their...
23    The possible consequences for Parshendi and hu...
24    The Moonbeam Award Gold Medal Winner in the re...
25    At the center of the Story is a baby, the chil...
26                       Every story whispers his name.
27    From Noah to Moses to the great King David--ev...
28    He is like the missing piece in a puzzle--the ...
29    From the Old Testament through the New Testame...
30    A Bible like no other, The Jesus Storybook Bib...
31    Dark schemes unfold--and Sophie's loyalty is p...
32    Sophie Foster is back in the Lost Cities--but ...
33    The threat of war hangs heavy over her glitter...
34    The lines between friend and enemy have blurre...
35    But when she's warned that the people she love...
36    A mysterious symbol could be the key--if only ...
37    Every new clue seems to lead deeper into h

In [71]:
descriptionsTokenized[20:50].description_tokenized.reset_index().drop('index', axis=1).description_tokenized

0     The secrets she needs can be found at the Shat...
1     Meanwhile, at the heart of the Shattered Plain...
2     Hard pressed by years of Alethi attacks, their...
3     The possible consequences for Parshendi and hu...
4     The Moonbeam Award Gold Medal Winner in the re...
5     At the center of the Story is a baby, the chil...
6                        Every story whispers his name.
7     From Noah to Moses to the great King David--ev...
8     He is like the missing piece in a puzzle--the ...
9     From the Old Testament through the New Testame...
10    A Bible like no other, The Jesus Storybook Bib...
11    Dark schemes unfold--and Sophie's loyalty is p...
12    Sophie Foster is back in the Lost Cities--but ...
13    The threat of war hangs heavy over her glitter...
14    The lines between friend and enemy have blurre...
15    But when she's warned that the people she love...
16    A mysterious symbol could be the key--if only ...
17    Every new clue seems to lead deeper into h

In [69]:
descriptionsTokenized[20:50].description_tokenized

20    The secrets she needs can be found at the Shat...
21    Meanwhile, at the heart of the Shattered Plain...
22    Hard pressed by years of Alethi attacks, their...
23    The possible consequences for Parshendi and hu...
24    The Moonbeam Award Gold Medal Winner in the re...
25    At the center of the Story is a baby, the chil...
26                       Every story whispers his name.
27    From Noah to Moses to the great King David--ev...
28    He is like the missing piece in a puzzle--the ...
29    From the Old Testament through the New Testame...
30    A Bible like no other, The Jesus Storybook Bib...
31    Dark schemes unfold--and Sophie's loyalty is p...
32    Sophie Foster is back in the Lost Cities--but ...
33    The threat of war hangs heavy over her glitter...
34    The lines between friend and enemy have blurre...
35    But when she's warned that the people she love...
36    A mysterious symbol could be the key--if only ...
37    Every new clue seems to lead deeper into h

In [152]:
descriptionsTokenized['embedded'] = descriptionsTokenized['description_tokenized'].apply(lambda row: embed([row]))

In [191]:
input_text = embed(['Through these words, a young poet unearths his musical soul while severing ties with the woman he loves after her stunning betrayal'])

In [192]:
test = np.inner(descriptionsTokenized.embedded.tolist(), input_text)

KeyboardInterrupt: 

In [213]:
np.asarray(descriptionsTokenized.embedded.tolist())

KeyboardInterrupt: 

In [185]:
len(test)

20

In [188]:
test_index = pd.Series(test.reshape(len(test))).nlargest(5).index.tolist()

In [189]:
test_index

[2, 10, 3, 5, 8]

In [190]:
descriptionsTokenized.description_tokenized[test_index].tolist()

['Through these words, a young poet unearths his musical soul while severing ties with the woman he loves after her stunning betrayal.',
 'At the heart of Poet is a tale of star-crossed lovers and their struggle with unforeseen success and disillusionment, in an attempt to rediscover lasting harmony.',
 'Unknowingly, in writing this ballad of liberation, he will soon evolve as one of the fastest rising stars on the pop music landscape.',
 'Here we meet Johnny Elias, a college student from Brooklyn with boundless adoration for two things in life: timeless popular music, and the heart of a sweet, complicated young woman who is clearly out of his league.',
 'But in her callous disregard, she inadvertently sets him on a determined course to his improbable musical destiny - while sending her own daughter spiraling down a path of devastation.']