In [1]:
import numpy as np
import pandas as pd

# To process embeddings
import tensorflow_hub as hub
from langdetect import detect
from pathlib import Path
from nltk.tokenize import sent_tokenize

# To create recommendations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# To create sentence clusters
from sklearn.cluster import KMeans

# To load saved embeddings
import joblib

# To match strings
import re

# To create webapp
import psutil
import streamlit as st
st.set_page_config(page_title='3380 Books',
                  layout="wide",
                  page_icon= ':books:')
from streamlit import caching


# To sort final recommendation list
from collections import Counter

def dataLoader(datapath, books_file, reviews_file, reviewsAll_file):
    '''
    Loads DataFrames with books and review information
    '''
    books = pd.read_csv(datapath + books_file).drop('Unnamed: 0', axis=1).fillna('')
    reviews = pd.read_csv(datapath + reviews_file).drop('Unnamed: 0', axis=1)
    reviewsAll = pd.read_csv(datapath + reviewsAll_file).drop('Unnamed: 0', axis=1)
    return books, reviews, reviewsAll

def loadEmbeddings():
    '''
    Loads pre-trained sentence and review arrays
    '''
    # Path to USE
    embed = hub.load('/media/einhard/Seagate Expansion Drive/3380_data/data/tensorflow_hub/universal-sentence-encoder_4')

    # Load pre-trained sentence arrays
    ## Reviews array is a set of embeddings trained on review lengths of < 90 characters
    reviews_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/reviewEmbeddings.pkl')
    ## Descriptions array is a set of embeddings trained on all book descriptions
    descriptions_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionEmbeddings.pkl')

    return embed, reviews_array, descriptions_array

def embedInputs(books_df, review_df, search_param, review_max_len, searchTitle=True):
    '''
    Converts input reviews into USE arrays. Returns vectorized reviews for the book that was
    passed as bookTitle.

    Args:
        search_param = List of book titles or author names whose reviews we want to embed. For authors, see 'searchTitle' argument
        books_df = DataFrame with book_id information
        review_df = DataFrame with book_id and review text
        searchTitle = If True, will search for book_id based on title of a book. If False, it will look for author names to find book_id.
    '''
    if searchTitle:
        #Finds book_id from title of book
        input_book_id = books_df[books_df.title.isin([search_param])].book_id.tolist()
    else:
        # Finds book_id from author name
        input_book_id = books_df[books_df.name.isin([search_param])].book_id.tolist()

    # Finds reviews for specified book
    input_sentences = review_df[review_df.book_id.isin(input_book_id)].review_text

    # Filters review length
    input_sentences = input_sentences[input_sentences.str.len() <= review_max_len]

    # Converts reviews into 512-dimensional arrays
    input_vectors = embed(input_sentences)

    # Returns reviews and vectorized reviews for a particular book
    return input_sentences, input_vectors

def getClusters(input_vectors, n_clusters):
    '''
    Creates KMeans instance and fits model.

    The for nested for loop is used to display the returned sentences on Streamlit.

    Args:
        input_sentences =  Sentences to compare
        n_clusters = How many clusters to generate
    '''
    tsne = TSNE(n_components=3, verbose=1, perplexity=80, n_iter=5000, learning_rate=200)
    TSNE_transformed = tsne.fit_transform(input_vectors)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, algorithm='full')
    return kmeans.fit(TSNE_transformed)


def showClusters(input_sentences, input_vectors, authorTitle, n_clusters, n_results, model, searchTitle=True):
    '''
    This function will find theme clusters in the reviews of a particular book or set of sentences.
    Uses cluster centers to find semantically similar sentences to the input vectors.

    The nested for loop is used to display the returned sentences on Streamlit.

    Args:
        input_sentences =  Sentences to compare
        input_vectors = USE Array generated by embedding input sentences
        authorTitle = Title of book in question, or name of author --> Used to display header only
        n_clusters = How many clusters to generate
        n_results = How many sentences to display per n_cluster
        model = The model used to create the clusters.
    '''
    if searchTitle:
        # Displays which book's reviews are being clustered
        print(f'Opinion clusters about *{authorTitle}*')
    else:
        print(f'Opinion clusters about {authorTitle}\'s books')

    # Iterates through centroids to and computes inner products to find nlargest
    for i in range(n_clusters):
        centre = model.cluster_centers_[i]
        inner_product = np.inner(centre, input_vectors)
        indices = pd.Series(inner_product).nlargest(n_results).index
        clusteredInputs = list(input_sentences.iloc[indices])

        # Prints reviews that are closest to centroid
    
        print(f'**Cluster #{i+1}**')
        for sentence in clusteredInputs:
            print(sentence)
        return indices, inner_product

def cleanAndTokenize(df, filepath, searchTitle, author):
    if searchTitle:
        if Path(filepath + 'app_data/book_id/' + df.book_id.iloc[1].astype(str) + '.csv').is_file():
            sentences_df = pd.read_csv(filepath + 'app_data/book_id/' + df.book_id.iloc[1].astype(str) + '.csv').drop('Unnamed: 0', axis=1)
        else:
            reviews_df = clean_reviews(df)
            sentences_df =  make_sentences(reviews_df)
            sentences_df.to_csv(filepath + 'app_data/book_id/' + df.book_id.iloc[1].astype(str) + '.csv')
        sentences_df.book_id = sentences_df.book_id.astype(int)
        return sentences_df

    else:
        if Path(filepath + 'app_data/author/' + author + '.csv').is_file():
            sentences_df = pd.read_csv(filepath + 'app_data/author/' + author + '.csv').drop('Unnamed: 0', axis=1)
        else:
            reviews_df = clean_reviews(df)
            sentences_df =  make_sentences(reviews_df)
            sentences_df.to_csv(filepath + 'app_data/author/' + author + '.csv')
        sentences_df.book_id = sentences_df.book_id.astype(int)

        return sentences_df
def clean_reviews(df):
    '''
    Copyright (c) 2020 Willie Costello
    '''
    # Drop duplicates
    df.drop_duplicates(inplace=True)

    # Define spoiler marker & remove from all reviews
    spoiler_str_ucsd = '\*\* spoiler alert \*\* \n'
    df['review_text'] = df['review_text'].str.replace(spoiler_str_ucsd, '')

    # Replace all new line characters
    df['review_text'] = df['review_text'].str.replace('\n', ' ')

    # Append space to all sentence end characters
    df['review_text'] = df['review_text'].str.replace('.', '. ').replace('!', '! ').replace('?', '? ')

    # Initialize dataframe to store English-language reviews
    reviews_df = pd.DataFrame()

    # Loop through each row in dataframe
    for i in range(len(df)):

        # Save review to variable
        review = df.iloc[i]['review_text']

        # Check if review is English
        if detect(review) == 'en':
            # If so, add row to English-language dataframe
            reviews_df = reviews_df.append(df.iloc[i, :])

    reviews_df.book_id = reviews_df.book_id.astype(int)
    return reviews_df


def make_sentences(reviews_df):
    '''
    Copyright (c) 2020 Willie Costello
    '''
    # Initialize dataframe to store review sentences, and counter
    sentences_df = pd.DataFrame()
    ctr = 0


    # Loop through each review
    for i in range(len(reviews_df)):

        # Save row and review to variables
        row = reviews_df.iloc[i]
        review = row.loc['review_text']

        # Tokenize review into sentences
        sentences = sent_tokenize(review)

        # Loop through each sentence in list of tokenized sentences
        for sentence in sentences:
            # Add row for sentence to sentences dataframe
            new_row = row.copy()
            new_row.at['review_text'] = sentence
            sentences_df = sentences_df.append(new_row, ignore_index=True)
        ctr += 1
    sentences_df = sentences_df[(sentences_df.review_text.str.len() >= 20) & (sentences_df.review_text.str.len() <= 350)]
    return sentences_df

def embedInputs(books_df, review_df, search_param, review_max_len, searchTitle=True):
    '''
    Converts input reviews into USE arrays. Returns vectorized reviews for the book that was
    passed as bookTitle.

    Args:
        search_param = List of book titles or author names whose reviews we want to embed. For authors, see 'searchTitle' argument
        books_df = DataFrame with book_id information
        review_df = DataFrame with book_id and review text
        searchTitle = If True, will search for book_id based on title of a book. If False, it will look for author names to find book_id.
    '''
    if searchTitle:
        #Finds book_id from title of book
        input_book_id = books_df[books_df.title.isin([search_param])].book_id.tolist()
        author_name = 'VariousAuthors/'
    else:
        # Finds book_id from author name
        input_book_id = books_df[books_df.name.isin([search_param])].book_id.tolist()
        author_name = books_df[books_df.name.isin([search_param])].name.iloc[0]

    # Finds reviews for specified book
    # input_sentences = review_df[review_df.book_id.isin(input_book_id)].review_text
    input_sentences = cleanAndTokenize(review_df[review_df.book_id.isin(input_book_id)], tokenizedData, searchTitle=searchTitle, author=author_name).review_text

    # Filters review length
    input_sentences = input_sentences[input_sentences.str.len() <= review_max_len]

    # Converts reviews into 512-dimensional arrays
    input_vectors = embed(input_sentences)

    # Returns reviews and vectorized reviews for a particular book
    return input_sentences, input_vectors

In [40]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [41]:
model = getClusters(input_vectors, 8)
indices, inner_product = showClusters(input_sentences, input_vectors, 'Children of Time', 8, 8, model, searchTitle=True)

[t-SNE] Computing 141 nearest neighbors...
[t-SNE] Indexed 142 samples in 0.001s...
[t-SNE] Computed neighbors for 142 samples in 0.007s...
[t-SNE] Computed conditional probabilities for sample 142 / 142
[t-SNE] Mean sigma: 0.474998
[t-SNE] KL divergence after 250 iterations with early exaggeration: 131.341644
[t-SNE] KL divergence after 5000 iterations: 1.313684
Opinion clusters about *Children of Time*


ValueError: shapes (3,) and (512,142) not aligned: 3 (dim 0) != 512 (dim 0)

In [52]:
inner_product.argmin()

287

In [2]:
# Paths to books and reviews DataFrames
datapath = '/media/einhard/Seagate Expansion Drive/3380_data/data/'

# Stores tokenized reviews so they only need to be processed the first time that particular book is called
tokenizedData = '/media/einhard/Seagate Expansion Drive/3380_data/data/'
books_file = 'Filtered books/clean_filtered_books.csv'
reviews_file = 'Filtered books/clean_filtered_reviews.csv'
reviewsAll_file = 'Filtered books/reviews_for_cluster.csv'

# Loading DataFrames
books, reviews, reviewsAll = dataLoader(datapath, books_file, reviews_file, reviewsAll_file)

# Loadding pre-trained embeddings and embedder for input sentences
embed, reviews_array, descriptions_array = loadEmbeddings()

# Setting base URL for Goodreads
goodreadsURL = 'https://www.goodreads.com/book/show/'

2020-12-01 02:27:05.991 INFO    absl: resolver HttpCompressedFileResolver does not support the provided handle.
2020-12-01 02:27:05.992 INFO    absl: resolver GcsCompressedFileResolver does not support the provided handle.
2020-12-01 02:27:05.992 INFO    absl: resolver HttpUncompressedFileResolver does not support the provided handle.


In [6]:
books.sample(1).title.tolist()

['Hope for the Flowers']

In [4]:
descriptions_array.shape

(12362, 512)

In [9]:
books.description

0        "It's not that I don't love you, and my tears ...
1        From #1 New York Times bestselling author Bran...
2                                                         
3        The Moonbeam Award Gold Medal Winner in the re...
4        Dark schemes unfold--and Sophie's loyalty is p...
                               ...                        
12357                                                     
12358    Press 1 for technical support.  Press 2 for br...
12359    Julie Powell thought cooking her way through J...
12360    Another biting satire from Emma McLaughlin and...
12361    Avant que nous allions plus loin, mettons les ...
Name: description, Length: 12362, dtype: object

In [3]:
def clean_reviews(df):
    '''
    Copyright (c) 2020 Willie Costello
    '''
    # Drop duplicates
    df.drop_duplicates(inplace=True)

    # Define spoiler marker & remove from all reviews
    spoiler_str_ucsd = '\*\* spoiler alert \*\* \n'
    df['semanticSearch'] = df['semanticSearch'].str.replace(spoiler_str_ucsd, '')

    # Replace all new line characters
    df['semanticSearch'] = df['semanticSearch'].str.replace('\n', ' ')

    # Append space to all sentence end characters
    df['semanticSearch'] = df['semanticSearch'].str.replace('.', '. ').replace('!', '! ').replace('?', '? ')

    # Initialize dataframe to store English-language reviews
    semanticSearch_df = pd.DataFrame()

    # Loop through each row in dataframe
    for i in range(len(df)):

        # Save review to variable
        semanticSearch = df.iloc[i]['semanticSearch']

        # Check if review is English
        if detect(semanticSearch) == 'en':
            # If so, add row to English-language dataframe
            semanticSearch_df = semanticSearch_df.append(df.iloc[i, :])

    semanticSearch_df.book_id = semanticSearch_df.book_id.astype(int)
    return semanticSearch_df


def make_sentences(reviews_df):

    # Initialize dataframe to store review sentences, and counter
    sentences_df = pd.DataFrame()
    ctr = 0

    print(f'Starting tokenization')

    # Loop through each review
    for i in range(len(reviews_df)):

        # Save row and review to variables
        row = reviews_df.iloc[i]
        review = row.loc['semanticSearch']

        # Tokenize review into sentences
        sentences = sent_tokenize(review)

        # Loop through each sentence in list of tokenized sentences
        for sentence in sentences:
            # Add row for sentence to sentences dataframe
            new_row = row.copy()
            new_row.at['semanticSearch'] = sentence
            sentences_df = sentences_df.append(new_row, ignore_index=True)

        ctr += 1
        if (ctr % 500 == 0):
            print(f'{ctr} reviews tokenized')

    print(f'Tokenization complete: {len(sentences_df)} sentences tokenized\n')

    # Rename review column
    sentences_df.rename(columns={'review':'sentence'}, inplace=True)

    return sentences_df

In [4]:
books['semanticSearch'] = books.title + ". " + books.description + ". " + books.name

In [5]:
descriptionsClean = clean_reviews(books)

In [6]:
descriptionsClean

Unnamed: 0,book_id,description,name,semanticSearch,title,weighted_score
0,31675691,"""It's not that I don't love you, and my tears ...",Lonnie Ostrow,"Poet Of The Wrong Generation. ""It's not that ...",Poet Of The Wrong Generation,4.805233
1,17332218,From #1 New York Times bestselling author Bran...,Brandon Sanderson,"Words of Radiance (The Stormlight Archive, #2)...","Words of Radiance (The Stormlight Archive, #2)",4.769454
2,95602,,Francine Rivers,Mark of the Lion Trilogy. . Francine Rivers,Mark of the Lion Trilogy,4.755426
3,165068,The Moonbeam Award Gold Medal Winner in the re...,Sally Lloyd-Jones,The Jesus Storybook Bible: Every Story Whisper...,The Jesus Storybook Bible: Every Story Whisper...,4.743188
4,27272698,Dark schemes unfold--and Sophie's loyalty is p...,Shannon Messenger,"Lodestar (Keeper of the Lost Cities, #5). Dar...","Lodestar (Keeper of the Lost Cities, #5)",4.742152
...,...,...,...,...,...,...
12356,9817,It is the morning after the Academy Awards. Ma...,Jane Smiley,Ten Days in the Hills. It is the morning afte...,Ten Days in the Hills,2.547730
12357,6017367,,Christopher Ransom,The Birthing House. . Christopher Ransom,The Birthing House,2.476325
12358,105578,Press 1 for technical support.\nPress 2 for br...,Chetan Bhagat,One Night at the Call Center. Press 1 for tec...,One Night at the Call Center,2.471741
12359,6072179,Julie Powell thought cooking her way through J...,Julie Powell,"Cleaving: A Story of Marriage, Meat, and Obses...","Cleaving: A Story of Marriage, Meat, and Obses...",2.440292


In [7]:
description_sentences = make_sentences(descriptionsClean)

Starting tokenization
500 reviews tokenized
1000 reviews tokenized
1500 reviews tokenized
2000 reviews tokenized
2500 reviews tokenized
3000 reviews tokenized
3500 reviews tokenized
4000 reviews tokenized
4500 reviews tokenized
5000 reviews tokenized
5500 reviews tokenized
6000 reviews tokenized
6500 reviews tokenized
7000 reviews tokenized
7500 reviews tokenized
8000 reviews tokenized
8500 reviews tokenized
9000 reviews tokenized
9500 reviews tokenized
10000 reviews tokenized
10500 reviews tokenized
11000 reviews tokenized
11500 reviews tokenized
12000 reviews tokenized
Tokenization complete: 155710 sentences tokenized



In [8]:
import pandas as pd
description_sentences.to_csv('/media/einhard/Seagate Expansion Drive/3380_data/data/Filtered books/description_sentences.csv')

In [14]:
descriptions_array = embed(description_sentences.semanticSearch)

In [13]:
description_sentences

Unnamed: 0,book_id,description,name,semanticSearch,title,weighted_score
0,31675691.0,"""It's not that I don't love you, and my tears ...",Lonnie Ostrow,Poet Of The Wrong Generation.,Poet Of The Wrong Generation,4.805233
1,31675691.0,"""It's not that I don't love you, and my tears ...",Lonnie Ostrow,"""It's not that I don't love you, and my tears ...",Poet Of The Wrong Generation,4.805233
2,31675691.0,"""It's not that I don't love you, and my tears ...",Lonnie Ostrow,But you can't go back and forth forever and we...,Poet Of The Wrong Generation,4.805233
3,31675691.0,"""It's not that I don't love you, and my tears ...",Lonnie Ostrow,"Through these words, a young poet unearths his...",Poet Of The Wrong Generation,4.805233
4,31675691.0,"""It's not that I don't love you, and my tears ...",Lonnie Ostrow,"Unknowingly, in writing this ballad of liberat...",Poet Of The Wrong Generation,4.805233
...,...,...,...,...,...,...
155705,33993.0,Another biting satire from Emma McLaughlin and...,Emma McLaughlin,whatever that may be.,Citizen Girl,2.404365
155706,33993.0,Another biting satire from Emma McLaughlin and...,Emma McLaughlin,"Sharply observed and devastatingly funny, Citi...",Citizen Girl,2.404365
155707,33993.0,Another biting satire from Emma McLaughlin and...,Emma McLaughlin,"A personal glimpse into an impersonal world, C...",Citizen Girl,2.404365
155708,33993.0,Another biting satire from Emma McLaughlin and...,Emma McLaughlin,.,Citizen Girl,2.404365


In [None]:
import joblib

In [16]:
descriptions_array.shape

TensorShape([155710, 512])

In [17]:
descriptions_array = np.asarray(descriptions_array)
joblib.dump(descriptions_array, '/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionsTokeniziedEmbedding.pkl')

['/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionsTokeniziedEmbedding.pkl']

In [28]:
descriptions_array

array([[ 0.00887228, -0.01839503, -0.01641959, ...,  0.01465133,
        -0.05207071,  0.00717109],
       [-0.00365145, -0.04039939, -0.0474895 , ...,  0.01138781,
        -0.00453208,  0.01978063],
       [ 0.04340677,  0.0641699 ,  0.0789528 , ..., -0.03263206,
        -0.00285275,  0.06097573],
       ...,
       [-0.02484386,  0.00409283, -0.00358204, ...,  0.00670385,
         0.02625612,  0.00314738],
       [-0.00239373, -0.07757329, -0.03650055, ..., -0.03009087,
        -0.06383237,  0.04501955],
       [-0.02961154,  0.02664398, -0.01841117, ..., -0.05961334,
         0.0425594 ,  0.03041534]], dtype=float32)

In [35]:
joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionsTokeniziedEmbedding.pkl')

array([[ 0.00887228, -0.01839503, -0.01641959, ...,  0.01465133,
        -0.05207071,  0.00717109],
       [-0.00365145, -0.04039939, -0.0474895 , ...,  0.01138781,
        -0.00453208,  0.01978063],
       [ 0.04340677,  0.0641699 ,  0.0789528 , ..., -0.03263206,
        -0.00285275,  0.06097573],
       ...,
       [-0.02484386,  0.00409283, -0.00358204, ...,  0.00670385,
         0.02625612,  0.00314738],
       [-0.00239373, -0.07757329, -0.03650055, ..., -0.03009087,
        -0.06383237,  0.04501955],
       [-0.02961154,  0.02664398, -0.01841117, ..., -0.05961334,
         0.0425594 ,  0.03041534]], dtype=float32)

In [220]:
def semanticSearch(input_text):
    query = embed(['Dune'])
    result = np.inner(descriptions_array, query)
    iloc_desc = pd.DataFrame(result).sort_values(0, ascending=False).reset_index().rename({'index':'position'}, axis=1)['position'].tolist()
    # iloc_books = description_sentences.iloc[iloc_desc][:10].book_id.tolist()
    book_title = description_sentences.iloc[iloc_desc][:10].title.tolist()
    return book_title

Unnamed: 0,title,description,book_id,weighted_score,name
2950,"Star Wars: Jedi Academy (Jedi Academy, #1)",New York Times bestselling author/illustrator ...,17265216,4.207628,Jeffrey Brown
3023,Dune (Dune Chronicles #1),This book was mistakenly published under . ...,234225,4.199971,Frank Herbert
3247,"The Rise of Nine (Lorien Legacies, #3)",The third book of the #1 New York Timesbestsel...,12971616,4.189812,Pittacus Lore
8182,"The Revolution Was Televised: The Cops, Crooks...","A mob boss in therapy. An experimental, viol...",16137527,3.98886,Alan Sepinwall
8742,"Velvet (Velvet, #1)","Equal parts steamy and funny, with a few genui...",22561320,3.968322,Temple West
10352,Chapterhouse: Dune (Dune Chronicles #6),"The desert planet Arrakis, called Dune, has be...",105,3.890035,Frank Herbert
10962,Emperor Mollusk versus The Sinister Brain,Emperor Mollusk. Intergalactic Menace. Des...,10979852,3.791754,A. Lee Martinez
11372,"The Dark Hills Divide (The Land of Elyon, #1)",The Dark Hills Divideintroduces readers to Ale...,81524,3.720498,Patrick Carman
11463,House Atreides (Prelude to Dune #1),Frank Herbert's award-winning Dunechronicles c...,761575,3.700633,Brian Herbert
12308,Aftermath (Star Wars: Aftermath #1),Journey to The Force Awakens. The second Dea...,25134015,3.178462,Chuck Wendig


In [226]:
books

Unnamed: 0,title,description,book_id,weighted_score,name,semanticSearch
0,Poet Of The Wrong Generation,"""It's not that I don't love you, and my tears ...",31675691,4.805233,Lonnie Ostrow,"Poet Of The Wrong Generation ""It's not that I ..."
1,"Words of Radiance (The Stormlight Archive, #2)",From #1 New York Times bestselling author Bran...,17332218,4.769454,Brandon Sanderson,"Words of Radiance (The Stormlight Archive, #2)..."
2,Mark of the Lion Trilogy,en ingles la maquina no funciona,95602,4.755426,Francine Rivers,Mark of the Lion Trilogy en ingles la maquina ...
3,The Jesus Storybook Bible: Every Story Whisper...,The Moonbeam Award Gold Medal Winner in the re...,165068,4.743188,Sally Lloyd-Jones,The Jesus Storybook Bible: Every Story Whisper...
4,"Lodestar (Keeper of the Lost Cities, #5)",Dark schemes unfold--and Sophie's loyalty is p...,27272698,4.742152,Shannon Messenger,"Lodestar (Keeper of the Lost Cities, #5) Dark ..."
...,...,...,...,...,...,...
12357,The Birthing House,en ingles la maquina no funciona,6017367,2.476325,Christopher Ransom,The Birthing House en ingles la maquina no fun...
12358,One Night at the Call Center,Press 1 for technical support. Press 2 for b...,105578,2.471741,Chetan Bhagat,One Night at the Call Center Press 1 for techn...
12359,"Cleaving: A Story of Marriage, Meat, and Obses...",Julie Powell thought cooking her way through J...,6072179,2.440292,Julie Powell,"Cleaving: A Story of Marriage, Meat, and Obses..."
12360,Citizen Girl,Another biting satire from Emma McLaughlin and...,33993,2.404365,Emma McLaughlin,Citizen Girl Another biting satire from Emma M...


In [214]:
iloc_books

[105.0,
 104.0,
 53732.0,
 6736971.0,
 7031835.0,
 77504.0,
 24358527.0,
 2836109.0,
 24819482.0,
 28483931.0]

TypeError: 'list' object is not callable

In [151]:
test = description_sentences.copy()

In [153]:
test.merge(pd.DataFrame(result.T).rename({0:'sim_score'}, axis=1), left_index=True, right_index=True).sort_values('sim_score', ascending=False).description.tolist()

['I will show you then that I have always been there.',
 'They plan to share the occasion together at a summerhouse in Maine, but none of them expects the gift that awaits them there - a chance to turn their "what-might-have-beens" into reality.',
 'Yet Malachy--exasperating, irresponsible, and beguiling--does nurture in Frank an appetite for the one thing he can provide: a story.',
 'An enquiring email from a potential client finds him intrigued about the shy, awkward woman.',
 'Complete your Divergent library with the Four stories!',
 "In Darmik's quest to hunt down and kill the threat, he discovers that nothing is as it seems.",
 'and so begins this epic tale of treachery and adventure, sword fighting and magic, myth and legend.',
 "'Every second of every minute of every hour of every day.",
 'Once she qualified to train a dog of her own, she adopted Puzzle, a strong, bright Golden Retriever puppy who exhibited unique aptitudes as a working dog but who was less interested in the rol

In [131]:
books[books.book_id == 88459].description.tolist()

["Justice Lonesome has enjoyed a life of bounty.   Even so, she's inherited the curse of the Lonesome.   A poet's soul.   Which means she's still searching for something.   Searching for peace.   Searching for the less.  .  .  that's more.   And when the foundation of her life is pulled out from under her, grieving, she goes to the mountains to find her oasis.   She hits Carnal, Colorado and decides to stay.   Deke Hightower lost everything at the age of two.   He lost it again at fifteen.   His life has not been about bounty.   It's been about learning to live with less, because there's no way to get more.   Deke's also watched all his friends go down to the women who gave them what they needed.   He wants that for himself.   But he knows that search isn't going to be easy because he's a rider.   His home is the road.   That's the only place he can breathe.   And the woman who takes her place at his side has to do it sitting on the back of his bike.   When Deke meets Justice, he knows

In [51]:
df = pd.DataFrame(columns=['A'])

for i in range(5):

    df = df.append({'A': i}, ignore_index=True)

# Testing full pipeline

In [7]:
import tensorflow_hub as hub
from langdetect import detect
from pathlib import Path
from nltk.tokenize import sent_tokenize

In [4]:
input_text = 'Children of Time'
book_title = books[books.title.str.contains(input_text, case=False)].title.tolist()
input_book_id = books[books.title.isin(book_title)].book_id.tolist()

In [5]:
input_book_id

[25499718]

In [6]:
reviewsAll[reviewsAll.book_id.isin([input_book_id])]



input_sentences = cleanAndTokenize(reviewsAll[reviewsAll.book_id.isin(input_book_id)], tokenizedData, searchTitle=True, author='Adrian Tchaikovsky').review_text

# Converts reviews into 512-dimensional arrays
input_vectors = embed(input_sentences)
# Returns reviews and vectorized reviews for a particular book

In [21]:
input_sentences

0                                 Really liked this one.
1                Fantastic and original science fiction.
2                            Setting the stage for more.
3                            I really enjoyed this book.
4      I read a lot of sci-fi and this was up there w...
                             ...                        
137    Just about the right balance between science a...
138    Yesterday I had to break a tiny web that had a...
139                    "I'm sorry, Portia," I whispered.
140    I thought the book dragged a bit in the middle...
141         It just pushed all the right buttons for me.
Name: review_text, Length: 142, dtype: object

In [22]:
input_vectors

<tf.Tensor: shape=(142, 512), dtype=float32, numpy=
array([[-0.03793607,  0.04623649, -0.00505508, ..., -0.04547682,
         0.00827182, -0.00052418],
       [-0.04669672,  0.03274052, -0.07346593, ..., -0.04710089,
         0.09411614,  0.04661921],
       [ 0.02117739, -0.02780496,  0.01408575, ..., -0.00671999,
         0.07057461,  0.01192718],
       ...,
       [-0.02475809,  0.023139  ,  0.006002  , ...,  0.00268129,
         0.08098436,  0.0375176 ],
       [-0.04929401, -0.05269561,  0.05049564, ..., -0.03259405,
         0.11280895,  0.03305841],
       [-0.0424552 ,  0.00778071,  0.0133794 , ..., -0.00455391,
        -0.00720548, -0.03733094]], dtype=float32)>

Opinion clusters about *Children of Time*
EUCLEDIAN DISTANCES 
 [[0.97986721 0.6898791  1.17845609 ... 1.13109286 0.93589187 1.15141068]
 [1.06466306 1.05215896 1.15893397 ... 1.10636515 1.01618467 1.10651268]
 [0.81628725 0.96221752 1.19189276 ... 1.15539174 0.95476636 1.13788915]
 ...
 [0.95494688 1.01326106 1.16443593 ... 1.15406938 0.87793586 1.14085719]
 [1.05721386 0.80401717 1.07956127 ... 1.11902202 1.06704623 1.12727826]
 [0.91293835 0.97519208 1.03292782 ... 1.05985123 0.82227564 0.91017246]]


Exception: Data must be 1-dimensional

In [25]:
input_sentences, input_vectors = embedInputs(books,
                                            reviewsAll,
                                            search_param=info.title.tolist()[0],
                                            review_max_len=review_max_len,
                                            searchTitle=True)

NameError: name 'info' is not defined

In [27]:
from sklearn.metrics.pairwise import euclidean_distances

# Tokenizing reviews into sentences

In [3]:
from nltk.tokenize import sent_tokenizea

In [15]:
search_param = books[books.title == 'Children of Time'].name.tolist()[0]
books[books.name.isin([search_param])].name.iloc[0]

'Adrian Tchaikovsky'

In [5]:
books[books.title.str.contains("Harry Potter and the Sorcerer's Stone")]

Unnamed: 0,title,description,book_id,weighted_score,name
309,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter's life is miserable. His parents ...,3,4.449994,J.K. Rowling
329,Harry Potter and the Sorcerer's Stone (Harry P...,An alternative cover for this ASIN can be foun...,28132722,4.448603,J.K. Rowling
342,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter thinks he is an ordinary boy - un...,13562891,4.446682,J.K. Rowling


In [8]:
reviewsAll[reviewsAll.book_id.isin([3])]

Unnamed: 0,book_id,review_text
169,3,Tuve el gusto de leerlo antes de que saliera l...
214,3,I remember trying 3 times to read this but I a...
1246,3,this was not the best harry potter book
1793,3,"I love the Harry Potter series, even though th..."
4229,3,One of the best and most magical children's bo...
...,...,...
1391054,3,decent wizarding book sequels are better
1391095,3,"Just started this, but a little hesitant to ge..."
1391392,3,Good...not as exciting as the others though...
1391659,3,The start of Harry's adventure in Hogwarts. My...


# Keeping only English

In [92]:
from langdetect import detect
from pathlib import Path

In [21]:
def clean_reviews(df):

    # Read in CSV as dataframe
    length_orig = len(df)

    # Drop duplicates
    df.drop_duplicates(inplace=True)
    num_dups = length_orig - len(df)

    print(f'Read in {length_orig} reviews, dropping {num_dups} duplicates\n')

    # Define spoiler marker & remove from all reviews
    spoiler_str_ucsd = '\*\* spoiler alert \*\* \n'
    df['review_text'] = df['review_text'].str.replace(spoiler_str_ucsd, '')

    # Replace all new line characters
    df['review_text'] = df['review_text'].str.replace('\n', ' ')

    # Append space to all sentence end characters
    df['review_text'] = df['review_text'].str.replace('.', '. ').replace('!', '! ').replace('?', '? ')

    # Initialize dataframe to store English-language reviews
    reviews_df = pd.DataFrame()
    # Initialize counter for dropped reviews
    drop_ctr = 0

    # Loop through each row in dataframe
    for i in range(len(df)):

        # Save review to variable
        review = df.iloc[i]['review_text']

        # Check if review is English
        try:
            if detect(review) == 'en':
                # If so, add row to English-language dataframe
                reviews_df = reviews_df.append(df.iloc[i, :])
            else:
                # If not, add 1 to dropped review counter
                drop_ctr += 1
        # If check fails, add 1 to dropped review counter
        except:
            drop_ctr += 1

    reviews_df.book_id = reviews_df.book_id.astype(int)

    print(f'Dropped {drop_ctr} non-English reviews. '
          f'{len(reviews_df)} reviews remain.\n')

    return reviews_df

In [67]:
clean_reviews(reviewsAll[reviewsAll.book_id.isin([3])], 30, 100)

Read in 3562 reviews, dropping 0 duplicates

Dropped 189 non-English reviews. 1480 reviews remain.



Unnamed: 0,book_id,review_text
1246,3,this was not the best harry potter book
5361,3,The beginning of an inspiring literary traditi...
5573,3,So much love for JK Rowling and her harry pott...
6964,3,"An excellent book, great for kids, not sure I'..."
7329,3,Love it as well as all the Harry Potter series.
...,...,...
1389325,3,I love the Harry Potter series.
1390184,3,I enjoyed Reading this book again. Will conti...
1390717,3,a beautiful adventure wanna be with harry po...
1390777,3,Sums up a child's experiences going into the r...


In [22]:
def make_sentences(reviews_df):
    '''
    Copyright (c) 2020 Willie Costello
    '''
    # Initialize dataframe to store review sentences, and counter
    sentences_df = pd.DataFrame()
    ctr = 0

    print(f'Starting tokenization')

    # Loop through each review
    for i in range(len(reviews_df)):

        # Save row and review to variables
        row = reviews_df.iloc[i]
        review = row.loc['review_text']

        # Tokenize review into sentences
        sentences = sent_tokenize(review)

        # Loop through each sentence in list of tokenized sentences
        for sentence in sentences:
            # Add row for sentence to sentences dataframe
            new_row = row.copy()
            new_row.at['review_text'] = sentence
            sentences_df = sentences_df.append(new_row, ignore_index=True)

        ctr += 1
        if (ctr % 500 == 0):
            print(f'{ctr} reviews tokenized')

    sentences_df = sentences_df[(sentences_df.review_text.str.len() > 20) & (sentences_df.review_text.str.len() < 160)]
    print(f'Tokenization complete: {len(sentences_df)} sentences tokenized\n')

    return sentences_df

In [23]:
tokenizedData = '/media/einhard/Seagate Expansion Drive/3380_data/data/app_data/'
def cleanAndTokenize(df, filepath):
    if Path(filepath + df.book_id.iloc[1].astype(str) + '.csv').is_file():
        sentences_df = pd.read_csv(filepath + df.book_id.iloc[1].astype(str) + '.csv').drop('Unnamed: 0', axis=1)
    else:
        reviews_df = clean_reviews(df)
        sentences_df =  make_sentences(reviews_df)
        sentences_df.to_csv(filepath + df.book_id.iloc[1].astype(str) + '.csv')
    sentences_df.book_id = sentences_df.book_id.astype(int)
    return sentences_df


In [105]:
cleanAndTokenize(reviewsAll[reviewsAll.book_id.isin([3])], filepath)

Unnamed: 0,book_id,review_text
0,3,I remember trying 3 times to read this but I a...
1,3,So I skipped to book 2 and then read the whole...
2,3,It's hard to review this book without taking i...
3,3,this was not the best harry potter book
4,3,"I love the Harry Potter series, even though th..."
...,...,...
6915,3,I love the movies and am looking forward to ge...
6916,3,Twi-hard/Potterhead in the making baby!
6917,3,not as exciting as the others though.
6918,3,The start of Harry's adventure in Hogwarts.


In [76]:
import time
startTime = time.time()
reviewsCleaned = clean_reviews(reviewsAll[reviewsAll.book_id.isin([3])])
reviewsTokenized = make_sentences(reviewsCleaned, 20, 160)


reviewsEmbedded = embed(reviewsTokenized.review_text)
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Read in 3562 reviews, dropping 0 duplicates

Dropped 187 non-English reviews. 3375 reviews remain.

Starting tokenization
500 reviews tokenized
1000 reviews tokenized
1500 reviews tokenized
2000 reviews tokenized
2500 reviews tokenized
3000 reviews tokenized
Tokenization complete: 7802 sentences tokenized

Execution time in seconds: 39.682156562805176


In [88]:
print(filepath + reviewsAll[reviewsAll.book_id.isin([3])].book_id.iloc[1].astype(str) + '.csv')

/media/einhard/Seagate Expansion Drive/3380_data/data/app_data/3.csv


In [55]:
reviewsTokenized.book_id = reviewsTokenized.book_id.astype(int)

In [87]:
reviewsAll[reviewsAll.book_id.isin([3])].book_id.iloc[1].astype(str)

'3'

["Harry Potter and the Sorcerer's Stone (Harry Potter, #1)"]

In [49]:
startTime = time.time()
embed(reviewsTokenized.review_text)
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Execution time in seconds: 0.8115620613098145


# Testing spaCy

In [46]:
from spacy.pipeline import Sentencizer

In [45]:
reviewsTokenized

Unnamed: 0,book_id,review_text
0,3.0,Tuve el gusto de leerlo antes de que saliera l...
1,3.0,I remember trying 3 times to read this but I a...
2,3.0,So I skipped to book 2 and then read the whole...
3,3.0,It's hard to review this book without taking i...
4,3.0,this was not the best harry potter book
...,...,...
8625,3.0,Good...not as exciting as the others though...
8626,3.0,The start of Harry's adventure in Hogwarts.
8627,3.0,My most favorite book in the series!
8628,3.0,WINGARDIUM LEVIOSA!


In [None]:
reviewsTokenized.to_csv('/media/einhard/Seagate Expansion Drive/3380_data/data/tokenized sentences/reviewsTokenized.csv')

# Experimental

In [3]:
import numpy as np
import pandas as pd

# To process embeddings
import tensorflow_hub as hub
from langdetect import detect
from pathlib import Path
from nltk.tokenize import sent_tokenize

# To create recommendations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# To create sentence clusters
from sklearn.cluster import KMeans

# To load saved embeddings
import joblib

# To match strings
import re

# To create webapp
import psutil

In [4]:
# Experimental
def findSemanticallySimilarReviews(query,reviews, books, sentence_array,  n_books):
    # Create vector from query and compare with global embedding
    sentence = [query]
    sentence_vector = np.array(embed(sentence))
    inner_product = np.inner(sentence_vector, sentence_array)[0]

    # Find sentences with highest inner products
    top_n_sentences = pd.Series(inner_product).nlargest(n_books+1)
    top_n_indices = top_n_sentences.index.tolist()
    review_index = reviews.iloc[top_n_indices].index

    books_beta = books[books.book_id.isin(reviews.iloc[review_index].book_id)].sort_values('book_id')
    reviews_beta = reviews.iloc[review_index].sort_values('book_id')

    return books_beta, reviews_beta

In [5]:
n_result = 9

In [6]:
books_, reviews_ = findSemanticallySimilarReviews(query='Charlemagne',
                                               reviews=reviewsAll,
                                               books=books,
                                               n_books=10,
                                               sentence_array=reviews_array)

In [7]:
reviews_

Unnamed: 0,book_id,review_text
211406,28187,A reread. (I'm going to be rereading this seri...
146200,34498,"listening as always, stephen briggs is amazing"
324717,77203,This is a very emotional read. It was difficul...
420175,99561,"Great book! Very well written, so many good li..."
99869,110392,I liked Dunford I even liked Henry but the rus...
186002,7260188,I liked the first two books of the trilogy bet...
124971,7992995,Enjoyed the book despite the cover. Definitely...
40079,12218678,I did not care for this novel. I lost all resp...
167426,12408238,I needed a soothing book after reading Lords o...
360406,15717876,Tried a sample on amazon. It was too corny and...


# Topic Modelling

In [8]:
import sys
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come', ])


In [12]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)

def process_words(texts,bigram_mod, trigram_mod, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

def format_topics_sentences(corpus, texts, ldamodel=None,):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

def findTopic(reviewCluster):

    # Convert to list
    #data = reviewCluster.values.tolist()
    data_words = list(sent_to_words(reviewCluster))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    data_ready = process_words(data_words, bigram_mod=bigram_mod,trigram_mod=trigram_mod)
    id2word = corpora.Dictionary(data_ready)

    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_ready]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=4, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=10,
                                               passes=10,
                                               alpha='symmetric',
                                               iterations=100,
                                               per_word_topics=True)
    df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']


    sent_topics_sorteddf_mallet = pd.DataFrame()
    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                                 grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                                axis=0)

    # Reset Index    
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

    # Format
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

    # Show
    return sent_topics_sorteddf_mallet['Representative Text'].tolist()[0]

In [13]:
findTopic(reviews_['review_text'])

['enjoy',
 'book',
 'cover',
 'definitely',
 'triumph',
 'evil',
 'theme',
 'like',
 'little',
 'strong',
 'female',
 'heroine']