In [7]:
import numpy as np
import pandas as pd

# To process embeddings
import tensorflow_hub as hub
from langdetect import detect
from pathlib import Path
from nltk.tokenize import sent_tokenize

# To create recommendations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# To create sentence clusters
from sklearn.cluster import KMeans

# To load saved embeddings
import joblib

# To match strings
import re

# To create webapp
import psutil
import streamlit as st
from streamlit import caching


# To sort final recommendation list
from collections import Counter


def dataLoader(datapath, books_file, reviewsAll_file, tokenized_descriptions_file):
    '''
    Loads DataFrames with books and review information

    Args:
        datapath = Path/to/directory/with/data
        books_file = name of CSV with book metadata
        reviewsALL_file = name of CSV with filtered reviews --> Generally, this is the file that will be used in the app.
    '''
    books = pd.read_csv(datapath + books_file).drop('Unnamed: 0', axis=1).fillna('')
    reviewsAll = pd.read_csv(datapath + reviewsAll_file).drop('Unnamed: 0', axis=1)
    description_sentences = pd.read_csv(datapath + tokenized_descriptions_file)
    return books, reviewsAll, description_sentences


def loadEmbeddings(datapath):
    '''
    Loads pre-trained sentence and review arrays

    Args:
        datapath = Path/to/directory/with/data
    '''
    # Path to USE
    embed = hub.load(datapath + 'tensorflow_hub/universal-sentence-encoder_4')

    # Load pre-trained sentence arrays
    ## Reviews array is a set of embeddings trained on review lengths of < 90 characters
    #reviews_array = joblib.load(datapath + 'Models/reviewEmbeddings.pkl')
    # Descriptions array is a set of embeddings trained on all book descriptions
    descriptions_array = joblib.load('/media/einhard/Seagate Expansion Drive/3380_data/data/Models/descriptionsTokeniziedEmbedding.pkl')

    return embed, descriptions_array # reviews_array,

#################### Basic Clustering Functionality ####################


def embedInputs(books_df, review_df, search_param, review_max_len, searchTitle=True):
    '''
    Converts input reviews into USE arrays. Returns vectorized reviews for the book that was
    passed as bookTitle.

    Args:
        search_param = List of book titles or author names whose reviews we want to embed. For authors, see 'searchTitle' argument
        books_df = DataFrame with book_id information
        review_df = DataFrame with book_id and review text
        searchTitle = If True, will search for book_id based on title of a book. If False, it will look for author names to find book_id.
    '''
    if searchTitle:
        #Finds book_id from title of book
        input_book_id = books_df[books_df.title.isin([search_param])].book_id.tolist()
        author_name = 'VariousAuthors/'
    else:
        # Finds book_id from author name
        input_book_id = books_df[books_df.name.isin([search_param])].book_id.tolist()
        author_name = books_df[books_df.name.isin([search_param])].name.iloc[0]

    # Finds reviews for specified book
    # input_sentences = review_df[review_df.book_id.isin(input_book_id)].review_text
    input_sentences = cleanAndTokenize(review_df[review_df.book_id.isin(input_book_id)], tokenizedData, searchTitle=searchTitle, author=author_name).review_text

    # Filters review length
    input_sentences = input_sentences[input_sentences.str.len() <= review_max_len]

    # Converts reviews into 512-dimensional arrays
    input_vectors = embed(input_sentences)

    # Returns reviews and vectorized reviews for a particular book
    return input_sentences, input_vectors

# Pass "sentence_array" as input_vectors to compare with user input???????

def getClusters(input_vectors, n_clusters):
    '''
    Creates KMeans instance and fits model.

    The for nested for loop is used to display the returned sentences on Streamlit.

    Args:
        input_sentences =  Sentences to compare
        n_clusters = How many clusters to generate
    '''
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, algorithm='full')
    return kmeans.fit(input_vectors)


def showClusters(input_sentences, input_vectors, authorTitle, n_clusters, n_results, model, searchTitle=True):
    '''
    This function will find theme clusters in the reviews of a particular book or set of sentences.
    Uses cluster centers to find semantically similar sentences to the input vectors.

    The nested for loop is used to display the returned sentences on Streamlit.

    Args:
        input_sentences =  Sentences to compare
        input_vectors = USE Array generated by embedding input sentences
        authorTitle = Title of book in question, or name of author --> Used to display header only
        n_clusters = How many clusters to generate
        n_results = How many sentences to display per n_cluster
        model = The model used to create the clusters.
    '''
    if searchTitle:
        # Displays which book's reviews are being clustered
        st.header(f'Opinion clusters about *{authorTitle}*')
    else:
        st.header(f'Opinion clusters about {authorTitle}\'s books')

    # Iterates through centroids to and computes inner products to find nlargest
    for i in range(n_clusters):
        centre = model.cluster_centers_[i]
        inner_product = np.inner(centre, input_vectors)
        indices = pd.Series(inner_product).nlargest(n_results).index
        clusteredInputs = list(input_sentences.iloc[indices])

        # Writes reviews that are closest to centroid
        st.markdown('---')
        collapseCluster = st.beta_expander(f'Opinion cluster #{i+1}', expanded=True)
        with collapseCluster:
            for sentence in clusteredInputs:
                st.write(sentence)


#################### Tokenizing and saving data for embedding ####################


def clean_reviews(df):
    '''
    Copyright (c) 2020 Willie Costello
    '''
    # Drop duplicates
    df.drop_duplicates(inplace=True)

    #Fill nans
    df.fillna('la maquina solo puede en ingles', inplace=True)

    # Define spoiler marker & remove from all reviews
    spoiler_str_ucsd = '\*\* spoiler alert \*\* \n'
    df['review_text'] = df['review_text'].str.replace(spoiler_str_ucsd, '')

    # Replace all new line characters
    df['review_text'] = df['review_text'].str.replace('\n', ' ')

    # Append space to all sentence end characters
    df['review_text'] = df['review_text'].str.replace('.', '. ').replace('!', '! ').replace('?', '? ')

    # Initialize dataframe to store English-language reviews
    reviews_df = pd.DataFrame()

    # Loop through each row in dataframe
    for i in range(len(df)):

        # Save review to variable
        review = df.iloc[i]['review_text']
        try:
            # Check if review is English
            if detect(review) == 'en':
                # If so, add row to English-language dataframe
                reviews_df = reviews_df.append(df.iloc[i, :])
        except:
            continue

    reviews_df.book_id = reviews_df.book_id.astype(int)
    return reviews_df


def make_sentences(reviews_df):
    '''
    Copyright (c) 2020 Willie Costello
    '''
    # Initialize dataframe to store review sentences, and counter
    sentences_df = pd.DataFrame()

    # Loop through each review
    for i in range(len(reviews_df)):

        # Save row and review to variables
        row = reviews_df.iloc[i]
        review = row.loc['review_text']

        # Tokenize review into sentences
        sentences = sent_tokenize(review)

        # Loop through each sentence in list of tokenized sentences
        for sentence in sentences:
            # Add row for sentence to sentences dataframe
            new_row = row.copy()
            new_row.at['review_text'] = sentence
            sentences_df = sentences_df.append(new_row, ignore_index=True)

    sentences_df = sentences_df[(sentences_df.review_text.str.len() >= 20) & (sentences_df.review_text.str.len() <= 350)]
    return sentences_df


def cleanAndTokenize(df, filepath, searchTitle, author):
    '''
    Helper function that first checks if a CSV file has already been generated for that book or author.
    If not, runs another cleaning pass on the set of reviews and tokenizes text into sentences before
    saving a CSV file to speed up recalls for the same book alter.

    Returns DataFrame with tokenized reviews for a particular author or book.

    Args:
        df          = dataframe to tokenize. It will save and load based on the first book_id if searching titles, or based on the author passed as an argument.
        filepath    = Path to output folder. If if a file exits, it will look in this path for the CSV files. Subdirectories must be made for ./book_id/ and for ./author/
        searchTitle = Specify whether the reviews for an individual book should be tokenized or the all reviews for an author.
        author      = If searchTitle=False, the "author" argument is used for saving and loading files. Note that it has no effect on the tokenization itself.
    '''

    if searchTitle:
        if Path(filepath + 'app_data/book_id/' + df.book_id.iloc[1].astype(str) + '.csv').is_file():
            sentences_df = pd.read_csv(filepath + 'app_data/book_id/' + df.book_id.iloc[1].astype(str) + '.csv').drop('Unnamed: 0', axis=1)
        else:
            reviews_df = clean_reviews(df.fillna('la maquina no puede en ingles'))
            sentences_df =  make_sentences(reviews_df)
            sentences_df.to_csv(filepath + 'app_data/book_id/' + df.book_id.iloc[1].astype(str) + '.csv')
        sentences_df.book_id = sentences_df.book_id.astype(int)
        return sentences_df

    else:
        if Path(filepath + 'app_data/author/' + author + '.csv').is_file():
            sentences_df = pd.read_csv(filepath + 'app_data/author/' + author + '.csv').drop('Unnamed: 0', axis=1)
        else:
            reviews_df = clean_reviews(df.fillna('la maquina no puede en ingles'))
            sentences_df =  make_sentences(reviews_df)
            sentences_df.to_csv(filepath + 'app_data/author/' + author + '.csv')
        sentences_df.book_id = sentences_df.book_id.astype(int)

        return sentences_df



#################### Searching based on description or review ####################

def findSimilarity(input_text, df, searchDescription):
    pass

def searchBookTitles(input_text, reviews, books, n_clusters, n_cluster_reviews):
    pass

def semanticSearch(input_text, n_books):
    query = embed([input_text])
    result = np.inner(descriptions_array, query)
    iloc_desc = pd.DataFrame(result).sort_values(0, ascending=False).reset_index().rename({'index':'position'}, axis=1)['position'].tolist()
    # iloc_books = description_sentences.iloc[iloc_desc][:10].book_id.tolist()
    book_title = description_sentences.iloc[iloc_desc][:n_books].title.tolist()
    return book_title

# Basic TF-IDF cosine similarity engine

def createSimilarities(books_df):
    '''
    Creates a similarity matrix for book recommendations based on description.
    Returns similarity matrix and mapping for book-finding

    Args:
        books_df = DataFrame with books and descriptions
    '''
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(books['description'])
    cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
    mapping = pd.Series(books.index,index = books['title'])
    return cosine_similarities, mapping


def bookRecommendation(book_title, mapping, cosine_similarities, n_books):
    '''
    Function to match input book with recommended books using cosine similarity matrix.

    Args:
        book_title          = Input book title. Recommendations will be made based on this.
        mapping             = Mapping of cosine similarity matrix to book indices
        cosine_similarities = Similarity matrix for content based recommendation
        n_books             = How many books to recommend
    '''
    book_index = mapping[book_title]
    similarity_score = list(enumerate(cosine_similarities[book_index]))
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    similarity_score = similarity_score[1:n_books+1]
    book_indices = [i[0] for i in similarity_score]
    return (books['title'].iloc[book_indices])


##### Experimental functionality, WIP #####

#
#def findSemanticallySimilarReviews(query,reviews, books, sentence_array,  n_books):
#    # Create vector from query and compare with global embedding
#    sentence = [query]
#    sentence_vector = np.array(embed(sentence))
#    inner_product = np.inner(sentence_vector, sentence_array)[0]
#
#    # Find sentences with highest inner products
#    top_n_sentences = pd.Series(inner_product).nlargest(n_books+1)
#    top_n_indices = top_n_sentences.index.tolist()
#    book_titles = books[books.book_id.isin(reviews.iloc[top_n_indices].book_id.tolist())].title.tolist()
#
#    return book_titles, reviews.iloc[top_n_indices].index



#################### App UI and Interactions ####################


def showInfo(iterator, n_clusters, n_results,n_books, review_max_len=350):
    with results:
        for idx, i in enumerate(iterator[:n_books]):
            try:
                info = books[books.title == i]
                '**---**'
                '**Book title:**', f'*{info.title.tolist()[0]}*'
                '**Author:**', info.name.tolist()[0]
                '**Weighted Score**', str(round(info.weighted_score.tolist()[0], 2)), '/ 5'

                showDescription = st.beta_expander(label='Show description?')
                showReviewClusters = st.button(label='Show opinion clusters for this book?', key=idx)
                showAuthorClusters = st.button(label='Show opinion clusters for this author?', key=idx+100)
            #    showSimilarBooks = st.button(label='Show similar books?', key=idx+555)
            except IndexError:
                break
            with showDescription:
                st.write(info.description.tolist()[0])

            if showReviewClusters:
                with clusters:
                    try:
                        input_sentences, input_vectors = embedInputs(books,
                                                                    reviewsAll,
                                                                    search_param=info.title.tolist()[0],
                                                                    review_max_len=review_max_len,
                                                                    searchTitle=True)
                        model = getClusters(input_vectors=input_vectors,
                                            n_clusters=n_clusters)
                        showClusters(input_sentences=input_sentences,
                                    input_vectors=input_vectors,
                                    authorTitle = info.title.tolist()[0],
                                    n_clusters=n_clusters,
                                    n_results=n_results,
                                    model=model,
                                    searchTitle=True)
                    except ValueError:
                        print(f"It looks like this book doesn't have enough reviews to generate {n_clusters} distinct clusters. Try decreasing how many clusters you look for!")
                        continue
            if showAuthorClusters:
                with clusters:
                    try:
                        input_sentences, input_vectors = embedInputs(books,
                                                                    reviewsAll,
                                                                    search_param=info.name.tolist()[0],
                                                                    review_max_len=review_max_len,
                                                                    searchTitle=False)
                        model = getClusters(input_vectors=input_vectors,
                                            n_clusters=n_clusters)
                        showClusters(input_sentences=input_sentences,
                                    input_vectors=input_vectors,
                                    authorTitle = info.name.tolist()[0],
                                    n_clusters=n_clusters,
                                    n_results=n_results,
                                    model=model,
                                    searchTitle=False)
                    except ValueError:
                        print(f"It looks like this author's books don't have enough reviews to generate {n_clusters} distinct clusters. Try decreasing how many clusters you look for!")
                        continue

            ##### Experimental functionality, WIP #####

            #if showSimilarBooks:
            #    showInfo(iterator=info.title.tolist()[0],
            #            n_clusters=n_clusters,
            #            n_results=n_results,
            #            n_books=n_books,
            #            review_max_len=review_max_len)

            if goodreadsLink:
                good_reads_link = goodreadsURL + info.book_id.astype(str).tolist()[0]
                print(f'*Goodreads Link: {good_reads_link}*')

In [3]:
# Paths to books and reviews DataFrames
datapath = '/media/einhard/Seagate Expansion Drive/3380_data/data/'

# Stores tokenized reviews so they only need to be processed the first time that particular book is called
tokenizedData = '/media/einhard/Seagate Expansion Drive/3380_data/data/'
books_file = 'Filtered books/clean_filtered_books.csv'
reviewsAll_file = 'Filtered books/reviews_for_cluster.csv'
descriptions_tokenized_file = 'Filtered books/description_sentences.csv'

# Loading DataFrames
books, reviewsAll, description_sentences = dataLoader(datapath, books_file, reviewsAll_file, descriptions_tokenized_file)

# Loadding pre-trained embeddings and embedder for input sentences
embed, descriptions_array = loadEmbeddings(datapath) # , reviews_array, descriptions_array

# Setting base URL for Goodreads
goodreadsURL = 'https://www.goodreads.com/book/show/'

2020-12-02 16:48:35.474 INFO    absl: resolver HttpCompressedFileResolver does not support the provided handle.
2020-12-02 16:48:35.475 INFO    absl: resolver GcsCompressedFileResolver does not support the provided handle.
2020-12-02 16:48:35.476 INFO    absl: resolver HttpUncompressedFileResolver does not support the provided handle.


In [8]:
results, clusters = st.beta_columns(2)
goodreadsLink = st.checkbox('Show Goodreads links.')
#book_title = books[books.title.str.contains(input_text, case=False)].sort_values('weighted_score', ascending=False).title.tolist()[0]
book_title = semanticSearch('Gone Girl', 5)
# with results:
#     # st.markdown(f'## Book recommendations based on *{book_title}*')
#     st.markdown('## Book recommendations based on your input:')
#cosine_similarities, mapping = createSimilarities(books)
#book_recommends = bookRecommendation(book_title=book_title,
#                                    mapping=mapping,
#                                    cosine_similarities=cosine_similarities,
#                                    n_books=n_books)
showInfo(iterator=book_title, #book_recommends,
         n_clusters=8,
         n_results=8,
         n_books=5,
         review_max_len=350)

In [11]:
input_sentences, input_vectors = embedInputs(books,
                                            reviewsAll,
                                            search_param=['Gone Girl'],
                                            review_max_len=350,
                                            searchTitle=False)
model = getClusters(input_vectors=input_vectors,
                    n_clusters=n_clusters)
showClusters(input_sentences=input_sentences,
            input_vectors=input_vectors,
            authorTitle = ['Gillian Flynn'],
            n_clusters=8,
            n_results=8,
            model=model,
            searchTitle=False)

IndexError: single positional indexer is out-of-bounds