In [1]:
import os
import glob
import itertools
import warnings
import pandas as pd
from umap import UMAP
from sklearn.cluster import DBSCAN
import tensorflow as tf
import tensorflow_hub as hub
import plotly.express as px


# Point TF-Hub at a local cache directory so the model is only downloaded once
os.environ['TFHUB_CACHE_DIR'] = 'sentence_encoder'
ENCODER_URL = 'https://tfhub.dev/google/universal-sentence-encoder/4'

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sort the
# paths so anything that indexes into this list picks the same file on every run
HP_TEXT_CSV_PATHS = sorted(glob.glob('data/parsed_books/*'))

# Will take a while to run the first time when model downloads
# after that, it will be in a local directory and load quicker
sentence_encoder = hub.load(ENCODER_URL)

In [2]:
def count_alpha(string):
    """Return how many characters of ``string`` are alphabetic (per ``str.isalpha``)."""
    total = 0
    for ch in string:
        if ch.isalpha():
            total += 1
    return total


def chunker(seq, size):
    """Lazily produce consecutive slices of ``seq`` holding at most ``size`` items.

    :param seq: any object supporting ``len`` and slice indexing
    :param size: (int) maximum number of items per slice; the last slice may be shorter
    :return: generator over the slices, in order
    """
    # Source: https://stackoverflow.com/a/434328/5731525
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)


def sentence_embeddings_df(text, chunk_size=500):
    """Process text with universal-sentence-encoder to a pd.DataFrame

    Uses the module-level ``sentence_encoder`` and encodes ``text`` in chunks of
    ``chunk_size`` documents so that it is not all passed to the encoder at once.

    :param text: (list-like) text to process; must support ``len`` and slicing
                 (e.g. a list or a pd.Series)
    :param chunk_size: (int) how many documents from text to process at once
    :return: pd.DataFrame with one row per document and a fresh 0..n-1 index.
             The first column holds the input text, followed by one column per
             embedding dimension (integer labels starting at 0).
             The text column is labelled ``text.name`` when that is set (e.g. a
             named pd.Series) and 'text' otherwise, so it never shares the
             integer label 0 with the first embedding column.
             Empty input returns an empty frame containing only the text column.
    """
    # A plain list / unnamed Series would otherwise produce a text column
    # labelled 0, duplicating the label of the first embedding column.
    text_col = getattr(text, 'name', None)
    if text_col is None:
        text_col = 'text'

    # pd.concat raises ValueError on an empty list, so handle "nothing to embed" up front
    if len(text) == 0:
        return pd.DataFrame(columns=[text_col])

    embedding_dfs = []
    for text_chunk in chunker(text, chunk_size):
        # Actual embedding
        embeddings = sentence_encoder(text_chunk).numpy()

        # Combine into single df; both frames get a default 0..k-1 index so the
        # column-wise concat lines rows up by position within the chunk
        chunk_text_df = pd.DataFrame({text_col: list(text_chunk)})
        chunk_embedding_df = pd.DataFrame(embeddings)
        embedding_dfs.append(pd.concat((chunk_text_df, chunk_embedding_df), axis=1))

    return pd.concat(embedding_dfs).reset_index(drop=True)


def insert_newlines(string, every=20, line_break='</br>'):
    """Break ``string`` into fixed-width pieces joined by ``line_break``.

    The split is purely positional, so a piece may end in the middle of a word.

    :param string: (str) text to split
    :param every: (int) number of characters per piece; the last piece may be shorter
    :param line_break: (str) separator placed between consecutive pieces
    :return: (str) the pieces joined with ``line_break``
    """
    # Source: https://stackoverflow.com/a/2657733/5731525
    pieces = [string[start:start + every] for start in range(0, len(string), every)]
    return line_break.join(pieces)

In [3]:
# Let's just look at the philosopher's stone
filtered_csv_paths = [p for p in HP_TEXT_CSV_PATHS if 'stone' in p]
if not filtered_csv_paths:
    # Fail with an actionable message instead of a bare IndexError on [0]
    raise FileNotFoundError(
        "No path containing 'stone' among the {} path(s) in HP_TEXT_CSV_PATHS".format(
            len(HP_TEXT_CSV_PATHS)
        )
    )
csv_path = filtered_csv_paths[0]

In [4]:
# Read the selected CSV, discard rows with missing values and renumber from 0
hp_book_df = pd.read_csv(csv_path).dropna().reset_index(drop=True)
hp_book_df['sentence'] = hp_book_df['sentence'].str.strip()

# Keep only sentences containing at least `min_alpha` alphabetic characters
min_alpha = 5
char_filter = hp_book_df['sentence'].map(count_alpha).ge(min_alpha)
hp_book_df = hp_book_df[char_filter]

# Approximate the word count as (number of spaces + 1) and require `min_words`
min_words = 4
char_filter = hp_book_df['sentence'].str.count(' ').ge(min_words - 1)
hp_book_df = hp_book_df[char_filter]

print(hp_book_df.shape)
hp_book_df.head()

(6023, 3)


Unnamed: 0,chapter_num,sentence,file_name
0,1,THE BOY WHO LIVED,philosophers_stone.txt
1,1,"Mr. and Mrs. Dursley, of number four, Privet D...",philosophers_stone.txt
2,1,They were the last people you'd expect to be i...,philosophers_stone.txt
3,1,Mr. Dursley was the director of a firm called ...,philosophers_stone.txt
4,1,"He was a big, beefy man with hardly any neck, ...",philosophers_stone.txt


In [5]:
# Embed every sentence that survived filtering, then preview the first rows
embeddings_df = sentence_embeddings_df(text=hp_book_df['sentence'])
embeddings_df.head(n=3)

Unnamed: 0,sentence,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,THE BOY WHO LIVED,-0.005383,-0.023493,-0.028015,-0.006081,-0.063641,-0.022133,-0.012766,-0.006723,0.055432,...,0.023178,-0.104541,-0.049622,-0.009009,0.044781,0.032995,0.038262,-0.005083,0.089757,0.016945
1,"Mr. and Mrs. Dursley, of number four, Privet D...",0.014198,-0.037407,-0.003704,0.032097,-0.006503,0.00879,0.010117,0.022652,-0.106851,...,0.050423,0.034595,0.087096,0.025503,-0.034519,-0.000702,0.004414,-0.002548,0.095096,0.0186
2,They were the last people you'd expect to be i...,-0.075628,0.027473,-0.023312,-0.046218,-0.026587,-0.058913,0.003036,-0.006074,-0.11947,...,0.045194,-0.005083,0.058767,0.043894,0.021969,-0.037246,-0.050596,0.001576,0.065681,0.001555


In [6]:
# Feature matrix: every column except the raw text
X = embeddings_df.drop(columns=['sentence'])

# This dude is loud...
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # UMAP is stochastic: without a fixed random_state the 2D layout changes on
    # every run, and so does any distance-based step applied to it afterwards
    mapper = UMAP(n_neighbors=20, n_components=2, random_state=42)
    X_2d = mapper.fit_transform(X)

embeddings_2d_df = pd.DataFrame(X_2d, columns=['x1', 'x2'])
embeddings_2d_df['sentence'] = embeddings_df['sentence']

# Insert html line breaks for display in plotly
# Will not respect word boundaries (ie might insert linebreak mid-word)
embeddings_2d_df['sentence'] = embeddings_2d_df['sentence'].apply(insert_newlines)
embeddings_2d_df.head()

Unnamed: 0,x1,x2,sentence
0,6.184361,-2.073796,THE BOY WHO LIVED
1,7.351818,-7.161103,"Mr. and Mrs. Dursley</br>, of number four, Pr<..."
2,6.94429,-3.49503,They were the last p</br>eople you'd expect t<...
3,7.847268,-6.352746,Mr. Dursley was the </br>director of a firm c<...
4,9.042587,-3.287453,"He was a big, beefy </br>man with hardly any <..."


In [7]:
# Cluster on the two projected coordinates only
X = embeddings_2d_df[['x1', 'x2']]

# fit() returns the estimator itself, so construction and fitting can be chained
clst = DBSCAN(eps=0.1, min_samples=15).fit(X)

# One label per row, in the same row order as X
embeddings_2d_df['clst_label'] = clst.labels_
embeddings_2d_df.head()

Unnamed: 0,x1,x2,sentence,clst_label
0,6.184361,-2.073796,THE BOY WHO LIVED,-1
1,7.351818,-7.161103,"Mr. and Mrs. Dursley</br>, of number four, Pr<...",-1
2,6.94429,-3.49503,They were the last p</br>eople you'd expect t<...,-1
3,7.847268,-6.352746,Mr. Dursley was the </br>director of a firm c<...,-1
4,9.042587,-3.287453,"He was a big, beefy </br>man with hardly any <...",-1


In [11]:
# Remove DBSCAN's outlier cluster
# (.copy() so the label rewrite below does not touch embeddings_2d_df)
plot_df = embeddings_2d_df[embeddings_2d_df['clst_label'] != -1].copy()

# Make cluster label a string so plotly colors like categorical rather than continuous
plot_df['clst_label'] = "'" + plot_df['clst_label'].astype(str) + "'"

# Title typo fixed: "Philospher's" -> "Philosopher's"
fig = px.scatter(data_frame=plot_df,
                 x='x1',
                 y='x2',
                 color='clst_label',
                 hover_name='sentence',
                 title="Sentences from: Harry Potter and The Philosopher's Stone<br>"
                       "<sub>Processed with Universal Sentence Encoder -> "
                       "Dim Reduced with UMAP -> Clustered with DBSCAN</sub>")

fig.update_layout(showlegend=False)
fig.show()