# sentences

## Set Up

Import modules.

In [None]:
import re

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import sklearn
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE

import bookcave
import train_embeddings

Load data.

In [None]:
# Load the dataset through `bookcave.get_data`, passing the set {'text', 'images'}.
# Of the nine returned values, only `Y`, `levels` and `book_ids` are used by the
# cells visible below; the last three are discarded.
# NOTE(review): presumably `text_input='filename'` makes the text inputs file
# paths rather than raw text -- confirm against `bookcave.get_data`.
inputs, Y, categories, levels, \
book_ids, books_df, _, _, _ = bookcave.get_data({'text', 'images'},
                                                text_input='filename',
                                                return_meta=True)

## Example

View the labels for an example document.

In [None]:
book_ids[20:30]

In [None]:
[(Y[20][i], levels[i][j]) for i, j in enumerate(Y[20])]

View a snippet of the text for this book.

In [None]:
# Read the whole example book into memory as one UTF-8 string.
# NOTE(review): `texts` is not defined in any cell visible above, and index 42
# differs from the example index 20 used for the labels -- presumably this
# should read the path for book 20 out of `inputs`; confirm before running.
with open(texts[42], 'r', encoding='utf-8') as fd:
    text = fd.read()

In [None]:
text[:200]

Split the text into lines.

In [None]:
lines = text.split('\n')

In [None]:
len(lines)

In [None]:
lines[:13]

Split the lines into sentences.

In [None]:
def get_sentences(lines, line_endings=None):
    """Yield every sentence found in `lines`, in document order.

    Each line is tokenized independently with `nltk.sent_tokenize`, so a
    sentence is never joined across a line break. `nltk` must be imported at
    module level for this generator to run.

    :param lines: Iterable of text lines.
    :param line_endings: Currently unused; kept so existing callers that pass
        it continue to work.
    :return: Generator of sentence strings.
    """
    for line in lines:
        # Delegate directly to the tokenizer's result instead of re-yielding
        # each sentence by hand.
        yield from nltk.sent_tokenize(line)

In [None]:
sentences = np.array(list(get_sentences(lines)))

In [None]:
len(sentences)

In [None]:
sentences[:20]

In [None]:
sentences[114]

In [None]:
normalized_example = re.sub("[“”]", '"', sentences[114])
normalized_example

In [None]:
nltk.sent_tokenize(normalized_example)

Tokenize the sentences.

In [None]:
def get_processed_sentences(sentences):
    """Yield `(sid, tokens)` pairs for every sentence that has at least one token.

    `sid` is the index of the sentence in `sentences`; `tokens` is the list
    produced by `gensim.utils.simple_preprocess`. Sentences that tokenize to
    nothing are skipped, so the yielded pairs can be sparser than `sentences`.
    """
    for index, raw_sentence in enumerate(sentences):
        words = list(gensim.utils.simple_preprocess(raw_sentence))
        if not words:
            # Nothing survived preprocessing; leave a gap in the sentence IDs.
            continue
        yield (index, words)

In [None]:
processed_sentences = list(get_processed_sentences(sentences))

In [None]:
len(processed_sentences)

In [None]:
processed_sentences[:23]

Load a pre-trained `doc2vec` model.

In [None]:
doc_model = train_embeddings.load_doc_model('docmodel_sentence_50d_8w_2min_20e.model')

Convert the sentences of the example document into fixed-length vectors.

In [None]:
# One row per processed (non-empty) sentence, one column per embedding dimension.
doc_vectors = np.zeros((len(processed_sentences), doc_model.vector_size))
# Row `i` lines up with `processed_sentences[i]`; `sid` (the index into
# `sentences`) is not needed while inferring vectors.
for i, (sid, processed_sentence) in enumerate(processed_sentences):
    doc_vectors[i] = doc_model.infer_vector(processed_sentence)

Visualize the sentence embeddings using t-SNE.

In [None]:
sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS

In [None]:
doc_embedded = TSNE(n_components=2, metric='euclidean').fit_transform(doc_vectors)
doc_embedded.shape

In [None]:
plt.scatter(doc_embedded[:, 0], doc_embedded[:, 1])
plt.show()

View a clustering of the t-SNE results.

In [None]:
colors = ['red', 'blue', 'green', 'yellow', 'magenta', 'cyan', 'brown']

In [None]:
def plot_clusters(points, labels):
    """Scatter-plot 2-D `points`, coloring each by its cluster label.

    `labels[k]` selects the entry of the module-level `colors` list used for
    `points[k]`, so every label must be a valid index into `colors`.
    """
    xs = points[:, 0]
    ys = points[:, 1]
    point_colors = []
    for cluster_label in labels:
        point_colors.append(colors[cluster_label])
    plt.scatter(xs, ys, c=point_colors)
    plt.show()

In [None]:
# Single-linkage agglomerative clustering of the 2-D t-SNE points into 3 clusters.
# NOTE(review): scikit-learn 1.2 deprecated the `affinity` argument in favour of
# `metric` (and later removed it) -- confirm the installed version still accepts
# it; the same applies to the other AgglomerativeClustering cells.
clustering_embedded_single = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='single').fit(doc_embedded)
plot_clusters(doc_embedded, clustering_embedded_single.labels_)

In [None]:
clustering_embedded_complete = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete').fit(doc_embedded)
plot_clusters(doc_embedded, clustering_embedded_complete.labels_)

In [None]:
clustering_embedded_average = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='average').fit(doc_embedded)
plot_clusters(doc_embedded, clustering_embedded_average.labels_)

In [None]:
clustering_embedded_ward = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward').fit(doc_embedded)
plot_clusters(doc_embedded, clustering_embedded_ward.labels_)

Now cluster the full-dimensional sentence vectors directly, and plot the resulting labels on the t-SNE projection.

In [None]:
clustering_full_3_single = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='single').fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_3_single.labels_)

In [None]:
clustering_full_3_complete = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete').fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_3_complete.labels_)

In [None]:
clustering_full_3_average = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='average').fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_3_average.labels_)

In [None]:
clustering_full_3_ward = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward').fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_3_ward.labels_)

Try 4 clusters, this time using cosine affinity.

In [None]:
clustering_full_4_single = AgglomerativeClustering(n_clusters=4, affinity='cosine', linkage='single').fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_4_single.labels_)

In [None]:
clustering_full_4_complete = AgglomerativeClustering(n_clusters=4, affinity='cosine', linkage='complete').fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_4_complete.labels_)

In [None]:
clustering_full_4_average = AgglomerativeClustering(n_clusters=4, affinity='cosine', linkage='average').fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_4_average.labels_)

What kinds of sentence clusters are we seeing above?

In [None]:
def print_sample_sentences(clustering, size=10):
    """Print a random sample of original sentences from each cluster.

    Reads the module-level `colors`, `processed_sentences` and `sentences`.

    :param clustering: Fitted clustering exposing `n_clusters` and `labels_`,
        where `labels_[k]` is the cluster of `processed_sentences[k]`.
    :param size: Maximum number of sentences to print per cluster. Clusters
        with fewer members print all of them instead of raising.
    """
    for cluster_index in range(clustering.n_clusters):
        print('Cluster {:d} (`{}`):'.format(cluster_index + 1, colors[cluster_index]))
        # Map positions in `processed_sentences` back to indices in `sentences`.
        sids = [processed_sentences[psid][0]
                for psid, label in enumerate(clustering.labels_)
                if label == cluster_index]
        if not sids:
            continue
        cluster_sentences = sentences[np.array(sids)]
        # Sampling without replacement fails when asked for more items than
        # exist, which is common for the tiny clusters single linkage produces.
        sample_size = min(size, len(cluster_sentences))
        for sentence in np.random.choice(cluster_sentences, size=sample_size, replace=False):
            print('  {}'.format(sentence))

In [None]:
# `clustering_full_4` is never defined above; sample from the average-linkage
# 4-cluster fit (swap in `_single` or `_complete` to inspect those instead).
print_sample_sentences(clustering_full_4_average, size=5)

Try 5 clusters with Ward linkage.

In [None]:
clustering_full_5 = AgglomerativeClustering(n_clusters=5, linkage='ward').fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_5.labels_)

In [None]:
print_sample_sentences(clustering_full_5, size=5)