# lines

## Set Up

Import modules.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import nltk
import gensim

import bookcave
import preprocessing
import train_embeddings

Load data.

In [None]:
# Fetch the text inputs (as file names) and labels for the selected
# categories; the last three returned values are not needed here.
(inputs, Y, categories, levels,
 book_ids, books_df, _, _, _) = bookcave.get_data(
    {'text'},
    text_input='filename',
    only_categories={1, 3, 5, 6},
    return_meta=True,
)
# Each entry of `texts` is opened as a file path further down.
texts = inputs['text']

## Example

View the labels for an example document.

In [None]:
# Position of the first book whose ID equals 'harleigh'.
example_index = np.where(book_ids == 'harleigh')[0][0]
example_index

In [None]:
# Pair every label value of the example book with the entry it selects
# from the matching row of `levels`.
[(label, levels[position][label]) for position, label in enumerate(Y[example_index])]

View a snippet of the text for this book.

In [None]:
# Read the whole example book into a single string (text mode is the default).
with open(texts[example_index], encoding='utf-8') as fd:
    text = fd.read()

In [None]:
# Preview the first 200 characters of the book.
text[:200]

Split the text into lines.

In [None]:
# Split on '\n' only, so blank lines are kept as empty strings.
lines = text.split('\n')

In [None]:
# Number of raw lines in the book.
len(lines)

In [None]:
# Show the first 11 raw lines.
lines[:11]

In [None]:
# Keep the raw lines at indices 1403-1426; presumably a run of unwanted
# lines found by inspection.
# NOTE(review): `bad_lines` is not referenced again in the visible cells.
bad_lines = lines[1403:1427]

Tokenize the lines.

In [None]:
# Tokenize every raw line with NLTK's Treebank tokenizer. The filtering and
# normalization options are interpreted by `preprocessing.process_lines`,
# which is not shown in this notebook.
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
processed_lines = list(
    preprocessing.process_lines(
        tokenizer,
        lines,
        lower=True,
        sentences=False,
        endings={'.', '?', ')', '!', ':', '-', '"', ';', ',', '\''},
        min_len=5,
        normal=True,
    )
)

In [None]:
# Number of lines that remain after processing.
len(processed_lines)

In [None]:
# Show the first two processed lines.
processed_lines[:2]

Load a pre-trained `doc2vec` model.

In [None]:
# Load the previously trained doc2vec model by file name; how the name is
# resolved to a path is up to `train_embeddings.load_doc_model` (not shown).
doc_model = train_embeddings.load_doc_model('docmodel_line_treebank_150d_8w_2min_16e.model')

Convert the lines of the example document into fixed-length vectors.

In [None]:
# One row per processed line, one column per embedding dimension.
doc_vectors = np.zeros((len(processed_lines), doc_model.vector_size))
for row_index, tokens in enumerate(processed_lines):
    # Infer the embedding of a single tokenized line and store it in its row.
    doc_vectors[row_index, :] = doc_model.infer_vector(tokens)

Visualize the line embeddings using t-SNE.

In [None]:
# Display scikit-learn's registry of pairwise distance functions; presumably
# used to check which metric names (e.g. 'cosine') are available.
sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS

In [None]:
# Project the line vectors down to two dimensions with t-SNE, measuring
# distances between vectors with the cosine metric.
tsne = TSNE(n_components=2, metric='cosine')
doc_embedded = tsne.fit_transform(doc_vectors)
doc_embedded.shape

In [None]:
# Plot the two t-SNE coordinates of every line against each other.
plt.scatter(x=doc_embedded[:, 0], y=doc_embedded[:, 1])
plt.show()

View a clustering of the t-SNE results.

In [None]:
# Color per cluster label: `plot_clusters` and `print_sample_sentences` index
# this list by label, so at most 7 distinct clusters can be shown.
colors = ['red', 'blue', 'green', 'yellow', 'magenta', 'cyan', 'brown']

In [None]:
def plot_clusters(points, labels):
    """Scatter-plot 2-D points, coloring each one by its cluster label.

    Args:
        points: Array whose first two columns are the x and y coordinates.
        labels: One integer label per point; each label is used as an index
            into the notebook-level `colors` list.
    """
    point_colors = [colors[cluster] for cluster in labels]
    plt.scatter(x=points[:, 0], y=points[:, 1], c=point_colors)
    plt.show()

In [None]:
# Group the 2-D t-SNE points into 3 clusters (cosine affinity, average linkage).
clustering_embedded = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='average')
clustering_embedded.fit(doc_embedded)
plot_clusters(doc_embedded, clustering_embedded.labels_)

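Now cluster the full, multi-dimensional line vectors directly (instead of the 2-D t-SNE output), trying single, average, and complete linkage. The plots still use the t-SNE coordinates; only the colors come from the new cluster labels.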

In [None]:
# 3 clusters on the full vectors: cosine affinity, single linkage.
clustering_full_3_single = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='single')
clustering_full_3_single.fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_3_single.labels_)

In [None]:
# 3 clusters on the full vectors: cosine affinity, average linkage.
clustering_full_3_average = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='average')
clustering_full_3_average.fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_3_average.labels_)

In [None]:
# 3 clusters on the full vectors: cosine affinity, complete linkage.
clustering_full_3_complete = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
clustering_full_3_complete.fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_3_complete.labels_)

Find some bad lines in the example book.

In [None]:
# Inclusive index range into `processed_lines` that is highlighted as "bad"
# in the following cell.
# NOTE(review): presumably the processed counterpart of `lines[1403:1427]`
# (both spans cover 24 items) -- confirm.
bad_processed_lines_start = 1328
bad_processed_lines_end = 1351

In [None]:
# Label 1 marks processed lines inside the inclusive "bad" range and label 0
# everything else; `plot_clusters` turns the labels into colors.
bad_labels = [
    int(bad_processed_lines_start <= index <= bad_processed_lines_end)
    for index in range(len(processed_lines))
]
plot_clusters(doc_embedded, bad_labels)

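Try the same thing using cosine distance (with average linkage) instead of euclidean distance. Note that the clusterings above already use the cosine affinity, so this cell repeats the `clustering_full_3_average` configuration.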

In [None]:
# 3 clusters on the full vectors: cosine affinity, average linkage.
clustering_full_3 = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='average')
clustering_full_3.fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_3.labels_)

Try 4 clusters, this time with Ward linkage (which works on euclidean distances).

In [None]:
# 4 clusters on the full vectors with Ward linkage; no affinity is passed,
# so scikit-learn's default is used.
clustering_full_4 = AgglomerativeClustering(n_clusters=4, linkage='ward')
clustering_full_4.fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_4.labels_)

What kinds of line clusters are we seeing above?

In [None]:
def print_sample_sentences(clustering, size=10, processed=None, palette=None):
    """Print a random sample of the tokenized lines in each cluster.

    The earlier version looked lines up through `processed_sentences` and
    `sentences`, neither of which is defined in this notebook, so calling it
    raised a `NameError`. Samples are now drawn from `processed_lines`, the
    collection the clustered vectors were inferred from.

    Args:
        clustering: Fitted clustering object exposing `n_clusters` and
            `labels_`; `labels_[i]` must be the cluster of `processed[i]`.
        size: Maximum number of lines printed per cluster. Clusters with
            fewer members are printed in full instead of raising.
        processed: Sequence of tokenized lines aligned with
            `clustering.labels_`. Defaults to the notebook-level
            `processed_lines`.
        palette: Color names used to tag each cluster in the output.
            Defaults to the notebook-level `colors`.
    """
    if processed is None:
        processed = processed_lines
    if palette is None:
        palette = colors

    for cluster_index in range(clustering.n_clusters):
        print('Cluster {:d} (`{}`):'.format(cluster_index + 1, palette[cluster_index]))
        # Indices of the processed lines assigned to this cluster.
        member_ids = [i for i, label in enumerate(clustering.labels_) if label == cluster_index]
        # Sampling without replacement cannot draw more items than exist.
        sample_size = min(size, len(member_ids))
        if sample_size == 0:
            continue
        for line_id in np.random.choice(member_ids, size=sample_size, replace=False):
            item = processed[line_id]
            # Assumes each processed line is a sequence of tokens (it is
            # passed to `infer_vector` as such); plain strings print as-is.
            shown = item if isinstance(item, str) else ' '.join(map(str, item))
            print('  {}'.format(shown))

In [None]:
# Print up to 5 randomly chosen lines from each of the 4 clusters.
print_sample_sentences(clustering_full_4, size=5)

Try 5 clusters (again with Ward linkage).

In [None]:
# 5 clusters on the full vectors with Ward linkage; no affinity is passed,
# so scikit-learn's default is used.
clustering_full_5 = AgglomerativeClustering(n_clusters=5, linkage='ward')
clustering_full_5.fit(doc_vectors)
plot_clusters(doc_embedded, clustering_full_5.labels_)

In [None]:
# Print up to 5 randomly chosen lines from each of the 5 clusters.
print_sample_sentences(clustering_full_5, size=5)