# bookcave_paragraphs

## Set Up

In [None]:
import os
import operator

import matplotlib.pyplot as plt
import numpy as np

import folders
from sites.bookcave import bookcave

In [None]:
def get_top_indices(a, n):
    """Return the indices of the `n` largest values of `a`, ordered by ascending value.

    :param a: One-dimensional array-like of comparable values.
    :param n: Number of indices to return; clamped to the range `[0, len(a)]`.
    :return: Integer array of at most `n` indices into `a`; the last entry indexes the maximum.
    """
    a = np.asarray(a)
    n = min(n, len(a))
    if n <= 0:
        # `[-0:]` would select every element, so the empty case is handled explicitly.
        return np.empty(0, dtype=np.intp)
    # Partition so that the `n` largest values occupy the last `n` slots (in arbitrary order),
    # then sort only those `n` candidates.
    top_indices = np.argpartition(a, -n)[-n:]
    return top_indices[np.argsort(a[top_indices])]

def get_bottom_indices(a, n):
    """Return the indices of the `n` smallest values of `a`, ordered by ascending value.

    :param a: One-dimensional array-like of comparable values.
    :param n: Number of indices to return; clamped to the range `[0, len(a)]`.
    :return: Integer array of at most `n` indices into `a`; the first entry indexes the minimum.
    """
    a = np.asarray(a)
    n = min(n, len(a))
    if n <= 0:
        return np.empty(0, dtype=np.intp)
    # Partition around position `n - 1` rather than `n`: the first `n` slots then hold the
    # `n` smallest values, and the call stays in bounds when `n == len(a)`.
    bottom_indices = np.argpartition(a, n - 1)[:n]
    return bottom_indices[np.argsort(a[bottom_indices])]

## Features

Read all of the paragraph files.

In [None]:
# Request both source types along with the metadata; of the nine returned values only
# the inputs, the book IDs, and the books data frame are kept.
inputs, _, _, _, book_ids, books_df, _, _, _ = bookcave.get_data(
    {'paragraphs', 'paragraph_tokens'},
    return_meta=True,
)
paragraph_inputs, token_inputs = inputs['paragraphs'], inputs['paragraph_tokens']

### Paragraphs

In [None]:
# Each element of `paragraph_inputs` is a triple; transpose them into three parallel tuples
# (paragraphs, section IDs, and sections).
text_paragraphs, text_section_ids, text_sections = zip(*paragraph_inputs)
# Number of texts that were loaded.
len(text_paragraphs)

See a sample of text.

In [None]:
# Preview up to five leading paragraphs of one text, each paired with the section that its
# section ID points to. Slicing (instead of indexing `range(5)`) keeps this from raising
# when the text has fewer than five paragraphs.
sample_text_i = 42
sample_section_ids = text_section_ids[sample_text_i][:5]
sample_paragraphs = text_paragraphs[sample_text_i][:5]
[(text_sections[sample_text_i][section_id], paragraph)
 for section_id, paragraph in zip(sample_section_ids, sample_paragraphs)]

What does the distribution of the number of paragraphs per book look like?

In [None]:
# Number of paragraphs in each text (one entry per element of `text_paragraphs`).
paragraph_lengths = np.array([len(paragraphs) for paragraphs in text_paragraphs])
# Minimum, maximum, and mean, computed with the array's own reductions rather than the
# Python builtins, which would iterate the array element by element.
paragraph_lengths.min(), paragraph_lengths.max(), paragraph_lengths.mean()

In [None]:
# Histogram of paragraph counts over all texts, using 40 bins.
plt.hist(paragraph_lengths, bins=40)
plt.show()

Which books have the most paragraphs?

In [None]:
# The ten texts with the most paragraphs, printed as "<count>: <book id>" in ascending order.
top_paragraph_length_indices = get_top_indices(paragraph_lengths, 10)
for index in top_paragraph_length_indices:
    row = (paragraph_lengths[index], book_ids[index])
    print('{:d}: {}'.format(*row))

Get a better histogram of paragraph lengths without absurdly monstrous outliers.

In [None]:
# Keep only the counts below 10,000 paragraphs, selected with a boolean mask.
paragraph_lengths_2 = paragraph_lengths[paragraph_lengths < 10000]
# How many texts remain, and what fraction of all texts that is.
len(paragraph_lengths_2), len(paragraph_lengths_2)/len(paragraph_lengths)

In [None]:
# Histogram of the counts below 10,000 paragraphs, using 40 bins.
plt.hist(paragraph_lengths_2, bins=40)
plt.show()

Half of the `x` axis of the above histogram is still a long tail. Zoom in to the majority of texts.

In [None]:
# Keep only the counts below 4,000 paragraphs, selected with a boolean mask.
paragraph_lengths_3 = paragraph_lengths[paragraph_lengths < 4000]
# How many texts remain, and what fraction of all texts that is.
len(paragraph_lengths_3), len(paragraph_lengths_3)/len(paragraph_lengths)

In [None]:
# Histogram of the counts below 4,000 paragraphs, using 40 bins.
plt.hist(paragraph_lengths_3, bins=40)
# Uncomment the next line to export the figure before it is shown.
# plt.savefig(os.path.join(folders.FIGURES_PATH, 'book_majority_paragraph_length'), bbox_inches='tight')
plt.show()

Try to find a reasonable range of paragraphs.

In [None]:
# Keep only the counts within the inclusive range [250, 7500], selected with a boolean mask.
in_range = (paragraph_lengths >= 250) & (paragraph_lengths <= 7500)
paragraph_lengths_4 = paragraph_lengths[in_range]
# How many texts remain, and what fraction of all texts that is.
len(paragraph_lengths_4), len(paragraph_lengths_4)/len(paragraph_lengths)

In [None]:
# Histogram of the counts within [250, 7500] paragraphs, using 40 bins.
plt.hist(paragraph_lengths_4, bins=40)
plt.show()

If we had to specify a fixed number of paragraphs `n` to pass to a model, what would `n` be?

In [None]:
def print_between(values, low=None, high=None):
    """Print how many of `values` fall within the inclusive range `[low, high]`.

    The printed line reports the bounds, the number of matching values, and that number
    as a fraction of `len(values)` (labelled `percentile` in the output).

    :param values: Sized iterable of comparable values.
    :param low: Inclusive lower bound, or `None` for no lower bound.
    :param high: Inclusive upper bound, or `None` for no upper bound.
    """
    count = sum(1 for value in values
                if (low is None or value >= low) and (high is None or value <= high))
    total = len(values)
    # An empty input has no meaningful fraction; report 0 instead of dividing by zero.
    fraction = count/total if total else 0.0
    print('low={}; high={}; len={:d}; percentile={:.4f}'.format(low, high, count, fraction))

In [None]:
# For each candidate cap, report how many of the in-range texts have at most that many paragraphs.
for max_paragraphs in (512, 1024, 2048, 4096):
    print_between(paragraph_lengths_4, high=max_paragraphs)

View the IDs of the books with the fewest paragraphs.

In [None]:
# The 512 texts with the fewest paragraphs, printed as "<count>: <book id>" in ascending order.
bottom_paragraph_length_indices = get_bottom_indices(paragraph_lengths, 512)
for index in bottom_paragraph_length_indices:
    row = (paragraph_lengths[index], book_ids[index])
    print('{:>3d}: {}'.format(*row))

In [None]:
# Keep only the counts below 1,000 paragraphs, selected with a boolean mask.
paragraph_lengths_6 = paragraph_lengths[paragraph_lengths < 1000]
# How many texts remain, and what fraction of all texts that is.
len(paragraph_lengths_6), len(paragraph_lengths_6)/len(paragraph_lengths)

In [None]:
# Histogram of the counts below 1,000 paragraphs, using 40 bins.
plt.hist(paragraph_lengths_6, bins=40)
plt.show()

### Tokens

In [None]:
# Each element of `token_inputs` is a pair; transpose them and keep only the first field
# (the tokenized paragraphs of each text).
text_tokens, _ = zip(*token_inputs)
# Number of texts with token data.
len(text_tokens)

In [None]:
# Preview the token lists of the first five paragraphs of the text at index 42.
text_tokens[42][:5]

In [None]:
# Flatten every paragraph of every text into two parallel sequences:
# `token_locations[k]` is the (text index, paragraph index) of the k-th paragraph and
# `token_lengths[k]` is its number of tokens.
token_locations = []
token_lengths = []
for text_i, paragraphs_tokens in enumerate(text_tokens):
    for paragraph_i, tokens in enumerate(paragraphs_tokens):
        token_locations.append((text_i, paragraph_i))
        token_lengths.append(len(tokens))
token_lengths = np.array(token_lengths)
# Paragraph count, then minimum, maximum, and mean tokens per paragraph. The array's own
# reductions are used because the Python builtins would iterate this array (one entry
# per paragraph) element by element.
len(token_lengths), token_lengths.min(), token_lengths.max(), token_lengths.mean()

In [None]:
# Histogram of tokens per paragraph over all paragraphs, using 40 bins.
plt.hist(token_lengths, bins=40)
plt.show()

In [None]:
# The 32 longest paragraphs, printed as "<token count>: <book id> (<paragraph index>)"
# in ascending order of token count.
top_token_length_indices = get_top_indices(token_lengths, 32)
for index in top_token_length_indices:
    token_location = token_locations[index]
    text_i, paragraph_i = token_location
    row = (token_lengths[index], book_ids[text_i], paragraph_i)
    print('{:d}: {} ({:d})'.format(*row))

Zoom in.

In [None]:
# Keep only the paragraphs with fewer than 2,500 tokens (boolean mask, materialized as a list).
token_lengths_2 = list(token_lengths[token_lengths < 2500])
# How many paragraphs remain, and what fraction of all paragraphs that is.
len(token_lengths_2), len(token_lengths_2)/len(token_lengths)

In [None]:
# Histogram of the paragraphs with fewer than 2,500 tokens, using 40 bins.
plt.hist(token_lengths_2, bins=40)
plt.show()

Zoom in again.

In [None]:
# Keep only the paragraphs with fewer than 1,024 tokens (boolean mask, materialized as a list).
token_lengths_3 = list(token_lengths[token_lengths < 1024])
# How many paragraphs remain, and what fraction of all paragraphs that is.
len(token_lengths_3), len(token_lengths_3)/len(token_lengths)

In [None]:
# Histogram of the paragraphs with fewer than 1,024 tokens, using 40 bins.
plt.hist(token_lengths_3, bins=40)
plt.show()

Zoom in again again.

In [None]:
# Keep only the paragraphs with fewer than 512 tokens (boolean mask, materialized as a list).
token_lengths_4 = list(token_lengths[token_lengths < 512])
# How many paragraphs remain, and what fraction of all paragraphs that is.
len(token_lengths_4), len(token_lengths_4)/len(token_lengths)

In [None]:
# Histogram of the paragraphs with fewer than 512 tokens, using 40 bins.
plt.hist(token_lengths_4, bins=40)
plt.show()

Zoom in again again again.

In [None]:
# Keep only the paragraphs with fewer than 256 tokens (boolean mask, materialized as a list).
token_lengths_5 = list(token_lengths[token_lengths < 256])
# How many paragraphs remain, and what fraction of all paragraphs that is.
len(token_lengths_5), len(token_lengths_5)/len(token_lengths)

In [None]:
# Histogram of the paragraphs with fewer than 256 tokens, using 40 bins.
plt.hist(token_lengths_5, bins=40)
plt.show()

Find a reasonable range.

In [None]:
# For each candidate cap, report how many paragraphs have at most that many tokens.
for max_tokens in (64, 128, 140, 160):
    print_between(token_lengths, high=max_tokens)

In [None]:
# Report how many paragraphs are very short: at most 2, 3, 4, 5, and 6 tokens.
for max_tokens in range(2, 7):
    print_between(token_lengths, high=max_tokens)

In [None]:
# Report how many paragraphs have between 6 and 128 tokens, both bounds inclusive.
print_between(token_lengths, low=6, high=128)