# bookcave_paragraphs

## Set Up

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np

import folders
from sites.bookcave import bookcave

In [None]:
def get_top_indices(a, n):
    """Return the indices of the `n` largest values in `a`.

    The returned indices are ordered by ascending value, so the index of the
    overall maximum comes last.

    Parameters
    ----------
    a : array-like
        Values to rank. Intended for 1-D input; lists are accepted and
        converted with `np.asarray`.
    n : int
        How many indices to return. Values larger than `len(a)` are clamped
        to `len(a)`; values `<= 0` yield an empty result.

    Returns
    -------
    numpy.ndarray
        Integer index array of length `min(n, len(a))` (empty when `n <= 0`).
    """
    a = np.asarray(a)
    n = min(int(n), len(a))
    if n <= 0:
        # Guard explicitly: `[-0:]` is the full slice, so n == 0 would
        # otherwise return every index instead of none.
        return np.empty(0, dtype=np.intp)
    # Partial sort: the last n positions hold the n largest values (unordered).
    top_indices = np.argpartition(a, -n)[-n:]
    # Order just those n candidates by their values.
    return top_indices[np.argsort(a[top_indices])]

def get_bottom_indices(a, n):
    """Return the indices of the `n` smallest values in `a`.

    The returned indices are ordered by ascending value, so the index of the
    overall minimum comes first.

    Parameters
    ----------
    a : array-like
        Values to rank. Intended for 1-D input; lists are accepted and
        converted with `np.asarray`.
    n : int
        How many indices to return. Values larger than `len(a)` are clamped
        to `len(a)`; values `<= 0` yield an empty result.

    Returns
    -------
    numpy.ndarray
        Integer index array of length `min(n, len(a))` (empty when `n <= 0`).
    """
    a = np.asarray(a)
    n = min(int(n), len(a))
    if n <= 0:
        return np.empty(0, dtype=np.intp)
    # Partition around position n - 1 (not n): kth must be a valid index, and
    # kth == len(a) raises when all elements are requested.
    bottom_indices = np.argpartition(a, n - 1)[:n]
    # Order just those n candidates by their values.
    return bottom_indices[np.argsort(a[bottom_indices])]

## Features

Count the paragraphs in each text and the tokens in each paragraph.

In [None]:
# Count the paragraphs in every text and the tokens in every paragraph.
#   text_n_paragraphs[i]       -> number of paragraphs in file fnames[i]
#   text_paragraph_n_tokens[i] -> list of token counts, one per paragraph
text_n_paragraphs = []
text_paragraph_n_tokens = []
# Drop the macOS Finder metadata file up front so that `fnames` stays
# index-aligned with the two per-text lists built below.
fnames = [
    fname
    for fname in os.listdir(folders.AMAZON_KINDLE_PARAGRAPH_TOKENS_PATH)
    if fname != '.DS_Store'
]
for fname in fnames:
    path = os.path.join(folders.AMAZON_KINDLE_PARAGRAPH_TOKENS_PATH, fname)
    n_paragraphs = 0
    paragraph_n_tokens = []
    with open(path, 'r', encoding='utf-8') as fd:
        # Layout as consumed here: a section count, then for each section a
        # paragraph count followed by that many lines of space-separated tokens.
        # Strip only the newline (rather than slicing off the last character)
        # so a final line without a trailing newline is not truncated.
        n_sections = int(fd.readline().rstrip('\n'))
        for _ in range(n_sections):
            section_n_paragraphs = int(fd.readline().rstrip('\n'))
            n_paragraphs += section_n_paragraphs
            for _ in range(section_n_paragraphs):
                tokens = fd.readline().rstrip('\n').split(' ')
                paragraph_n_tokens.append(len(tokens))
    text_n_paragraphs.append(n_paragraphs)
    text_paragraph_n_tokens.append(paragraph_n_tokens)
len(text_n_paragraphs), len(text_paragraph_n_tokens)

In [None]:
# Flatten the per-text token counts into one list with a single entry per
# paragraph across all texts, then show how many paragraphs there are.
all_n_tokens = [
    n_tokens
    for paragraph_n_tokens in text_paragraph_n_tokens
    for n_tokens in paragraph_n_tokens
]
len(all_n_tokens)

### Paragraphs

What does the distribution of the number of paragraphs per text look like?

In [None]:
# Smallest, largest, and mean number of paragraphs per text.
(
    min(text_n_paragraphs),
    max(text_n_paragraphs),
    sum(text_n_paragraphs) / len(text_n_paragraphs),
)

In [None]:
# Histogram (40 bins) of the number of paragraphs per text, written to the
# figures folder and then displayed.
figure_path = os.path.join(folders.FIGURES_PATH, 'text_n_paragraphs')
plt.figure(figsize=(12, 9))
plt.hist(text_n_paragraphs, bins=40)
plt.savefig(figure_path, bbox_inches='tight')
plt.show()

Which books have the most paragraphs?

Get a better histogram of paragraph counts per text by excluding the extreme outliers (texts with more than 8,192 paragraphs).

In [None]:
# Keep only the texts with at most 8192 paragraphs, then report how many
# remain and what fraction of all texts that is.
paragraph_counts = np.array(text_n_paragraphs)
text_n_paragraphs_8192 = paragraph_counts[paragraph_counts <= 8192]
(
    len(text_n_paragraphs_8192),
    len(text_n_paragraphs_8192) / len(text_n_paragraphs),
)

In [None]:
# Labelled histogram (64 bins) of the texts with at most 8192 paragraphs,
# written to the figures folder and then displayed.
figure_path = os.path.join(folders.FIGURES_PATH, 'text_n_paragraphs_8192')
plt.figure(figsize=(12, 6))
plt.hist(text_n_paragraphs_8192, bins=64)
plt.xlabel('Number of paragraphs')
plt.ylabel('Number of texts')
plt.savefig(figure_path, bbox_inches='tight')
plt.show()

Half of the `x` axis of the above histogram is still a long tail. Zoom in to the majority of texts.

In [None]:
# Keep only the texts with at most 4096 paragraphs, then report how many
# remain and what fraction of all texts that is.
paragraph_counts = np.array(text_n_paragraphs)
text_n_paragraphs_4096 = paragraph_counts[paragraph_counts <= 4096]
(
    len(text_n_paragraphs_4096),
    len(text_n_paragraphs_4096) / len(text_n_paragraphs),
)

In [None]:
# Histogram (64 bins) of the texts with at most 4096 paragraphs.
# Saving this figure is currently disabled:
# plt.savefig(os.path.join(folders.FIGURES_PATH, 'book_majority_paragraph_length'), bbox_inches='tight')
plt.hist(text_n_paragraphs_4096, bins=64)
plt.show()

Try to find a reasonable range for the number of paragraphs per text.

In [None]:
# Keep only the texts whose paragraph count lies in [256, 4096] (inclusive),
# then report how many remain and what fraction of all texts that is.
paragraph_counts = np.array(text_n_paragraphs)
in_range = (paragraph_counts >= 256) & (paragraph_counts <= 4096)
text_n_paragraphs_256_4096 = paragraph_counts[in_range]
(
    len(text_n_paragraphs_256_4096),
    len(text_n_paragraphs_256_4096) / len(text_n_paragraphs),
)

In [None]:
# Histogram (64 bins) of the texts with between 256 and 4096 paragraphs.
plt.hist(text_n_paragraphs_256_4096, bins=64)
plt.show()

### Tokens

In [None]:
# Histogram (64 bins) of the token count of every paragraph in every text.
plt.figure(figsize=(12, 6))
plt.hist(all_n_tokens, bins=64)
plt.show()

Zoom in.

In [None]:
# Keep only the paragraphs with at most 1024 tokens, then report how many
# remain and what fraction of all paragraphs that is.
all_n_tokens_1024 = [
    n_tokens for n_tokens in all_n_tokens if n_tokens <= 1024
]
(
    len(all_n_tokens_1024),
    len(all_n_tokens_1024) / len(all_n_tokens),
)

In [None]:
# Histogram (64 bins) of the paragraphs with at most 1024 tokens.
plt.hist(all_n_tokens_1024, bins=64)
plt.show()

Zoom in again.

In [None]:
# Keep only the paragraphs with at most 192 tokens, then report how many
# remain and what fraction of all paragraphs that is.
all_n_tokens_192 = [
    n_tokens for n_tokens in all_n_tokens if n_tokens <= 192
]
(
    len(all_n_tokens_192),
    len(all_n_tokens_192) / len(all_n_tokens),
)

In [None]:
# Labelled histogram (64 bins) of the paragraphs with at most 192 tokens,
# written to the figures folder and then displayed.
plt.figure(figsize=(12, 6))
plt.hist(all_n_tokens_192, bins=64)
plt.xlabel('Number of tokens')
plt.ylabel('Number of paragraphs')
# Use a tight bounding box, matching the other figures this notebook saves,
# so the exported image has consistent margins.
plt.savefig(os.path.join(folders.FIGURES_PATH, 'all_n_tokens_192'), bbox_inches='tight')
plt.show()

In [None]:
# Keep only the paragraphs with at most 128 tokens, then report how many
# remain and what fraction of all paragraphs that is.
all_n_tokens_128 = [
    n_tokens for n_tokens in all_n_tokens if n_tokens <= 128
]
(
    len(all_n_tokens_128),
    len(all_n_tokens_128) / len(all_n_tokens),
)

Find a reasonable range.

In [None]:
# Probe several candidate upper bounds (64, 128, 140, 160) on paragraph size.
# NOTE(review): neither `print_between` nor `token_lengths` is defined in any
# cell visible in this notebook; unless they are defined elsewhere, this cell
# raises NameError. `token_lengths` presumably corresponds to `all_n_tokens`
# -- TODO confirm and restore the missing helper.
print_between(token_lengths, high=64)
print_between(token_lengths, high=128)
print_between(token_lengths, high=140)
print_between(token_lengths, high=160)

In [None]:
# Probe very small `high` bounds (2 through 6).
# NOTE(review): neither `print_between` nor `token_lengths` is defined in any
# cell visible in this notebook; unless they are defined elsewhere, this cell
# raises NameError. `token_lengths` presumably corresponds to `all_n_tokens`
# -- TODO confirm and restore the missing helper.
print_between(token_lengths, high=2)
print_between(token_lengths, high=3)
print_between(token_lengths, high=4)
print_between(token_lengths, high=5)
print_between(token_lengths, high=6)

In [None]:
# Probe the candidate range low=6, high=128.
# NOTE(review): neither `print_between` nor `token_lengths` is defined in any
# cell visible in this notebook; unless they are defined elsewhere, this cell
# raises NameError. `token_lengths` presumably corresponds to `all_n_tokens`
# -- TODO confirm and restore the missing helper.
print_between(token_lengths, low=6, high=128)