# bookcave_paragraphs

## Set Up

In [None]:
# Math.
import numpy as np
# Visualization.
import matplotlib.pyplot as plt
# File I/O.
import os
# Sorting
import operator

# Data.
from sites.bookcave import bookcave

Declare file path constants.

In [None]:
# Directory, one level above the working directory, where figures are saved.
FIGURES_ROOT = os.path.join('..', 'figures')
# `exist_ok=True` makes re-running this cell a no-op when the directory is
# already present, while still raising if the path exists but is not a
# directory (which would otherwise only surface later, at save time).
os.makedirs(FIGURES_ROOT, exist_ok=True)

## Features

Read all of the paragraph files.

In [None]:
# Load the data using the 'paragraphs' text source. Nine values are unpacked
# from `get_data`; only the inputs, the book IDs, and `books_df` are kept.
inputs, _, _, _, book_ids, books_df, _, _, _ = bookcave.get_data(
    {'text'},
    text_source='paragraphs',
    return_meta=True,
)

### Text

In [None]:
# Entries stored under the 'text' key of the loaded inputs.
texts = inputs['text']
# Number of entries loaded.
len(texts)

In [None]:
# Every entry of `texts` is read at positions 0, 1, and 2; pull each position
# out into its own list, parallel to `texts`.
text_paragraphs = list(map(operator.itemgetter(0), texts))
text_section_ids = list(map(operator.itemgetter(1), texts))
text_sections = list(map(operator.itemgetter(2), texts))

See a sample of text.

In [None]:
# Inspect the entry at index 42: pair each of its first five paragraphs with
# the section looked up through that paragraph's section ID.
sample_index = 42
sample_sections = text_sections[sample_index]
sample_section_ids = text_section_ids[sample_index]
sample_paragraphs = text_paragraphs[sample_index]
[(sample_sections[sample_section_ids[i]], sample_paragraphs[i]) for i in range(5)]

What does the distribution of paragraph counts per book look like?

In [None]:
# One value per entry of `text_paragraphs`: how many paragraphs it holds.
paragraph_lengths = np.array(list(map(len, text_paragraphs)))
# How many entries were measured.
len(paragraph_lengths)

In [None]:
# Histogram of the paragraph counts, split into 40 bins.
plt.hist(paragraph_lengths, bins=40)
plt.show()

Which books have the most paragraphs?

In [None]:
# Positions of the 10 largest paragraph counts, in no particular order.
longest_length_indices = np.argpartition(paragraph_lengths, -10)[-10:]
# Order just those 10 positions by ascending paragraph count.
longest_ascending_order = np.argsort(paragraph_lengths[longest_length_indices])
sorted_longest_length_indices = longest_length_indices[longest_ascending_order]
# Pair each selected book ID with its paragraph count.
longest_book_ids = book_ids[sorted_longest_length_indices]
longest_lengths = paragraph_lengths[sorted_longest_length_indices]
longest_book_id_lengths = list(zip(longest_book_ids, longest_lengths))
longest_book_id_lengths

Get a better histogram of per-book paragraph counts without the absurdly monstrous outliers.

In [None]:
# Keep only the counts below 10,000 so the extreme outliers do not dominate
# the plot.
reasonable_paragraph_lengths = paragraph_lengths[paragraph_lengths < 10000]
plt.hist(reasonable_paragraph_lengths, bins=40)
plt.show()

Half of the `x` axis of the above histogram is still a long tail. Zoom in to the majority of texts.

In [None]:
# Keep only the counts below 4,000, plot them, and save the figure before
# showing it.
majority_paragraph_lengths = paragraph_lengths[paragraph_lengths < 4000]
plt.hist(majority_paragraph_lengths, bins=40)
majority_figure_path = os.path.join(FIGURES_ROOT, 'book_majority_paragraph_length')
plt.savefig(majority_figure_path, bbox_inches='tight')
plt.show()

View the IDs of the books with the fewest paragraphs.

In [None]:
# Positions of the 30 smallest paragraph counts, in no particular order.
shortest_length_indices = np.argpartition(paragraph_lengths, 30)[:30]
# Order just those 30 positions by ascending paragraph count.
shortest_ascending_order = np.argsort(paragraph_lengths[shortest_length_indices])
sorted_shortest_length_indices = shortest_length_indices[shortest_ascending_order]
# Pair each selected book ID with its paragraph count.
shortest_book_ids = book_ids[sorted_shortest_length_indices]
shortest_lengths = paragraph_lengths[sorted_shortest_length_indices]
shortest_book_id_lengths = list(zip(shortest_book_ids, shortest_lengths))
shortest_book_id_lengths

### Tokens

In [None]:
# Load the data again, this time using the 'tokens' text source. Nine values
# are unpacked; only the inputs, the book IDs, and `token_books_df` are kept.
token_inputs, _, _, _, token_book_ids, token_books_df, _, _, _ = bookcave.get_data(
    {'text'},
    text_source='tokens',
    return_meta=True,
)

In [None]:
token_texts = token_inputs['text']
text_tokens = [text[0] for text in token_texts]
len(text_tokens)

In [None]:
# Peek at the first five items of the entry at index 42.
text_tokens[42][:5]

In [None]:
# Flatten across all entries: one length per inner token sequence.
token_lengths = []
for paragraphs_tokens in text_tokens:
    token_lengths.extend(map(len, paragraphs_tokens))
len(token_lengths)

In [None]:
# Histogram of every collected token length, split into 40 bins.
plt.hist(token_lengths, bins=40)
plt.show()

Zoom in to lengths under 300 tokens.

In [None]:
# Restrict the histogram to lengths strictly below 300.
majority_token_lengths = [count for count in token_lengths if count < 300]
plt.hist(majority_token_lengths, bins=40)
plt.show()

In [None]:
def get_over(n, lengths=None):
    """Return the lengths that are strictly greater than `n`.

    Parameters:
        n: Threshold; values equal to `n` are excluded.
        lengths: Optional iterable of values to filter. When omitted, the
            notebook-level `token_lengths` list is used, so existing
            `get_over(n)` calls behave exactly as before.

    Returns:
        A new list of the qualifying values, in their original order.
    """
    if lengths is None:
        # Fall back to the global collected earlier in the notebook.
        lengths = token_lengths
    return [length for length in lengths if length > n]

In [None]:
# Lengths exceeding each candidate cutoff.
over_120, over_140, over_160 = (get_over(cutoff) for cutoff in (120, 140, 160))
# For each cutoff: (how many lengths exceed it, that count as a fraction of
# all collected lengths).
total_token_lengths = len(token_lengths)
[(len(over), len(over) / total_token_lengths) for over in (over_120, over_140, over_160)]