# bookcave_text

## Set Up

In [None]:
# Math.
import numpy as np
# Visualization.
import matplotlib.pyplot as plt
# File I/O.
import os

# Data.
import bookcave

Declare file path constants.

In [None]:
# Directory, relative to the current working directory, where figures are saved.
FIGURES_ROOT = os.path.join('..', 'figures')
# Create the directory if it is missing. `exist_ok=True` makes re-running this
# cell a no-op when the directory is already there, while still raising if the
# path exists but is not a directory (which would break the later `savefig`).
os.makedirs(FIGURES_ROOT, exist_ok=True)

Read all of the text files.

In [None]:
# Request only the 'text' input and ask for metadata as well. Of the eleven
# values returned, only the first (`inputs`) and the fifth (`book_ids`) are
# kept; the rest are discarded into `_`.
# NOTE(review): the meaning of the discarded values is defined by
# `bookcave.get_data` and is not visible here.
inputs, _, _, _, book_ids, _, _, _, _, _, _ = bookcave.get_data({'text'}, return_meta=True)

## Features

### Text

In [None]:
# Pull the text entries out of the loaded inputs.
texts = inputs['text']
# Display how many texts were loaded.
len(texts)

See a sample of text.

In [None]:
# Display the first 200 elements of the text at index 42.
# NOTE(review): index 42 looks like an arbitrary sample; this raises
# IndexError if fewer than 43 texts were loaded.
texts[42][:200]

What does the distribution of text lengths (in characters) look like?

In [None]:
# Length of every text, in the same order as `texts`.
text_lengths = np.array(list(map(len, texts)))

In [None]:
# Histogram of all text lengths, split into 40 bins.
plt.hist(text_lengths, bins=40)
plt.show()

Which are the ten longest texts?

In [None]:
# Indices of the ten longest texts. `argpartition` only guarantees that these
# ten are the largest; it does not order them.
longest_text_length_indices = np.argpartition(text_lengths, -10)[-10:]
# Order those ten indices from the shortest to the longest text.
_longest_lengths = text_lengths[longest_text_length_indices]
_longest_order = np.argsort(_longest_lengths)
sorted_longest_text_length_indices = longest_text_length_indices[_longest_order]
# Pair every selected book ID with the length of its text.
longest_book_id_lengths = [
    (book_id, length)
    for book_id, length in zip(book_ids[sorted_longest_text_length_indices],
                               text_lengths[sorted_longest_text_length_indices])
]
longest_book_id_lengths

Get a better histogram of text lengths without absurdly monstrous outliers.

In [None]:
# Keep only the lengths below 5,000,000 so the extreme outliers do not
# dominate the histogram. A boolean mask filters the array in one vectorized
# step and keeps the integer dtype even when nothing passes the filter.
reasonable_text_lengths = text_lengths[text_lengths < 5000000]
plt.hist(reasonable_text_lengths, bins=40)
plt.show()

Half of the `x` axis of the above histogram is still a long tail. Zoom in to the majority of texts.

In [None]:
# Keep only the lengths below 1,000,000. A boolean mask filters the array in
# one vectorized step instead of looping over it in Python.
majority_text_lengths = text_lengths[text_lengths < 1000000]
plt.hist(majority_text_lengths, bins=40)
# Save the figure before `plt.show()`; the file is written under FIGURES_ROOT.
plt.savefig(os.path.join(FIGURES_ROOT, 'book_majority_text_length'), bbox_inches='tight')
plt.show()

View the IDs of the 30 books with the least text.

In [None]:
# Indices of the thirty shortest texts. `argpartition` only guarantees that
# these thirty are the smallest; it does not order them.
shortest_text_length_indices = np.argpartition(text_lengths, 30)[:30]
# Order those thirty indices from the shortest to the longest text.
_shortest_lengths = text_lengths[shortest_text_length_indices]
_shortest_order = np.argsort(_shortest_lengths)
sorted_shortest_text_length_indices = shortest_text_length_indices[_shortest_order]
# Pair every selected book ID with the length of its text.
shortest_book_id_lengths = [
    (book_id, length)
    for book_id, length in zip(book_ids[sorted_shortest_text_length_indices],
                               text_lengths[sorted_shortest_text_length_indices])
]
shortest_book_id_lengths

View the distribution of the number of lines per text file.

In [None]:
# Split every text into its lines. The per-text line lists generally differ in
# length, so they are stored in a 1-D object array, one list per text. Passing
# such a ragged nested list straight to `np.array` raises ValueError on
# NumPy >= 1.24 and only emitted a deprecation warning before that.
text_lines = np.empty(len(texts), dtype=object)
for _text_index, _text in enumerate(texts):
    # Element-wise assignment stores the list itself; it is never broadcast.
    text_lines[_text_index] = _text.split('\n')

In [None]:
# Number of lines in every text, in the same order as `text_lines`.
text_line_lengths = np.array(list(map(len, text_lines)))
# Histogram of the line counts, split into 40 bins.
plt.hist(text_line_lengths, bins=40)
plt.show()

Zoom in to the texts with fewer than 10,000 lines.

In [None]:
# Keep only the texts with fewer than 10,000 lines. A boolean mask filters the
# array in one vectorized step instead of looping over it in Python.
majority_text_line_lengths = text_line_lengths[text_line_lengths < 10000]
plt.hist(majority_text_line_lengths, bins=40)
plt.show()