# bookcave_text

## Set Up

In [None]:
# Counting.
from collections import Counter
# Sorting
import operator
# File I/O.
import os

# Visualization.
import matplotlib.pyplot as plt
# Math.
import numpy as np

# Data.
from sites.bookcave import bookcave

Declare file path constants.

In [None]:
# Directory (relative to the working directory) where figures are saved.
FIGURES_ROOT = os.path.join('..', 'figures')
# Create the directory if it is missing. `exist_ok=True` tolerates an existing
# directory but still raises if a non-directory file occupies the path, so a
# later `savefig` cannot fail for a reason that was silently ignored here.
os.makedirs(FIGURES_ROOT, exist_ok=True)

## Features

Read all of the text files.

In [None]:
# Fetch the 'text' inputs from bookcave using the 'book' text source.
# Of the nine returned values only the inputs, the book IDs and the books
# frame are kept; the rest are discarded.
(inputs, _, _, _, book_ids, books_df, _, _, _) = bookcave.get_data(
    {'text'},
    text_source='book',
    text_min_len=6,
    return_meta=True,
)

### Text

In [None]:
# Text inputs returned by `bookcave.get_data`; the cell output is how many
# texts were loaded.
texts = inputs['text']
len(texts)

See a sample of text.

In [None]:
# Show the first 200 characters of the text at index 42.
texts[42][:200]

What does the file-length distribution look like?

In [None]:
# Length (in characters) of every text, in the same order as `texts`.
text_lengths = np.array(list(map(len, texts)))

In [None]:
# Histogram of all text lengths using 40 bins.
plt.hist(text_lengths, bins=40)
plt.show()

Which are the largest files?

In [None]:
# Indices of the ten longest texts; `argpartition` returns them unordered.
longest_text_length_indices = np.argpartition(text_lengths, -10)[-10:]
# Order those ten indices by ascending text length.
sorted_longest_text_length_indices = longest_text_length_indices[
    np.argsort(text_lengths[longest_text_length_indices])
]
# Pair each book ID with its text length for display.
longest_book_id_lengths = [
    (book_id, length)
    for book_id, length in zip(
        book_ids[sorted_longest_text_length_indices],
        text_lengths[sorted_longest_text_length_indices],
    )
]
longest_book_id_lengths

Get a better histogram of text lengths without absurdly monstrous outliers.

In [None]:
# Keep only lengths below 5,000,000 characters so the extreme outliers do not
# dominate the histogram. A boolean mask filters the array in NumPy itself
# instead of looping over every element in Python.
reasonable_text_lengths = text_lengths[text_lengths < 5000000]
plt.hist(reasonable_text_lengths, bins=40)
plt.show()

Half of the `x` axis of the above histogram is still a long tail. Zoom in to the majority of texts.

In [None]:
# Zoom in on texts shorter than 1,000,000 characters. A boolean mask filters
# the array in NumPy itself instead of looping over every element in Python.
majority_text_lengths = text_lengths[text_lengths < 1000000]
plt.hist(majority_text_lengths, bins=40)
# Save before `show()`, which may clear the current figure.
plt.savefig(os.path.join(FIGURES_ROOT, 'book_majority_text_length'), bbox_inches='tight')
plt.show()

View the books (by ID) with the least text.

In [None]:
# Indices of the thirty shortest texts; `argpartition` returns them unordered.
shortest_text_length_indices = np.argpartition(text_lengths, 30)[:30]
# Order those thirty indices by ascending text length.
sorted_shortest_text_length_indices = shortest_text_length_indices[
    np.argsort(text_lengths[shortest_text_length_indices])
]
# Pair each book ID with its text length for display.
shortest_book_id_lengths = [
    (book_id, length)
    for book_id, length in zip(
        book_ids[sorted_shortest_text_length_indices],
        text_lengths[sorted_shortest_text_length_indices],
    )
]
shortest_book_id_lengths

View the distribution of the number of lines per text file.

In [None]:
# One entry per text: the list of its lines (split on '\n').
# The per-text lists generally differ in length, so they are stored in an
# explicitly allocated 1-D object array. Letting `np.array` infer the shape
# from a ragged nested list raises ValueError on NumPy >= 1.24, and would
# produce a 2-D array if every text happened to have the same line count.
text_lines = np.empty(len(texts), dtype=object)
for text_index, text in enumerate(texts):
    text_lines[text_index] = text.split('\n')

In [None]:
# Number of lines in each text, plotted as a 40-bin histogram.
text_line_lengths = np.array(list(map(len, text_lines)))
plt.hist(text_line_lengths, bins=40)
plt.show()

Zoom in.

In [None]:
# Zoom in on texts with fewer than 10,000 lines. A boolean mask filters the
# array in NumPy itself instead of looping over every element in Python.
majority_text_line_lengths = text_line_lengths[text_line_lengths < 10000]
plt.hist(majority_text_line_lengths, bins=40)
plt.show()

View the distribution of description lengths.

In [None]:
# Values of the 'description' column of `books_df`.
descriptions = books_df['description'].values

In [None]:
# Length of every description, plotted as a 40-bin histogram.
description_lengths = np.array(list(map(len, descriptions)))
plt.hist(description_lengths, bins=40)
plt.show()

In [None]:
# Split each description on '|' (presumably the sentence delimiter used in the
# metadata; TODO confirm) and plot the number of pieces per description.
description_sentences = [text.split('|') for text in descriptions]
description_sentence_lengths = list(map(len, description_sentences))
plt.hist(description_sentence_lengths, bins=20)
plt.show()

View the distribution of title lengths.

In [None]:
# Values of the 'title' column of `books_df`.
titles = books_df['title'].values

In [None]:
# Length of every title, plotted as a 20-bin histogram.
title_lengths = list(map(len, titles))
plt.hist(title_lengths, bins=20)
plt.show()

### Endings

In [None]:
# Count how often each character occurs as the LAST character of a non-empty
# line, across all texts. `Counter` is a `dict` subclass, so existing lookups
# and iteration over `endings` keep working; keys are still inserted in
# first-seen order.
endings = Counter()
for text in texts:
    # Empty lines have no last character and are skipped.
    endings.update(line[-1] for line in text.splitlines() if line)

In [None]:
# (character, count) pairs ordered from most to least frequent; the cell
# output is the number of distinct line-ending characters.
ordered_endings_counts = sorted(
    endings.items(),
    key=operator.itemgetter(1),
    reverse=True,
)
len(ordered_endings_counts)

In [None]:
# Display the (character, count) pairs, most frequent line ending first.
ordered_endings_counts

### Beginnings

In [None]:
# Count how often each character occurs as the FIRST character of a non-empty
# line, across all texts. `Counter` is a `dict` subclass, so existing lookups
# and iteration over `beginnings` keep working; keys are still inserted in
# first-seen order.
beginnings = Counter()
for text in texts:
    # Empty lines have no first character and are skipped.
    beginnings.update(line[0] for line in text.splitlines() if line)

In [None]:
# (character, count) pairs ordered from most to least frequent; the cell
# output is the number of distinct line-beginning characters.
ordered_beginnings_counts = sorted(
    beginnings.items(),
    key=operator.itemgetter(1),
    reverse=True,
)
len(ordered_beginnings_counts)

In [None]:
# Display the (character, count) pairs, most frequent line beginning first.
ordered_beginnings_counts

### Tokens

In [None]:
# Fetch the 'text' inputs again, this time from the 'tokens' text source.
# Of the nine returned values only the inputs, the book IDs and the books
# frame are kept; the rest are discarded.
(token_inputs, _, _, _, token_book_ids, token_books_df, _, _, _) = bookcave.get_data(
    {'text'},
    text_source='tokens',
    return_meta=True,
)

In [None]:
# Tokenized text inputs returned by `bookcave.get_data`; the cell output is
# how many there are.
token_texts = token_inputs['text']
len(token_texts)

In [None]:
# Flatten the three-level nesting of `token_texts` (presumably
# book -> sections -> paragraphs -> tokens; TODO confirm) into one list
# holding the length of every innermost sequence.
paragraph_token_lengths = []
for sections in token_texts:
    for paragraphs in sections:
        paragraph_token_lengths.extend(map(len, paragraphs))

In [None]:
# Histogram of all paragraph token counts using 40 bins.
plt.hist(paragraph_token_lengths, bins=40)
plt.show()

In [None]:
# Zoom in on paragraphs with fewer than 300 tokens.
majority_paragraph_token_lengths = [
    count for count in paragraph_token_lengths if count < 300
]
plt.hist(majority_paragraph_token_lengths, bins=40)
plt.show()

In [None]:
# Paragraphs longer than 120 tokens: the cell output is how many there are
# and what fraction of all paragraphs they make up.
over_120_paragraph_token_lengths = [
    count for count in paragraph_token_lengths if count > 120
]
(
    len(over_120_paragraph_token_lengths),
    len(over_120_paragraph_token_lengths) / len(paragraph_token_lengths),
)