# BookCave

[General FAQ](https://mybookcave.com/reader-faq/)

[BookCave rating system](https://mybookcave.com/reader-faq/what-makes-a-book-earn-a-particular-content-rating/)

## Setup

In [None]:
# Data.
import bookcave
# Math.
import numpy as np
# File I/O.
import os
# Visualization.
import matplotlib.pyplot as plt

Declare file path constants.

In [None]:
# Directory, relative to the working directory, where generated figures are saved.
FIGURES_ROOT = os.path.join('..', 'figures')
# Create it when missing. An existing directory is fine, but a non-directory
# occupying this path is reported instead of being silently ignored.
os.makedirs(FIGURES_ROOT, exist_ok=True)

Read all of the data and metadata from the BookCave database.

In [None]:
# Unpack everything returned by get_data(return_meta=True): the texts, the
# labels (y), the category and level names, and the metadata frames.
(
    texts,
    y,
    categories,
    levels,
    all_books_df,
    rated_books_df,
    books_df,
    ratings_df,
    levels_df,
    categories_df,
) = bookcave.get_data(return_meta=True)

Index book IDs to use later.

In [None]:
# Book IDs in the row order of books_df. Reading the 'id' column directly
# avoids materializing one Series per row, which iterrows() would do.
book_ids = np.array(list(books_df['id']))
# Reverse lookup: book ID -> position in book_ids.
book_id_to_index = {book_id: i for i, book_id in enumerate(book_ids)}

Define helpers for plotting and printing label distributions.

In [None]:
def autolabel(ax, bars, text_color):
    """Write each bar's height, truncated to an integer, as a text label.

    The tallest bar(s) get their label just inside the top of the bar, drawn
    in ``text_color``. Every other bar gets its label just above the bar, in
    black. The vertical offset is 5% of the tallest bar's height.

    Args:
        ax: Axes-like object providing ``text(x, y, s, **kwargs)``.
        bars: Sequence of bar-like objects providing ``get_x()``,
            ``get_width()`` and ``get_height()``.
        text_color: Color of labels drawn inside the tallest bar(s).
    """
    heights = [bar.get_height() for bar in bars]
    if not heights:
        # Nothing to label; max() would raise on an empty sequence.
        return
    max_height = max(heights)
    offset = .05 * max_height
    for bar, height in zip(bars, heights):
        is_max = height == max_height
        ax.text(bar.get_x() + bar.get_width() / 2.,
                height - offset if is_max else height + offset,
                '{}'.format(int(height)),
                ha='center',
                va='top' if is_max else 'bottom',
                color=text_color if is_max else 'k')


def plot_label_distribution(x, labels, title, ylabel, color, text_color, save=None, size=None):
    """Draw a bar chart of how often each label index occurs.

    Args:
        x: Sequence of class names; ``x[i]`` names label index ``i``.
        labels: Sequence of non-negative integer label indices.
        title: Title of the plot.
        ylabel: Label of the y-axis.
        color: Bar color, or a sequence with one color per bar.
        text_color: Color of the count written inside the tallest bar(s).
        save: Optional path; when given, the figure is written there.
        size: Optional ``(width, height)`` of the figure, in inches.
    """
    # minlength keeps one bar per class even when the highest-indexed classes
    # never occur in `labels`; otherwise the heights would not match `x`.
    bincount = np.bincount(labels, minlength=len(x))
    fig, ax = plt.subplots()
    if size:
        fig.set_size_inches(*size)
    # Place bars at explicit integer positions and fix the ticks there before
    # naming them, so every tick label is guaranteed to sit under its bar.
    positions = np.arange(len(x))
    bars = ax.bar(positions, bincount, color=color)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xticks(positions)
    ax.set_xticklabels(x, rotation=45, ha='right')
    autolabel(ax, bars, text_color)
    if save:
        fig.savefig(save, bbox_inches='tight')
    plt.show()


def print_label_distribution(x, labels):
    """Print the share of ``labels`` falling into each class, one line per class.

    Each line shows the class name in backticks, right-aligned to the longest
    name, followed by the percentage with three decimals, right-aligned.

    Args:
        x: Sequence of class-name strings; ``x[i]`` names label index ``i``.
        labels: Sequence of non-negative integer label indices.
    """
    longest_name = max((len(name) for name in x), default=0)
    # minlength guarantees a count (possibly zero) for every class in `x`,
    # even when the highest-indexed classes never occur in `labels`.
    bincount = np.bincount(labels, minlength=len(x))
    total = len(labels)
    for name, count in zip(x, bincount):
        padding = ' ' * (longest_name - len(name))
        # With no labels at all, report 0% rather than dividing by zero.
        share = count / total if total else 0.
        # Width 7 right-aligns values below 10% with those of 10% and above.
        print('{}`{}`: {:>7.3%}'.format(padding, name, share))

Define the rating names and a helper that plots and prints the distribution of overall (average) ratings for books.

In [None]:
# Overall rating names; a rating's position in this list is its label index.
ratings = [
    'All Ages',
    'Mild',
    'Mild+',
    'Moderate',
    'Moderate+',
    'Adult',
    'Adult+',
]
# Rating name -> position in `ratings`.
rating_to_index = dict(zip(ratings, range(len(ratings))))
# One bar color per rating, index-aligned with `ratings`.
rating_colors = [
    'royalblue',
    'mediumseagreen',
    'darkseagreen',
    'yellowgreen',
    'yellow',
    'orange',
    'orangered',
]
# rating_text_colors = ['w', 'w', 'k', 'k', 'k', 'k', 'w']


def plot_and_print_average_ratings_distribution(books_df):
    """Plot, then print, how the books are spread over the overall ratings.

    Args:
        books_df: Frame with a 'community_average_rating' column whose values
            are rating names present in `rating_to_index`.
    """
    # Convert each rating name to its integer label.
    labels = [rating_to_index[name] for name in books_df['community_average_rating']]
    plot_label_distribution(
        ratings,
        labels,
        'Average Ratings',
        '# of books',
        rating_colors,
        'k',
    )
    print_label_distribution(ratings, labels)

### Books

Count the total number of books in the database.

In [None]:
# Number of rows in all_books_df.
len(all_books_df)

Count the number of books which have at least one rating.

In [None]:
# Number of rows in rated_books_df.
len(rated_books_df)

See some samples of book data.

In [None]:
# Show the first three rows.
rated_books_df.head(3)

Count the non-null values in each column; the vendor URL columns show how many rated books each vendor offers.

In [None]:
# Number of non-null entries in each column of rated_books_df.
rated_books_df.count()

Calculate the percentage of rated books available through Amazon Kindle.

In [None]:
# Boolean mask of the rows whose Kindle URL is present.
has_kindle_url = rated_books_df['amazon_kindle_url'].notnull()
num_kindle = len(rated_books_df[has_kindle_url])
num_rated = len(rated_books_df)
# Last expression of the cell, so the formatted sentence is displayed.
'{:.3%} of rated books ({:d}/{:d}) are available on Amazon Kindle.'.format(
    num_kindle / num_rated, num_kindle, num_rated)

Plot distribution of overall (average) ratings for rated books.

In [None]:
# Uses the 'community_average_rating' column of rated_books_df.
plot_and_print_average_ratings_distribution(rated_books_df)

Analyze distribution of genres.

In [None]:
# Raw 'genres' strings, skipping books without any genre.
book_genre_values = books_df['genres'].dropna().values
# Per book: the list of its genres ('|'-separated in the raw string).
book_genres = []
# Per book: each genre split further into its '/'-separated path components.
book_genre_hierarchies = []
for raw_genres in book_genre_values:
    genre_names = raw_genres.split('|')
    book_genres.append(genre_names)
    book_genre_hierarchies.append([genre_name.split('/') for genre_name in genre_names])

In [None]:
# Top-level genre names; a name's position in this list is its label index.
root_genres = [
    'Juvenile Fiction',
    'Young Adult Fiction',
    'Fiction',
    'Juvenile Nonfiction',
    'Young Adult Nonfiction',
    'Nonfiction',
]
root_genre_to_index = dict(zip(root_genres, range(len(root_genres))))
# One label per genre entry of every book, taken from the first path component.
root_genre_y = [
    root_genre_to_index[hierarchy[0]]
    for book_genre_hierarchy in book_genre_hierarchies
    for hierarchy in book_genre_hierarchy
]

In [None]:
# Bar chart of how often each top-level genre occurs.
plot_label_distribution(
    x=root_genres,
    labels=root_genre_y,
    title='Genres',
    ylabel='# of books',
    color='olive',
    text_color='w',
)

Plot distribution of all genres under `Fiction`.

In [None]:
# Second path component of every genre entry rooted at 'Fiction';
# '_' stands in for entries that have no sub-genre.
fiction_genre_values = [
    hierarchy[1] if len(hierarchy) > 1 else '_'
    for book_genre_hierarchy in book_genre_hierarchies
    for hierarchy in book_genre_hierarchy
    if hierarchy[0] == 'Fiction'
]
fiction_genres_set = set(fiction_genre_values)
# Alphabetical order fixes the label index of each sub-genre.
fiction_genres = sorted(fiction_genres_set)
fiction_genre_to_index = {genre: position for position, genre in enumerate(fiction_genres)}
fiction_genre_y = [fiction_genre_to_index[sub_genre] for sub_genre in fiction_genre_values]

In [None]:
# Wider figure so the many sub-genre names stay readable.
plot_label_distribution(
    x=fiction_genres,
    labels=fiction_genre_y,
    title='Fiction',
    ylabel='# of books',
    color='violet',
    text_color='k',
    size=(14, 7),
)

### (Rating) Levels

Count the total number of rating levels (which belong to categories) given to books.

In [None]:
# Number of rows in levels_df.
len(levels_df)

See some examples of rating levels.

In [None]:
# Show the first rows (head() defaults to five).
levels_df.head()

### Categories

Many levels are mutually exclusive for a particular book. They are grouped into content 'categories'.

In [None]:
# Number of rows in categories_df.
len(categories_df)

See examples of categorized rating levels.

In [None]:
# Show the first nine rows.
categories_df.head(9)

View the list of unique category names.

In [None]:
# `categories` was returned by bookcave.get_data(); evaluating it displays its contents.
categories

## Features

### Text

Count the books that have at least one rating and an available Kindle text.

In [None]:
# books_df is the frame book_ids was built from, so this equals len(book_ids).
len(books_df)

See a sample of text.

In [None]:
# First 200 characters of the text for book ID 'gate-76' (assumes each text is a str).
texts[book_id_to_index['gate-76']][:200]

View the distribution of overall (average) ratings for books with text.

In [None]:
# Uses the 'community_average_rating' column of books_df.
plot_and_print_average_ratings_distribution(books_df)

What does the file-length distribution look like?

In [None]:
# Length of every text, in the order of `texts`.
text_lengths = list(map(len, texts))

In [None]:
# Normalized histogram of text lengths over 60 bins.
plt.hist(text_lengths, bins=60, density=True)
plt.show()

View the distribution of number of lines for text files.

In [None]:
# Line-level view of the texts, as produced by bookcave.get_text_lines().
text_lines = bookcave.get_text_lines(texts)
# Number of lines per text, in the order of `texts`.
text_line_lengths = list(map(len, text_lines))

In [None]:
# Normalized histogram of line counts over 40 bins.
plt.hist(text_line_lengths, bins=40, density=True)
plt.show()

View the IDs and text lengths of books with little text (fewer than 15,000 characters).

In [None]:
# Texts shorter than this length are listed below.
short_text_limit = 15000
# (book ID, text length) pairs for every short text.
[
    (book_ids[index], len(text))
    for index, text in enumerate(texts)
    if len(text) < short_text_limit
]

### Labels

See an example of book rating levels.

In [None]:
# Boolean-mask selection: keep only the rows whose 'book_id' equals 'gate-76'.
levels_df[levels_df['book_id'] == 'gate-76']

See how these rating levels are now represented as a vector of average rating levels per category.

In [None]:
# Row of y for book ID 'gate-76'; its columns are indexed by category (cf. y[:, category_index]).
y[book_id_to_index['gate-76']]

Plot the label distribution for each category.

In [None]:
# Bar color per category, looked up by category index.
category_colors = [
    'blue',
    'green',
    'pink',
    'cyan',
    'magenta',
    'yellow',
    'orange',
    'red',
]
# Color of the count drawn inside the tallest bar, looked up by category index.
category_text_colors = [
    'w',
    'w',
    'k',
    'k',
    'k',
    'k',
    'k',
    'w',
]

        
def plot_and_print_label_distributions():
    """Plot, save and print the label distribution of every category.

    Reads the notebook-level `categories`, `levels`, `y`, `category_colors`,
    `category_text_colors` and `FIGURES_ROOT`. Each figure is saved under
    FIGURES_ROOT as 'kindle_label_dist_<1-based index>_<category>.png'.
    """
    for category_index, category in enumerate(categories):
        level_names = levels[category_index]
        category_labels = y[:, category_index]
        # NOTE(review): the category name goes into the file name unchanged;
        # confirm it never contains path separators.
        file_name = 'kindle_label_dist_{}_{}.png'.format(category_index + 1, category)
        plot_label_distribution(
            level_names,
            category_labels,
            category,
            '# of occurrences',
            category_colors[category_index],
            category_text_colors[category_index],
            save=os.path.join(FIGURES_ROOT, file_name),
        )
        print_label_distribution(level_names, category_labels)


# Plot and save one figure per category, printing its percentages after each plot.
plot_and_print_label_distributions()