# bookcave_meta

[General FAQ](https://mybookcave.com/reader-faq/)

[BookCave rating system](https://mybookcave.com/reader-faq/what-makes-a-book-earn-a-particular-content-rating/)

## Set Up

In [None]:
# Math.
import numpy as np
# Visualization.
import matplotlib.pyplot as plt
# File I/O.
import os

# Data.
import bookcave

Declare file path constants.

In [None]:
# Directory, relative to the working directory, where generated figures are saved.
FIGURES_ROOT = os.path.join('..', 'figures')
# Create the directory if it is missing. `exist_ok=True` makes re-running this
# cell a no-op, while still raising if the path exists but is not a directory.
os.makedirs(FIGURES_ROOT, exist_ok=True)

Define helper functions for plotting and printing label distributions.

In [None]:
def autolabel(ax, bars, text_color):
    """Write each bar's integer height as a text label on the axes.

    The tallest bar(s) get their label just below the top edge, drawn in
    `text_color`; every other bar gets its label just above the top edge,
    drawn in black.

    Args:
        ax: Axes-like object providing `text(x, y, s, ha=..., va=..., color=...)`.
        bars: Iterable of bar objects providing `get_x()`, `get_width()` and
            `get_height()`.
        text_color: Color used for the label(s) of the tallest bar(s).
    """
    heights = [bar.get_height() for bar in bars]
    if not heights:
        # Nothing to annotate; `max()` would raise on an empty sequence.
        return
    max_height = max(heights)
    # Vertical gap between a bar's top edge and its label (5% of the tallest bar).
    offset = .05 * max_height
    for bar, height in zip(bars, heights):
        is_max = height == max_height
        ax.text(bar.get_x() + bar.get_width() / 2.,
                height - offset if is_max else height + offset,
                '{}'.format(int(height)),
                ha='center',
                va='top' if is_max else 'bottom',
                color=text_color if is_max else 'k')


def plot_label_distribution(x, labels, title, ylabel, color, text_color='k', save=None, size=None):
    """Draw a bar chart of how often each label index occurs.

    Args:
        x: Class names, one per bar; also used as the x tick labels.
        labels: Non-negative integer label indices to count.
        title: Axes title.
        ylabel: Y-axis label.
        color: Bar color(s), passed to `ax.bar`.
        text_color: Color of the count label on the tallest bar(s).
        save: Optional path; when given, the figure is written there with
            `plt.savefig` before it is shown.
        size: Optional `(width, height)` in inches for the figure.

    Returns:
        The array of per-class counts that was plotted.
    """
    # `minlength` guarantees one count per class in `x`, even when the
    # highest-indexed classes never occur in `labels`; otherwise the counts
    # would be shorter than `x` and `ax.bar` would reject the mismatch.
    bincount = np.bincount(labels, minlength=len(x))
    fig, ax = plt.subplots()
    if size:
        fig.set_size_inches(*size)
    bars = ax.bar(x, bincount, color=color)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xticklabels(x, rotation=45, ha='right')
    autolabel(ax, bars, text_color)
    if save:
        plt.savefig(save, bbox_inches='tight')
    plt.show()
    return bincount


def print_label_distribution(x, labels, bincount=None):
    """Print the share of `labels` that falls into each class, one line per class.

    Class names are right-aligned and wrapped in backticks; percentages are
    printed with three decimals and right-aligned to a common width.

    Args:
        x: Class names; position `i` names label index `i`.
        labels: Integer label indices. Only `len(labels)` is used when
            `bincount` is supplied.
        bincount: Optional precomputed per-class counts. Classes beyond its
            end are treated as having zero occurrences.
    """
    longest_name = max(len(name) for name in x)
    if bincount is None:
        # `minlength` keeps a (zero) entry for trailing classes that never occur.
        bincount = np.bincount(labels, minlength=len(x))
    total = len(labels)
    for index, name in enumerate(x):
        spaces = ' ' * (longest_name - len(name))
        # A caller-supplied `bincount` may stop at the last class that occurs.
        count = bincount[index] if index < len(bincount) else 0
        # Pad after formatting so values that round up to 10.000% stay aligned.
        percentage = '{:.3%}'.format(count / total).rjust(7)
        print('{}`{}`: {}'.format(spaces, name, percentage))

Define the rating constants and a helper that plots and prints the distribution of overall (average) ratings for books.

In [None]:
# Overall rating names; a rating's position in this list is its integer label.
ratings = [
    'All Ages',
    'Mild',
    'Mild+',
    'Moderate',
    'Moderate+',
    'Adult',
    'Adult+',
]
# Reverse lookup: rating name -> integer label.
rating_to_index = dict(zip(ratings, range(len(ratings))))
# Bar color for each rating, in the same order as `ratings`.
rating_colors = [
    'royalblue',
    'mediumseagreen',
    'darkseagreen',
    'yellowgreen',
    'yellow',
    'orange',
    'orangered',
]
# Alternative per-rating text colors (currently unused):
# rating_text_colors = ['w', 'w', 'k', 'k', 'k', 'k', 'w']


def plot_and_print_average_ratings_distribution(books_df):
    """Plot and print how the books are spread over the overall ratings.

    Args:
        books_df: Table with a `community_average_rating` column whose values
            are rating names found in `ratings`.

    Returns:
        The per-rating counts returned by `plot_label_distribution`.
    """
    # Convert every rating name into its integer label.
    rating_indices = [rating_to_index[name] for name in books_df['community_average_rating']]
    counts = plot_label_distribution(ratings,
                                     rating_indices,
                                     'Average Ratings',
                                     '# of books',
                                     rating_colors,
                                     'k')
    print_label_distribution(ratings, rating_indices, bincount=counts)
    return counts

## Metadata

Read all of the data and metadata from the BookCave database.

In [None]:
# Load the labels and all metadata tables in one call; the first returned
# value is not needed in this notebook and is discarded.
# NOTE(review): presumably `{'text'}` selects the text source and
# `only_categories` the category indices to keep -- confirm against
# `bookcave.get_data`.
(
    _, Y, categories, levels,
    book_ids, books_df, ratings_df, levels_df, categories_df,
) = bookcave.get_data(
    {'text'},
    only_categories={1, 3, 5, 6},
    return_meta=True,
)

Index book IDs to use later.

In [None]:
# Map each book ID to its position in `book_ids`.
book_id_to_index = {
    book_id: position
    for position, book_id in enumerate(book_ids)
}

### Books

Count the number of books which are rated and for which the book text has been scraped.

In [None]:
# Number of entries in `books_df`.
len(books_df)

See some samples of book data.

In [None]:
# Show the first three entries of `books_df`.
books_df.head(3)

#### Overall Ratings

View the distribution of overall (average) ratings for books with text.

In [None]:
# NOTE: these repeat the rating constants declared in the Set Up section, so
# this cell can be run without scrolling back.
# Overall rating names; a rating's position in this list is its integer label.
ratings = [
    'All Ages', 'Mild', 'Mild+',
    'Moderate', 'Moderate+',
    'Adult', 'Adult+',
]
# Reverse lookup: rating name -> integer label.
rating_to_index = {name: position for position, name in enumerate(ratings)}
# Bar color for each rating, in the same order as `ratings`.
rating_colors = [
    'royalblue', 'mediumseagreen', 'darkseagreen',
    'yellowgreen', 'yellow',
    'orange', 'orangered',
]


def plot_and_print_average_ratings_distribution(books_df):
    """Plot, then print, the distribution of overall ratings in `books_df`.

    Args:
        books_df: Table with a `community_average_rating` column whose values
            are rating names found in `ratings`.

    Returns:
        The per-rating counts returned by `plot_label_distribution`.
    """
    rating_names = books_df['community_average_rating']
    # Integer label for every book, looked up by rating name.
    rating_indices = []
    for rating_name in rating_names:
        rating_indices.append(rating_to_index[rating_name])
    counts = plot_label_distribution(
        ratings, rating_indices, 'Average Ratings', '# of books', rating_colors, 'k')
    print_label_distribution(ratings, rating_indices, bincount=counts)
    return counts

# Plot and print the distribution for the loaded books. The returned counts
# are assigned to `_` because they are not needed here.
_ = plot_and_print_average_ratings_distribution(books_df)

#### Genres

Analyze distribution of genres.

In [None]:
# Keep only the books that have a genre string, and keep their IDs alongside
# so that position `i` refers to the same book in both arrays. Indexing
# `books_df` by position after dropping rows would report the wrong book
# whenever an earlier row has no genres.
has_genres = books_df['genres'].notna()
book_genre_values = books_df.loc[has_genres, 'genres'].values
book_genre_ids = books_df.loc[has_genres, 'id'].values
# Each genre string is a '|'-separated list of genres, and each genre is a
# '/'-separated path such as 'Fiction/Romance'.
book_genres = [value.split('|') for value in book_genre_values]
book_genre_hierarchies = [[genre.split('/') for genre in genres] for genres in book_genres]

root_genres = ['Juvenile Fiction',
               'Young Adult Fiction',
               'Fiction',
               'Juvenile Nonfiction',
               'Young Adult Nonfiction',
               'Nonfiction']
root_genre_to_index = {root_genre: i for i, root_genre in enumerate(root_genres)}
# One entry per genre (not per book) whose first path component is a known root genre.
root_genre_y = []
print('Books with no genre listed:')
no_genre_count = 0
for book_id, book_genre_hierarchy in zip(book_genre_ids, book_genre_hierarchies):
    for hierarchy in book_genre_hierarchy:
        root_index = root_genre_to_index.get(hierarchy[0])
        if root_index is None:
            # The genre's root is not one of `root_genres`; report the book.
            no_genre_count += 1
            print('{:d}: {}'.format(no_genre_count, book_id))
        else:
            root_genre_y.append(root_index)

In [None]:
# Bar chart of how many genre entries fall under each root genre.
_ = plot_label_distribution(x=root_genres,
                            labels=root_genre_y,
                            title='Genres',
                            ylabel='# of books',
                            color='olive',
                            text_color='w')

Plot distribution of all genres under `Fiction`.

In [None]:
# Collect the second-level genre of every genre rooted at 'Fiction'; a bare
# 'Fiction' entry with no sub-genre is recorded as '_'.
# To also include 'Juvenile Fiction' and 'Young Adult Fiction', widen the
# filter on the root component below.
fiction_genre_values = [
    hierarchy[1] if len(hierarchy) > 1 else '_'
    for book_genre_hierarchy in book_genre_hierarchies
    for hierarchy in book_genre_hierarchy
    if hierarchy[0] == 'Fiction'
]
fiction_genres_set = set(fiction_genre_values)
# Sorted names fix the bar order; each name's position is its label index.
fiction_genres = sorted(fiction_genres_set)
fiction_genre_to_index = {name: position for position, name in enumerate(fiction_genres)}
fiction_genre_y = [fiction_genre_to_index[name] for name in fiction_genre_values]

fiction_genres_bincount = plot_label_distribution(
    fiction_genres,
    fiction_genre_y,
    'Fiction',
    '# of books',
    'violet',
    'k',
    size=(14, 7),
)

In [None]:
# Number of genre entries that are `Fiction/Romance`.
fiction_romance_count = fiction_genres_bincount[fiction_genre_to_index['Romance']]
# Denominators: all genre entries with a known root, and all `Fiction` entries.
all_genre_count = len(root_genre_y)
all_fiction_count = len(fiction_genre_y)
print('{:.2%} ({:d}/{:d}) of all genres are `Fiction/Romance`.'.format(
    fiction_romance_count / all_genre_count,
    fiction_romance_count,
    all_genre_count))
print('{:.2%} ({:d}/{:d}) of `Fiction` genres are `Romance`.'.format(
    fiction_romance_count / all_fiction_count,
    fiction_romance_count,
    all_fiction_count))

Plot the distribution of sub-genres under `Fiction/Romance`.

In [None]:
# Collect the third-level genre of every genre under 'Fiction/Romance'; an
# entry with no third level is recorded as '_'. Comparing the first two path
# components at once also skips paths that are shorter than two components.
romance_genre_values = [
    hierarchy[2] if len(hierarchy) > 2 else '_'
    for book_genre_hierarchy in book_genre_hierarchies
    for hierarchy in book_genre_hierarchy
    if hierarchy[:2] == ['Fiction', 'Romance']
]
romance_genres_set = set(romance_genre_values)
# Sorted names fix the bar order; each name's position is its label index.
romance_genres = sorted(romance_genres_set)
romance_genre_to_index = {name: position for position, name in enumerate(romance_genres)}
romance_genre_y = [romance_genre_to_index[name] for name in romance_genre_values]

romance_genres_bincount = plot_label_distribution(
    romance_genres,
    romance_genre_y,
    'Romance',
    '# of books',
    'pink',
    'k',
    size=(10, 5),
)

### Rating Levels

Count the total number of rating levels (which belong to categories) given to books.

In [None]:
# Number of entries in `levels_df`.
len(levels_df)

See some examples of rating levels.

In [None]:
# Show the first few entries of `levels_df`.
levels_df.head()

### Categories

Many levels are mutually exclusive for a particular book. They are grouped into content 'categories'.

In [None]:
# Number of entries in `categories_df`.
len(categories_df)

See examples of categorized rating levels.

In [None]:
# Show the first nine entries of `categories_df`.
categories_df.head(9)

View the list of unique category names.

In [None]:
# Display the category names returned by `bookcave.get_data`.
categories

## Labels

See an example of book rating levels.

In [None]:
# For the book at position 42, translate each category's level index in
# `Y[42]` into the corresponding level name from `levels`.
[levels[category_index][level_index] for category_index, level_index in enumerate(Y[42])]

See how these rating levels are now represented as a vector of average rating levels per category.

In [None]:
# Raw label vector of the book at position 42: one level index per category.
Y[42]

Plot the label distribution for each category.

In [None]:
def plot_and_print_label_distributions():
    """Plot, save, and print the label distribution of every category.

    For each entry of `categories`, the bars are the category's levels and
    the counts come from the matching column of `Y`. Each bar is colored by
    the rating attached to its level in `categories_df`; if any level of the
    category has no rating, every bar is drawn red instead. Each figure is
    saved under `FIGURES_ROOT`.
    """
    for category_index, category in enumerate(categories):
        # Select this category's rows by exact name. A regex match would
        # misread special characters in the name and would also pick up any
        # other category that merely starts with it.
        category_level_df = categories_df[categories_df['category'] == category]
        category_ratings = category_level_df['rating']
        if category_ratings.isnull().any():
            color = 'r'
        else:
            # Rating-associated color for each level of this category.
            color = [rating_colors[rating_to_index[rating]] for rating in category_ratings]

        x = levels[category_index]
        y = Y[:, category_index]
        title = category
        ylabel = '# of occurrences'
        text_color = 'k'
        # File names are numbered from 1 and include the category name.
        save = os.path.join(FIGURES_ROOT, 'kindle_label_dist_{}_{}.png'.format(category_index + 1, category))
        bincount = plot_label_distribution(x, y, title, ylabel, color, text_color, save=save)
        print_label_distribution(x, y, bincount=bincount)

# Plot and print the distribution for every category; the figures are also
# saved under FIGURES_ROOT.
plot_and_print_label_distributions()

Which books are the minority in their class?

In [None]:
# Select the rows of `levels_df` whose title matches the last level of the
# first category.
# NOTE(review): `str.match` treats the level name as a regular expression
# anchored at the start of the title, so special characters in the name or
# titles that merely begin with it may change the result -- confirm that
# this is intended.
levels_df[levels_df['title'].str.match(levels[0][-1])]