# BookCave

[General FAQ](https://mybookcave.com/reader-faq/)

[BookCave rating system](https://mybookcave.com/reader-faq/what-makes-a-book-earn-a-particular-content-rating/)

## Setup

In [None]:
# Data manipulation.
import sqlite3
import pandas as pd
# File I/O.
import os
import sys
# Math.
import numpy as np
# Time management.
import tqdm
# Visualization.
import matplotlib.pyplot as plt

Declare file path constants.

In [None]:
# Input data directories, expressed relative to the current working directory.
CONTENT_ROOT = os.path.join('..', 'content')
BOOKCAVE_ROOT = os.path.join(CONTENT_ROOT, 'bookcave')
BOOKCAVE_AMAZON_KINDLE_ROOT = os.path.join(CONTENT_ROOT, 'bookcave_amazon_kindle')
# Output directory for saved figures.
FIGURES_ROOT = os.path.join('..', 'figures')
# Create the figures directory if it is missing; `exist_ok` makes re-running this cell a no-op.
os.makedirs(FIGURES_ROOT, exist_ok=True)

Define helper functions for plotting and printing label distributions; they are used later in the notebook.

In [None]:
def autolabel(ax, bars, text_color):
    """Write each bar's height, truncated to an integer, as a text label on `ax`.

    The tallest bar(s) get their label just below the top of the bar, drawn in
    `text_color`; every other bar gets its label just above the bar, in black.

    :param ax: Object providing `text(x, y, s, **kwargs)` (e.g., a matplotlib axes).
    :param bars: Iterable of objects providing `get_x()`, `get_width()`, and `get_height()`.
    :param text_color: Color of the label drawn on the tallest bar(s).
    """
    bars = list(bars)
    if not bars:
        # Nothing to label; `max()` of an empty sequence would raise a `ValueError`.
        return
    heights = [bar.get_height() for bar in bars]
    max_height = max(heights)
    # Labels are shifted away from the bar top by 5% of the tallest bar.
    offset = .05 * max_height
    for bar, height in zip(bars, heights):
        is_max = height == max_height
        ax.text(bar.get_x() + bar.get_width()/2.,
                height - offset if is_max else height + offset,
                '{}'.format(int(height)),
                ha='center',
                va='top' if is_max else 'bottom',
                color=text_color if is_max else 'k')


def plot_label_distribution(x, labels, title, ylabel, color, text_color, save=None):
    """Draw a bar chart of how often each label index occurs, with one bar per name in `x`.

    :param x: Sequence of names; the name at position `i` labels the bar for label index `i`.
    :param labels: Non-negative integer label indices to count.
    :param title: Title of the plot.
    :param ylabel: Label of the vertical axis.
    :param color: Bar color (or sequence of colors) passed to `ax.bar`.
    :param text_color: Color of the count written on the tallest bar(s).
    :param save: Optional path; when truthy, the figure is saved there before being shown.
    """
    # `minlength` keeps one count per name even when the highest-indexed labels never occur;
    # without it the counts can be shorter than `x` and `ax.bar` fails on the length mismatch.
    bincount = np.bincount(labels, minlength=len(x))
    positions = np.arange(len(x))
    fig, ax = plt.subplots()
    bars = ax.bar(positions, bincount, color=color)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # Fix the tick positions before assigning their labels, so each name stays under its own bar.
    ax.set_xticks(positions)
    ax.set_xticklabels(x, rotation=45, ha='right')
    autolabel(ax, bars, text_color)
    if save:
        plt.savefig(save, bbox_inches='tight')
    plt.show()


def print_label_distribution(x, labels):
    """Print the share of `labels` that falls on each name in `x`, one line per name.

    Names are right-aligned to the longest name and each share is printed as a
    percentage with three decimal places.

    :param x: Sequence of names; the name at position `i` describes label index `i`.
    :param labels: Non-negative integer label indices to count.
    """
    longest_name = max((len(name) for name in x), default=0)
    # `minlength` guarantees a count for every name, even when the highest-indexed labels never occur.
    bincount = np.bincount(labels, minlength=len(x))
    total = len(labels)
    for index, name in enumerate(x):
        spaces = ' ' * (longest_name - len(name))
        # Without any labels there is no meaningful proportion; report 0% instead of dividing by zero.
        p = bincount[index] / total if total else 0.
        # Right-align within 7 characters so single-digit percentages line up with double-digit ones.
        print('{}`{}`: {:>7.3%}'.format(spaces, name, p))

Pull all of the data from the BookCave database.

In [None]:
# Load every table used by this notebook into its own DataFrame.
conn = sqlite3.connect(os.path.join(BOOKCAVE_ROOT, 'contents.db'))
try:
    all_books_df = pd.read_sql_query('SELECT * FROM Books;', conn)
    ratings_df = pd.read_sql_query('SELECT * FROM BookRatings;', conn)
    levels_df = pd.read_sql_query('SELECT * FROM BookRatingLevels;', conn)
finally:
    # Close the connection even if one of the queries fails.
    conn.close()

### Books

Count the total number of books in the database.

In [None]:
len(all_books_df)

Consider only books which have at least one rating.

In [None]:
# Keep only the books whose community rating count is positive.
has_ratings = all_books_df['community_ratings_count'] > 0
rated_books_df = all_books_df[has_ratings]
len(rated_books_df)

See some samples of book data.

In [None]:
rated_books_df.head(3)

Count the non-null values in each column. For vendor URL columns (e.g., `amazon_kindle_url`), this shows how many rated books that vendor offers.

In [None]:
rated_books_df.count()

Plot distribution of overall (average) ratings for books.

In [None]:
# Overall rating names; a rating's position in this list is its label index.
ratings = [
    'All Ages',
    'Mild',
    'Mild+',
    'Moderate',
    'Moderate+',
    'Adult',
    'Adult+',
]
rating_to_index = dict(zip(ratings, range(len(ratings))))
# One bar color per rating, in the same order as `ratings`.
rating_colors = [
    'royalblue',
    'mediumseagreen',
    'darkseagreen',
    'yellowgreen',
    'yellow',
    'orange',
    'orangered',
]
# Currently unused per-rating text colors:
# rating_text_colors = ['w', 'w', 'k', 'k', 'k', 'k', 'w']


def plot_and_print_average_ratings_distribution(books_df):
    """Plot, then print, how the books in `books_df` are spread over the overall ratings.

    :param books_df: DataFrame with a `community_average_rating` column whose
        values are keys of `rating_to_index`.
    """
    # Translate every rating name into its label index.
    label_indices = [rating_to_index[name] for name in books_df['community_average_rating']]
    plot_label_distribution(ratings, label_indices, 'Average Ratings', '# of books', rating_colors, 'k')
    print_label_distribution(ratings, label_indices)

In [None]:
plot_and_print_average_ratings_distribution(rated_books_df)

Calculate percentage of books whose previews are offered by Amazon.

In [None]:
# Count the rated books that have an Amazon Kindle URL and report their share of all rated books.
num_rated = len(rated_books_df)
num_kindle = int(rated_books_df['amazon_kindle_url'].notnull().sum())
'{:.3%} of rated books ({:d}/{:d}) are available on Amazon Kindle.'.format(num_kindle / num_rated, num_kindle, num_rated)

### Ratings

Count the total number of ratings for books.

In [None]:
len(ratings_df)

See some examples of ratings data.

In [None]:
ratings_df.head()

### (Rating) Levels

Count the total number of rating levels (which belong to categories) given to books.

In [None]:
len(levels_df)

See some examples of rating levels.

In [None]:
levels_df.head()

Check for invalid rows.

In [None]:
len(levels_df[levels_df['count'].isnull()])

### Categories

Many levels are mutually exclusive for a particular book. They are grouped into content 'categories'.

In [None]:
# Level names are compared against the literal string 'None' elsewhere in this notebook.
# Recent pandas versions parse 'None' as a missing value by default, so restrict the
# missing-value markers to the empty string to keep that level name intact.
categories_df = pd.read_csv(
    os.path.join(BOOKCAVE_ROOT, 'categories.tsv'),
    sep='\t',
    keep_default_na=False,
    na_values=[''])
len(categories_df)

See examples of categorized rating levels.

In [None]:
categories_df.head(9)

Get the list of unique category names.

In [None]:
categories = list(categories_df['category'].unique())
categories

Create index maps of categories and levels to speed up operations later.

In [None]:
# Map category names to their indices.
category_to_index = dict()
# Map level names to their indices within their own category.
level_to_index = dict()
for i, category in enumerate(categories):
    category_to_index[category] = i
    # Compare names exactly. `str.match` would interpret the name as a regular expression
    # anchored only at the start, so special characters or a category whose name is a prefix
    # of another would select the wrong rows.
    category_rows = categories_df[categories_df['category'] == category]
    category_levels = category_rows['level']
    # A level's index is its position among the rows of its category.
    category_level_to_index = {name: j for j, name in enumerate(category_levels)}
    level_to_index.update(category_level_to_index)

In [None]:
# Map each level (other than 'None') to the index of its category.
level_to_category_index = dict()
# Collect the level names of each category; every list starts with the shared 'None' level.
levels = [['None'] for _ in categories]
for row_category, level in zip(categories_df['category'], categories_df['level']):
    if level != 'None':
        category_index = category_to_index[row_category]
        level_to_category_index[level] = category_index
        levels[category_index].append(level)

## Features

### Text

Count how many Amazon Kindle texts we have attempted to collect.

In [None]:
text_book_ids = os.listdir(BOOKCAVE_AMAZON_KINDLE_ROOT)
len(text_book_ids), text_book_ids[42]

Extract raw Kindle text contents.

In [None]:
# Map each book ID (folder name) to the raw contents of its `text.txt`.
book_id_to_text = dict()
# The platform does not change while the loop runs, so check it once.
use_extended_paths = sys.platform == 'win32'
# The `with` block closes the progress bar even if reading a file fails unexpectedly.
with tqdm.tqdm(total=len(text_book_ids), position=0) as pbar:
    for text_book_id in text_book_ids:
        pbar.update(1)
        path = os.path.join(BOOKCAVE_AMAZON_KINDLE_ROOT, text_book_id, 'text.txt')
        if use_extended_paths:
            # One book folder is named:
            # `diy-body-care-the-complete-body-care-guide-for-beginners-with-over-37-recipes-for-homemade-body-butters-body-scrubs-lotions-lip-balms-and-shampoos-body-care-essential-oils-organic-lotions`.
            # To overcome a `FileNotFoundError` for this file, use an extended-length path on Windows.
            # See `https://stackoverflow.com/questions/36219317/pathname-too-long-to-open/36219497`.
            path = u'\\\\?\\' + os.path.abspath(path)
        try:
            with open(path, 'r', encoding='utf-8') as fd:
                contents = fd.read()
        except (FileNotFoundError, NotADirectoryError):
            # No `text.txt` exists for this entry, or the entry is a file rather than a folder; skip it.
            continue
        book_id_to_text[text_book_id] = contents
len(book_id_to_text)

See a small excerpt of a Kindle text.

In [None]:
book_id_to_text['gate-76'][:100]

Count how many Kindle texts exist for books with at least one rating.

In [None]:
# Restrict the rated books to those whose ID is a key of `book_id_to_text`.
has_text = rated_books_df['id'].isin(book_id_to_text)
books_df = rated_books_df[has_text]
len(books_df)

In [None]:
plot_and_print_average_ratings_distribution(books_df)

Map book IDs to indices.

In [None]:
# Pair every book ID with its position in `books_df`.
book_to_index = dict(zip(books_df['id'], range(len(books_df))))

Likewise, create a fancy-indexable array of book IDs.

In [None]:
# Keep the IDs in `books_df` row order, which is the order `book_to_index` enumerates, so that
# (for unique IDs) `book_ids[book_to_index[book_id]] == book_id`. Sorting the IDs here would
# break that correspondence unless `books_df` happened to be sorted by ID already.
book_ids = np.array(books_df['id'].tolist())

What does the file-length distribution look like?

In [None]:
# Length (in characters) of every book's raw text, in `book_ids` order.
raw_text_lengths = [len(book_id_to_text[key]) for key in book_ids]
# Draw the lengths as a 60-bin histogram normalized to a density.
n, bins, patches = plt.hist(raw_text_lengths, bins=60, density=True)
plt.show()

View the IDs of books with little text (at most 15,000 characters).

In [None]:
[book_id for book_id in book_ids if len(book_id_to_text[book_id]) <= 15000]

### Labels

For each category, calculate the average rating for each book.

In [None]:
# One row per book and one column per category; filled with weighted level sums, then averaged.
y_cont = np.zeros((len(books_df), len(categories)))
# Add all levels together for each book.
print('Adding...')
# Iterate over the needed columns directly; this avoids building a Series per row as `iterrows()` does.
for book_id, title, count in zip(levels_df['book_id'], levels_df['title'], levels_df['count']):
    # Skip books which have a rating (and rating levels), but no preview text.
    if book_id not in book_to_index:
        continue
    # Skip rows without a count; adding NaN would turn that book's category average into NaN.
    if pd.isnull(count):
        continue
    # Add this rating level to its category for this book.
    book_index = book_to_index[book_id]
    category_index = level_to_category_index[title]
    level_index = level_to_index[title]
    y_cont[book_index, category_index] += level_index * count
# Calculate the average level for each book by dividing by the number of ratings for that book.
print('Averaging...')
for book_id, ratings_count in zip(books_df['id'], books_df['community_ratings_count']):
    book_index = book_to_index[book_id]
    y_cont[book_index] /= ratings_count
print('Done!')
y_cont.shape

See an example of book rating levels.

In [None]:
levels_df[levels_df['book_id'] == 'gate-76']

See how these rating levels are now represented as a vector of average rating levels per category.

In [None]:
y_cont[book_to_index['gate-76']]

Since false negatives are less desirable than false positives, implement somewhat of a "harsh critic" by taking the ceiling of the average ratings.

In [None]:
y = np.ceil(y_cont).astype(np.int32)
y[book_to_index['gate-76']]

Plot the label distribution for each category.

In [None]:
# Bar color and matching bar-label text color for each category, indexed by category index.
category_colors = ['blue', 'green', 'pink', 'cyan', 'magenta', 'yellow', 'orange', 'red']
category_text_colors = ['w', 'w', 'k', 'k', 'k', 'k', 'k', 'w']


def plot_and_print_label_distributions():
    """Plot, save, and print the label distribution of every category.

    For each category, the labels are the matching column of `y` and the bar
    names are that category's entry in `levels`. Each figure is saved under
    `FIGURES_ROOT` as `kindle_label_dist_<1-based category index>_<category>.png`.
    """
    for category_index, category in enumerate(categories):
        x = levels[category_index]
        labels = y[:, category_index]
        title = category
        ylabel = '# of occurrences'
        color = category_colors[category_index]
        text_color = category_text_colors[category_index]
        save = os.path.join(FIGURES_ROOT, 'kindle_label_dist_{}_{}.png'.format(category_index + 1, category))
        plot_label_distribution(x, labels, title, ylabel, color, text_color, save=save)
        # The helper is named `print_label_distribution` (singular); the plural name raised a `NameError`.
        print_label_distribution(x, labels)

In [None]:
plot_and_print_label_distributions()