# BookCave

[General FAQ](https://mybookcave.com/reader-faq/)

[BookCave rating system](https://mybookcave.com/reader-faq/what-makes-a-book-earn-a-particular-content-rating/)

## Setup

In [None]:
# Data manipulation.
import sqlite3
import pandas as pd
# File I/O.
import os
import sys
# Math.
import numpy as np
# Text.
import re
import string
# Time management.
import tqdm
# Visualization.
import matplotlib.pyplot as plt
# Natural Language.
import nltk
# nltk.download()
# Learning.
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

Declare file path constants.

In [None]:
# Data locations, all relative to the notebook's working directory.
EMBEDDINGS_ROOT = os.path.join(os.pardir, os.pardir, 'embeddings')
# GLOVE_300 = os.path.join(EMBEDDINGS_ROOT, 'glove.6B.300d.txt')
CONTENT_ROOT = os.path.join(os.pardir, 'content')
# Both BookCave data sets sit directly under the content root.
BOOKCAVE_ROOT, BOOKCAVE_AMAZON_ROOT = (
    os.path.join(CONTENT_ROOT, folder) for folder in ('bookcave', 'bookcave_amazon')
)

Pull all of the data from the BookCave database.

In [None]:
# Load the three tables used by this notebook into DataFrames.
conn = sqlite3.connect(os.path.join(BOOKCAVE_ROOT, 'contents.db'))
try:
    all_books = pd.read_sql_query('SELECT * FROM Books;', conn)
    ratings = pd.read_sql_query('SELECT * FROM BookRatings;', conn)
    levels = pd.read_sql_query('SELECT * FROM BookRatingLevels;', conn)
finally:
    # Release the database handle even when one of the queries fails.
    conn.close()

### Books

Count the total number of books in the database.

In [None]:
# Number of rows (books) in the `Books` table.
all_books.shape[0]

Consider only books which have at least one rating.

In [None]:
# Boolean mask selecting the books that the community has rated at least once.
has_rating = all_books['community_ratings_count'].gt(0)
rated_books = all_books[has_rating]
rated_books.shape[0]

See some samples of book data.

In [None]:
# Display the first three rated books.
rated_books.iloc[:3]

Count the non-null values in each column; the counts for the vendor URL columns show how many rated books each vendor offers.

In [None]:
# Number of non-null entries in every column of the rated books.
rated_books.count()

Calculate percentage of books whose previews are offered by Amazon.

In [None]:
# A book counts as available on Kindle when its Amazon Kindle URL is not null.
num_rated = len(rated_books)
num_kindle = int(rated_books['amazon_kindle_url'].notnull().sum())
'{:.3%} of rated books ({:d}/{:d}) are available on Amazon Kindle.'.format(num_kindle / num_rated, num_kindle, num_rated)

### Ratings

Count the total number of ratings for books.

In [None]:
# Number of rows in the `BookRatings` table.
ratings.shape[0]

See some examples of ratings data.

In [None]:
# Display the first five ratings.
ratings.iloc[:5]

### (Rating) Levels

Count the total number of rating levels (which belong to categories) given to books.

In [None]:
# Number of rows in the `BookRatingLevels` table.
levels.shape[0]

See some examples of rating levels.

In [None]:
# Display the first five rating levels.
levels.iloc[:5]

Check for invalid rows.

In [None]:
# Number of rating-level rows whose `count` is missing.
int(levels['count'].isnull().sum())

### Categories

Many levels are mutually exclusive for a particular book. They are grouped into content 'categories'.

In [None]:
# Load the table that assigns every rating level to its content category.
# One level is literally named 'None' (the level-to-category mapping below
# compares against that string). pandas 2.0+ parses the text `None` as a
# missing value by default, which would turn that level name into NaN, so the
# default NA parsing is disabled.
# NOTE(review): this assumes the file has no genuinely empty cells -- confirm.
categories = pd.read_csv(
    os.path.join(BOOKCAVE_ROOT, 'categories.tsv'),
    sep='\t',
    keep_default_na=False,
)
len(categories)

See examples of categorized rating levels.

In [None]:
# Display the first nine categorized levels.
categories.iloc[:9]

Get the list of unique category names.

In [None]:
# Distinct category names, in order of first appearance in the table.
category_names = pd.unique(categories['category'])
category_names

Create index maps of categories and levels to speed up operations later.

In [None]:
# Map category names to their indices.
category_indices = dict()
# Map level names to their indices, i.e. their position within their own category.
level_indices = dict()
for i, category in enumerate(category_names):
    category_indices[category] = i
    # Select this category's rows by exact equality. `str.match` would treat the
    # name as a regular expression anchored only at the start, so it would also
    # select categories that merely begin with this name and would misread any
    # regex metacharacters in it.
    category_levels = categories.loc[categories['category'] == category, 'level']
    for j, level in enumerate(category_levels):
        level_indices[level] = j

Map each level to its category index.

In [None]:
# Category index for every level name; rows whose level is 'None' are left out.
level_to_category_index = {
    level_name: category_indices[category_name]
    for level_name, category_name in zip(categories['level'], categories['category'])
    if level_name != 'None'
}

Count the number of levels in each category.

In [None]:
# Number of rows of `categories` (one per level) in each category, indexed by category name.
category_sizes = categories.groupby('category').size()
category_sizes

## Features

### Text

Count the books for which collection of an Amazon Kindle preview has been attempted.

In [None]:
# Every entry of the Amazon preview folder is taken to be a book ID.
preview_book_ids = os.listdir(BOOKCAVE_AMAZON_ROOT)
# Show the number of entries together with one sample ID (raises IndexError with fewer than 43 entries).
len(preview_book_ids), preview_book_ids[42]

Extract raw book preview contents.

In [None]:
# Read the preview text of every collected book, keyed by book ID.
book_id_to_preview = dict()
# Using the progress bar as a context manager closes it even if reading fails.
with tqdm.tqdm(preview_book_ids, position=0) as pbar:
    for preview_book_id in pbar:
        path = os.path.join(BOOKCAVE_AMAZON_ROOT, preview_book_id, 'book.txt')
        if sys.platform == 'win32':
            # One book folder is named:
            # `diy-body-care-the-complete-body-care-guide-for-beginners-with-over-37-recipes-for-homemade-body-butters-body-scrubs-lotions-lip-balms-and-shampoos-body-care-essential-oils-organic-lotions`.
            # To overcome a `FileNotFoundError` for this file, use an extended-length path on Windows.
            # See `https://stackoverflow.com/questions/36219317/pathname-too-long-to-open/36219497`.
            path = u'\\\\?\\' + os.path.abspath(path)
        try:
            with open(path, 'r', encoding='utf-8') as fd:
                contents = fd.read()
        except (FileNotFoundError, NotADirectoryError):
            # No preview was saved for this entry, or the entry is a stray file
            # (e.g. `.DS_Store`) rather than a book folder.
            continue
        # Skip empty text files.
        if contents:
            book_id_to_preview[preview_book_id] = contents
len(book_id_to_preview)

See a small excerpt of a preview.

In [None]:
# First 100 characters of one preview (raises KeyError if this book has no non-empty preview).
book_id_to_preview['the-art-of-love'][:100]

Count how many previews exist for books with at least one rating.

In [None]:
# Rated books whose ID has a non-empty preview on disk.
has_preview = rated_books['id'].isin(list(book_id_to_preview))
books = rated_books.loc[has_preview]
books.shape[0]

Map book IDs to indices.

In [None]:
# Position of every book within `books`, keyed by book ID. Iterating the ID
# column directly avoids building a full row object for every book.
book_indices = {book_id: index for index, book_id in enumerate(books['id'])}

Likewise, create a fancy-indexable array of book IDs.

In [None]:
# Book IDs in the same order as the rows of `books`.
book_ids = np.array(books['id'].tolist())

What does the file-length distribution look like?

In [None]:
# Length of every preview in characters (the files were decoded as text).
raw_preview_lengths = list(map(len, (book_id_to_preview[book_id] for book_id in book_ids)))
# Normalized histogram of the lengths over 200 bins.
n, bins, patches = plt.hist(raw_preview_lengths, bins=200, density=True)
plt.show()

### Labels

For each category, calculate the average rating for each book.

In [None]:
# One row per book and one column per category. The matrix first accumulates
# count-weighted level indices and is then turned into per-book averages.
y_cont = np.zeros((len(books), len(category_names)))
print('Adding...')
for book_id, title, count in zip(levels['book_id'], levels['title'], levels['count']):
    # Rating levels of books without preview text cannot be used.
    if book_id not in book_indices:
        continue
    # Weight the level's index by how many ratings chose it.
    y_cont[book_indices[book_id], level_to_category_index[title]] += level_indices[title] * count
print('Averaging...')
for book_id, ratings_count in zip(books['id'], books['community_ratings_count']):
    # Dividing by the book's number of ratings yields its average level per category.
    y_cont[book_indices[book_id]] /= ratings_count
print('Done!')
y_cont.shape

See an example of book rating levels.

In [None]:
# All rating-level rows recorded for one example book.
levels[levels['book_id'] == 'the-art-of-love']

See how these rating levels are now represented as a vector of average rating levels per category.

In [None]:
# Average level index per category for the same example book.
y_cont[book_indices['the-art-of-love']]

Since false negatives are less desirable than false positives, implement somewhat of a "harsh critic" by taking the ceiling of the average ratings.

In [None]:
# Round every average up to the next whole level index and store it as a 32-bit integer label.
y = np.ceil(y_cont).astype(np.int32)
# Integer labels of the example book, one per category.
y[book_indices['the-art-of-love']]

## Model

In [None]:
def get_classifier():
    """Return a new, unfitted classifier (currently multinomial naive Bayes)."""
    return MultinomialNB()
#     return RandomForestClassifier()

In [None]:
def get_vectorizer():
    """Return a new, unfitted TF-IDF vectorizer with default settings."""
    # Import the submodule explicitly. `import sklearn` alone does not guarantee
    # that `sklearn.feature_extraction.text` is loaded; attribute access on the
    # package only works if some other import happened to load it first.
    from sklearn.feature_extraction.text import TfidfVectorizer
    return TfidfVectorizer()

Generate a random permutation in order to process the data set in a random order.

In [None]:
# Seed NumPy's global random generator so that the shuffle below is reproducible.
np.random.seed(1)
# Shuffled sequence of the row indices 0..len(y)-1.
perm = np.random.permutation(len(y))

Cross validate...

In [None]:
def get_train_test_split(perm, fold, folds):
    """Split previews and labels into train and test sets for one fold.

    The test set is the `fold`-th of `folds` contiguous slices of `perm`; all
    remaining indices form the training set.

    Besides its arguments, this function reads the module-level names `y`,
    `book_ids`, `book_id_to_preview` and `category_index`. The labels returned
    are the column of `y` selected by the current value of `category_index`.

    Returns:
        A tuple `(previews_train, previews_test, y_train, y_test)`.
    """
    n_samples = len(y)
    lower = n_samples * fold // folds
    upper = n_samples * (fold + 1) // folds
    perm_test = perm[lower:upper]
    perm_train = np.concatenate((perm[:lower], perm[upper:]))

    def previews_for(indices):
        # Look up the preview text of every book selected by `indices`.
        return [book_id_to_preview[book_id] for book_id in book_ids[indices]]

    return (
        previews_for(perm_train),
        previews_for(perm_test),
        y[perm_train, category_index],
        y[perm_test, category_index],
    )

In [None]:
# Number of cross-validation folds.
folds = 5

Use ordinal classification as explained in `Frank, Eibe, and Mark Hall. "A simple approach to ordinal classification." European Conference on Machine Learning. Springer, Berlin, Heidelberg, 2001.`

In [None]:
def to_ordinal(y, ordinal_index):
    """Binarize ordinal labels against a threshold level.

    Args:
        y: Sequence or array of integer level labels.
        ordinal_index: Threshold level.

    Returns:
        An integer array of the same length as `y` holding 1 where the label
        is strictly greater than `ordinal_index` and 0 elsewhere.
    """
    # One vectorized comparison replaces the element-by-element Python loop and
    # yields an integer array even when `y` is empty.
    return (np.asarray(y) > ordinal_index).astype(int)

In [None]:
# Evaluate every category with k-fold cross-validation and ordinal classification.
# `get_train_test_split` reads the loop variable `category_index` as a global,
# so it has to be called from inside this loop.
for category_index, category_name in enumerate(category_names):
    print('Evaluating category `{}`...'.format(category_name))
    category_size = category_sizes[category_name]
    # Start cross-validation.
    num_correct_total = 0
    for fold in range(folds):
        print('Starting fold {}...'.format(fold + 1))
        # Split data into train and test sets for this fold.
        previews_train, previews_test, y_train, y_test = get_train_test_split(perm, fold, folds)
        # Create vectorized representations of the book previews.
        # Be fair: the vocabulary and weights are learned from the training data only.
        vectorizer = get_vectorizer()
        X_train = vectorizer.fit_transform(previews_train)
        X_test = vectorizer.transform(previews_test)
        # Perform ordinal classification.
        # Column k holds P(Target > Class_k) for k = 0..(category_size - 2).
        ordinal_ps = np.zeros((len(y_test), category_size - 1))
        for ordinal_index in range(category_size - 1):
            classifier = get_classifier()
            y_train_ordinal = to_ordinal(y_train, ordinal_index)
            classifier.fit(X_train, y_train_ordinal)
            # Use the probability of the positive class, not the hard 0/1
            # prediction. The columns of `predict_proba` follow `classes_`; if
            # no training label exceeds this threshold there is no positive
            # column and the probability stays 0.
            positive = np.flatnonzero(classifier.classes_ == 1)
            if positive.size:
                ordinal_ps[:, ordinal_index] = classifier.predict_proba(X_test)[:, positive[0]]
        # Calculate the actual class label probabilities:
        #   P(Class_0)    = 1 - P(Target > Class_0)
        #   P(Class_k)    = P(Target > Class_{k-1}) - P(Target > Class_k)
        #   P(Class_last) = P(Target > Class_{last-1})
        # Padding with a column of ones on the left and zeros on the right
        # expresses all three cases as one difference of neighbouring columns,
        # and also covers a category that has a single level.
        edge_shape = (len(y_test), 1)
        exceed_ps = np.hstack((np.ones(edge_shape), ordinal_ps, np.zeros(edge_shape)))
        ps = exceed_ps[:, :-1] - exceed_ps[:, 1:]
        # Choose the most likely class label.
        y_pred = np.argmax(ps, axis=1)
        # Count the correctly classified test samples of this fold.
        num_correct = int(np.sum(y_pred == y_test))
        num_correct_total += num_correct
    # Every sample is tested exactly once across the folds.
    accuracy = num_correct_total / len(y)
    print('Accuracy: {:.4%}'.format(accuracy))