In [None]:
# Move the working directory two levels up.
# NOTE(review): presumably this is the repository root, so that the `python.*` imports below resolve — confirm.
%cd ../..

# Charts

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt
import sklearn

from python import folders
from python.util.evaluation import METRIC_NAMES
from python.sites.bookcave import bookcave

In [None]:
# Display name of every category, in `bookcave.CATEGORIES` order, followed by a trailing 'Average' entry.
category_names = [bookcave.CATEGORY_NAMES[category] for category in bookcave.CATEGORIES]
category_names.append('Average')

# Reverse lookup from a category key to its position in `bookcave.CATEGORIES`.
category_to_index = {category: position for position, category in enumerate(bookcave.CATEGORIES)}

## Paragraphs

In [None]:
metric_names = ['Accuracy', 'F1 Macro', 'MSE']


def read_model_category_metrics(path, n_categories=None, n_metrics=None):
    """
    Read per-model, per-category metric values from a plain-text results file.

    Expected layout, one item per line:
      1. the number of models `n`;
      2. `n` lines, each holding a model name;
      3. for every model, for every category, one line per metric holding a single float.

    Args:
        path: Path of the UTF-8 encoded results file.
        n_categories: Number of categories stored for each model.
            Defaults to `len(category_names)`.
        n_metrics: Number of metric lines stored for each category.
            Defaults to `len(metric_names)`.

    Returns:
        A tuple `(model_names, metrics)`: the list of model names and a float array of shape
        `(n_models, n_categories, n_metrics)`.

    Raises:
        ValueError: If the model count or a metric line is missing or not numeric.
    """
    if n_categories is None:
        n_categories = len(category_names)
    if n_metrics is None:
        n_metrics = len(metric_names)

    def read_line(fd):
        # Remove only the line terminator. Blindly slicing off the last character would
        # truncate the final value when the file does not end with a newline.
        return fd.readline().rstrip('\n')

    with open(path, 'r', encoding='utf-8') as fd:
        n_models = int(read_line(fd))
        model_names = [read_line(fd) for _ in range(n_models)]
        n_values = n_models * n_categories * n_metrics
        values = [float(read_line(fd)) for _ in range(n_values)]

    # The values are stored model-major, then category, then metric, so a plain reshape restores
    # the nesting. It also keeps the array 3-D when the file lists zero models.
    metrics = np.array(values, dtype=float).reshape((n_models, n_categories, n_metrics))
    return model_names, metrics

In [None]:
# Load the baseline classifiers' paragraph-level results from the input folder.
paragraphs_baselines_path = os.path.join(
    folders.INPUT_PATH,
    'predict_paragraphs_baselines.txt')
(baseline_names,
 baseline_category_metrics) = read_model_category_metrics(paragraphs_baselines_path)
# Show the array dimensions as the cell output.
np.shape(baseline_category_metrics)

In [None]:
# Load the paragraph-level results of the trained models.
# NOTE(review): presumably the results file was produced by these saved models — confirm:
# models/paragraph_cnn_max_ordinal/33063788_overall_max-agg.h5
# models/paragraph_rnn_max_ordinal/33063789_overall_max-agg.h5
# models/paragraph_rnncnn_max_ordinal/33063790_overall_max-agg.h5
paragraphs_path = os.path.join(
    folders.INPUT_PATH,
    'predict_paragraphs.txt')
(model_names,
 model_category_metrics) = read_model_category_metrics(paragraphs_path)
# Show the array dimensions as the cell output.
np.shape(model_category_metrics)

In [None]:
# Put baselines first and trained models second, both in the name list and along the first array axis.
classifier_names = [*baseline_names, *model_names]
classifier_category_metrics = np.concatenate(
    (baseline_category_metrics, model_category_metrics),
    axis=0)
# Show the combined array dimensions as the cell output.
np.shape(classifier_category_metrics)

## Books

In [None]:
def read_model_category_metrics_book(path, metric_indices, n_categories=None):
    """
    Read selected per-model, per-category metrics from a pipe-delimited results file.

    Expected layout:
      1. one line holding the number of models `n`;
      2. `n` lines, each holding a model name;
      3. for every model, one line per category. Each such line is split on `|`; the first and
         last fields are discarded and the remaining fields are parsed as floats.

    Args:
        path: Path of the UTF-8 encoded results file.
        metric_indices: Sequence of positions (within the parsed floats of a row) to keep,
            in the order they should appear in the output.
        n_categories: Number of category rows stored for each model.
            Defaults to `len(category_names)`.

    Returns:
        A tuple `(model_names, metrics)`: the list of model names and a float array of shape
        `(n_models, n_categories, len(metric_indices))`.

    Raises:
        ValueError: If the model count or a metric field is missing or not numeric.
        IndexError: If a row holds fewer metrics than an entry of `metric_indices` requires.
    """
    if n_categories is None:
        n_categories = len(category_names)

    with open(path, 'r', encoding='utf-8') as fd:
        n_models = int(fd.readline().strip())
        model_names = [fd.readline().rstrip('\n') for _ in range(n_models)]
        model_category_metrics = []
        for _ in range(n_models):
            category_metrics = []
            for _ in range(n_categories):
                # Remove only the line terminator. Slicing off the last character would eat the
                # closing `|` of a final row that lacks a newline, and `[1:-1]` would then drop
                # that row's last metric.
                fields = fd.readline().rstrip('\n').split('|')[1:-1]
                all_metrics = [float(field) for field in fields]
                category_metrics.append([all_metrics[index] for index in metric_indices])
            model_category_metrics.append(category_metrics)

    # The explicit reshape keeps the result 3-D even when the file lists zero models.
    shape = (n_models, n_categories, len(metric_indices))
    return model_names, np.array(model_category_metrics, dtype=float).reshape(shape)

In [None]:
# Positions of the metrics to keep from each pipe-delimited row of the book results files.
metric_indices = [0, 3, 7]  # [Accuracy, F1 Macro, MSE]
books_path = os.path.join(
    folders.INPUT_PATH,
    'predict_books.txt')
(model_names_book,
 model_category_metrics_book) = read_model_category_metrics_book(books_path, metric_indices)
# Show the array dimensions as the cell output.
np.shape(model_category_metrics_book)

In [None]:
# Load the baseline classifiers' book-level results, keeping the same metric columns as above.
books_baselines_path = os.path.join(
    folders.INPUT_PATH,
    'predict_books_baselines.txt')
(baseline_names_book,
 baseline_category_metrics_book) = read_model_category_metrics_book(books_baselines_path, metric_indices)
# Show the array dimensions as the cell output.
np.shape(baseline_category_metrics_book)

In [None]:
# Put baselines first and trained models second, both in the name list and along the first array axis.
classifier_names_book = [*baseline_names_book, *model_names_book]
classifier_category_metrics_book = np.concatenate(
    (baseline_category_metrics_book, model_category_metrics_book),
    axis=0)
# Show the combined array dimensions as the cell output.
np.shape(classifier_category_metrics_book)

## Plot

In [None]:
def plot_bar(classifier_values, classifier_names, tick_names, title, ylabel, xlabel=None, legend=False, save_path=None, figsize=(16, 4.8), gap=.15):
    """
    Draw a grouped bar chart: one group per tick, one bar per classifier within each group.

    Args:
        classifier_values: Sized collection with one entry per classifier; each entry holds that
            classifier's bar heights, one per tick.
        classifier_names: Legend labels, in the same order as `classifier_values`.
        tick_names: Labels of the x-axis ticks (one per group).
        title: Title of the chart.
        ylabel: Label of the y axis.
        xlabel: Optional label of the x axis.
        legend: Whether to draw a legend below the chart.
        save_path: Optional path the figure is saved to before it is shown.
        figsize: Figure size passed to matplotlib.
        gap: Fraction of each unit-wide group that is left empty between groups.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ticks = np.arange(len(tick_names))

    # Each group occupies `1 - gap` of the space around its tick, split evenly among classifiers.
    group_width = 1 - gap
    width = group_width / len(classifier_values)
    for i, values in enumerate(classifier_values):
        # Shift the i-th bar so that the whole group is centred on its tick.
        ax.bar(ticks + i * width - group_width / 2 + width / 2, values, width=width)

    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_names, rotation=-16.875, ha='left')
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if legend:
        ax.legend(classifier_names, loc='upper center', bbox_to_anchor=(.5, -0.25), ncol=5)
    if save_path is not None:
        fig.savefig(save_path, bbox_inches='tight')
    plt.show()

In [None]:
# One grouped bar chart per metric (last array axis) for the paragraph-level results.
n_paragraph_metrics = classifier_category_metrics.shape[2]
for i in range(n_paragraph_metrics):
    ylabel = metric_names[i]
    title = '{} of All Classifiers over Individual Paragraphs by Category'.format(ylabel)
    # Only the chart for metric index 2 carries the legend.
    legend = (i == 2)
    save_path = os.path.join(folders.FIGURES_PATH, 'classifier_category_metrics_{:d}'.format(i))
    plot_bar(
        classifier_category_metrics[..., i],
        classifier_names,
        list(category_names),
        title=title,
        ylabel=ylabel,
        legend=legend,
        save_path=save_path,
    )

In [None]:
# One grouped bar chart per metric (last array axis) for the book-level results.
for i in range(classifier_category_metrics_book.shape[2]):
    title = '{} of All Classifiers over Entire Books by Category'.format(metric_names[i])
    ylabel = metric_names[i]
    # Only the chart for metric index 2 carries the legend.
    legend = i == 2
    # Save under the project's figures folder, like every other figure in this notebook,
    # instead of a path relative to the current working directory.
    save_path = os.path.join(folders.FIGURES_PATH, 'classifier_category_metrics_book_{:d}'.format(i))
    plot_bar(classifier_category_metrics_book[:, :, i],
             classifier_names_book,
             category_names,
             title,
             ylabel,
             legend=legend,
             save_path=save_path)

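Simplify: plot only the `Average` scores (accuracy and macro-averaged F1) of each classifier, as hatched black-and-white bars.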

In [None]:
# Hatch patterns used to tell bars apart without relying on colour.
hatches = [
    '/', '\\', '|', '-', '+',
    'x', 'o', 'O', '.', '*',
]

In [None]:
def plot_average(metric_values, names, titles, ylabels, xlabels=None, save_path=None, figsize=(12, 4.8)):
    """
    Draw one black-and-white hatched bar chart per metric, side by side, with a shared legend.

    Args:
        metric_values: Sized collection with one entry per subplot; each entry holds one bar
            height per classifier.
        names: Legend labels, one per bar.
        titles: Subplot titles, one per entry of `metric_values`.
        ylabels: Y-axis labels, one per entry of `metric_values`.
        xlabels: Optional x-axis labels, one per entry of `metric_values`.
        save_path: Optional path the figure is saved to before it is shown.
        figsize: Figure size passed to matplotlib.
    """
    # `squeeze=False` always yields a 2-D array of axes, so a single metric works as well;
    # with the default, a 1x1 grid returns a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(1, len(metric_values), figsize=figsize, squeeze=False)
    for i, values in enumerate(metric_values):
        ax = axes[0, i]
        for j, value in enumerate(values):
            # Step through the hatch list with a stride of 7 (wrapping around) to pick each bar's pattern.
            ax.bar(j, value, color='w', hatch=hatches[7 * j % len(hatches)], edgecolor='k')
        ax.set_title(titles[i])
        # Pin one tick per bar before assigning the blank labels. Labelling without fixed tick
        # positions is discouraged by matplotlib and may put labels in unexpected places.
        ax.set_xticks(list(range(len(values))))
        ax.set_xticklabels([' '] * len(values))
        ax.set_ylabel(ylabels[i])
        if xlabels is not None:
            ax.set_xlabel(xlabels[i])
    legend = fig.legend(names, loc='lower center', bbox_to_anchor=(.52, -.0025), ncol=5, fontsize='medium')
    for i, patch in enumerate(legend.get_patches()):
        # Double the height of every legend swatch and move it down by half its original height.
        # Odd-indexed swatches are moved down by a further 10%.
        # NOTE(review): presumably a manual alignment tweak — confirm.
        height = patch.get_height()
        patch.set_y(patch.get_y() - height / 2 - (height * .1 if i % 2 == 1 else 0))
        patch.set_height(height * 2)
    fig.tight_layout()
    if save_path is not None:
        fig.savefig(save_path, bbox_inches='tight')
    plt.show()

In [None]:
# Summary chart for paragraphs: the last category row (index -1) of the first two metrics.
plot_average(
    [classifier_category_metrics[:, -1, metric] for metric in (0, 1)],
    classifier_names,
    titles=['Classification Accuracy for Paragraphs', 'Macro-averaged F1 Score for Paragraphs'],
    ylabels=['Accuracy', 'F1 Score'],
    xlabels=[' ', ' '],
    save_path=os.path.join(folders.FIGURES_PATH, 'results_average_paragraph'),
)

In [None]:
# Summary chart for books: the last category row (index -1) of the first two metrics.
plot_average(
    [classifier_category_metrics_book[:, -1, metric] for metric in (0, 1)],
    classifier_names_book,
    titles=['Classification Accuracy for Books', 'Macro-averaged F1 Score for Books'],
    ylabels=['Accuracy', 'F1 Score'],
    xlabels=[' ', ' '],
    save_path=os.path.join(folders.FIGURES_PATH, 'results_average_book'),
)

Print the rows of a LaTeX table with the results, underlining the best score in each row.

In [None]:
# Print LaTeX table rows for the book-level results: one block of rows per metric (the first
# two metrics only), one row per category, one column per classifier.
for m in range(2):
    for j, category in enumerate(category_names):
        # The highest score in the row is treated as the best and gets underlined.
        best_score = max(classifier_category_metrics_book[:, j, m])
        # The backslash is escaped explicitly: a bare '\m' is an invalid escape sequence that
        # newer Python versions warn about.
        print('\\multicolumn{{1}}{{|r|}}{{{}}}'.format(category.replace('&', '\\&')), end='')
        for i in range(len(classifier_names_book)):
            score = classifier_category_metrics_book[i, j, m]
            cell = '{:.4f}'.format(score)
            if score == best_score:
                cell = '\\underline{' + cell + '}'
            print(' & ' + cell, end='')
        print('\\\\')
    print()

In [None]:
# Print LaTeX table rows for the paragraph-level results: one block of rows per metric (the
# first two metrics only), one row per category, one column per classifier.
for m in range(2):
    for j, category in enumerate(category_names):
        best_score = max(classifier_category_metrics[:, j, m])
        # Build the whole row first, then emit it with a single print.
        line = '{}'.format(category.replace('&', '\\&'))
        for i, classifier in enumerate(classifier_names):
            score = classifier_category_metrics[i, j, m]
            is_best = score == best_score
            # Scores equal to the row maximum are wrapped in \underline{...}.
            line += ' & {}{:.4f}{}'.format('\\underline{' if is_best else '', score, '}' if is_best else '')
        print(line + '\\\\')
    print()