In [None]:
# Move the working directory up two levels before anything else runs.
# NOTE(review): presumably this makes the `python` package imported below resolvable from the
# repository root -- confirm against the notebook's location in the repo.
%cd ../..

# window_charts

In [None]:
import os

from matplotlib import pyplot as plt

from python import folders
from python.sites.bookcave import bookcave
from python.util import evaluation

## Write

In [None]:
# One (log sub-directory key, display name) pair per classifier, in chart order.
# Keeping the pairs together prevents the two derived lists from drifting apart.
_CLASSIFIER_TABLE = (
    ('k_nearest_neighbors', 'K Nearest Neighbors'),
    ('logistic_regression', 'Logistic Regression'),
    ('multi_layer_perceptron', 'Multi-layer Perceptron'),
    ('multinomial_naive_bayes', 'Multinomial Naïve Bayes'),
    ('random_forest', 'Random Forest'),
    ('svm', 'SVM'),
    ('book_net', 'BookNet'),
)
CLASSIFIERS = [key for key, _name in _CLASSIFIER_TABLE]
CLASSIFIER_NAMES = [name for _key, name in _CLASSIFIER_TABLE]

# Base file name shared by the log files of every non-'book_net' classifier.
BASELINES_FILE_NAME = '36100418'

# Base file names of the 'book_net' logs; the write loop looks one up by category index.
NET_FILE_NAMES = [
    '35082769',  # remove 3
    '35082760_trainemb',  # remove 3
    '35082771',
    '35082762_trainemb',
    '35082763_trainemb',  # remove 3
    '35082764_trainemb',
    '35082765_trainemb',
    '35082776',
]

# Paragraph window sizes; each one appears in a log file name as '<size>w'.
WINDOW_SIZES = [1, 3, 5, 7]

In [None]:
def find_results_line(path, category):
    """Return the first line in the file at `path` that starts with `category`.

    The line is returned exactly as read, including its trailing newline if it has one.
    Returns None when no line in the file starts with `category`.
    """
    with open(path, 'r') as fd:
        # The generator is consumed inside the `with`, so the file is still open while scanning.
        return next((row for row in fd if row.startswith(category)), None)


def write_size_results_lines(fd, classifier, base_file_name, j):
    """Write one results line per window size to `fd`.

    For every size in WINDOW_SIZES, the log file
    `<folders.LOGS_PATH>/<classifier>/<base_file_name>_<j>_<size>w.txt` is searched for the line
    that starts with `bookcave.CATEGORIES[j]`. The text between that line's first and last '|'
    is written to `fd` with all spaces removed, followed by a newline.

    :param fd: Writable text file object that receives the lines.
    :param classifier: Name of the sub-directory of `folders.LOGS_PATH` holding the logs.
    :param base_file_name: Leading part of the log file names.
    :param j: Index of the category in `bookcave.CATEGORIES`.
    :raises ValueError: If a log file has no line for the category, or that line has no '|'.
    """
    category = bookcave.CATEGORIES[j]
    for size in WINDOW_SIZES:
        file_name = '{}_{:d}_{:d}w.txt'.format(base_file_name, j, size)
        path = os.path.join(folders.LOGS_PATH, classifier, file_name)
        line = find_results_line(path, category)
        if line is None:
            # Fail with the offending file instead of an AttributeError on `None.index`.
            raise ValueError('No results line for category {!r} in {}'.format(category, path))
        first_bar = line.index('|')
        last_bar = line.rindex('|')
        stripped_line = line[first_bar + 1:last_bar].replace(' ', '')
        fd.write(stripped_line + '\n')


# Write one 'window_<j>j.txt' file per category: for each classifier, its display name
# followed by one results line per window size.
for j in range(len(bookcave.CATEGORIES)):
    window_path = os.path.join(folders.INPUT_PATH, 'window_{:d}j.txt'.format(j))
    with open(window_path, 'w') as fd:
        for classifier, classifier_name in zip(CLASSIFIERS, CLASSIFIER_NAMES):
            fd.write(classifier_name + '\n')
            # 'book_net' logs sit in a differently named folder and use a per-category base name.
            is_book_net = classifier == 'book_net'
            logs_folder = 'paragraph_max_ordinal' if is_book_net else classifier
            base_file_name = NET_FILE_NAMES[j] if is_book_net else BASELINES_FILE_NAME
            write_size_results_lines(fd, logs_folder, base_file_name, j)

## Read and plot

In [None]:
# Read the 'window_<j>j.txt' files back into a nested list indexed as
# [category][classifier][window size][metric].
category_classifier_window_metrics = []
for j, category in enumerate(bookcave.CATEGORIES):
    window_path = os.path.join(folders.INPUT_PATH, 'window_{:d}j.txt'.format(j))
    with open(window_path, 'r') as fd:
        per_classifier = []
        for _classifier in CLASSIFIERS:
            fd.readline()  # Skip the classifier name line.
            # One '|'-separated metrics line per window size; `[:-1]` drops the newline.
            per_classifier.append([
                [float(value) for value in fd.readline()[:-1].split('|')]
                for _size in WINDOW_SIZES
            ])
    category_classifier_window_metrics.append(per_classifier)

# Calculate average metrics over non-overall categories.
# Every category except the last one in `bookcave.CATEGORIES` takes part in the average.
num_averaged_categories = len(bookcave.CATEGORIES[:-1])


def _mean(values):
    """Return the arithmetic mean of a non-empty list of numbers."""
    return sum(values)/len(values)


# Indexed as [classifier][window size][metric], like one entry of the per-category list.
average_classifier_window_metrics = [
    [
        [
            _mean([category_classifier_window_metrics[j][i][w][m]
                   for j in range(num_averaged_categories)])
            for m in range(len(evaluation.METRIC_NAMES))
        ]
        for w in range(len(WINDOW_SIZES))
    ]
    for i in range(len(CLASSIFIERS))
]

# Store the averages as one extra trailing pseudo-category.
category_classifier_window_metrics.append(average_classifier_window_metrics)

In [None]:
# Index of the charted metric within each metrics row:
# 7 = mean absolute error, 8 = mean squared error.
M = 8
# Y-axis label for each supported choice of M, so the label follows M instead of being fixed.
METRIC_AXIS_LABELS = {7: 'Mean Absolute Error', 8: 'Mean Squared Error'}

# The trailing 'Average' entry names the averaged metrics appended after the real categories.
CATEGORY_NAMES = [bookcave.CATEGORY_NAMES[c] for c in bookcave.CATEGORIES] + ['Average']
gap = 0.2  # Between groups of bars.
group_width = 1 - gap
W = len(WINDOW_SIZES)
bar_width = group_width / W

# Width of the first column of the printed table; it does not depend on the category.
classifier_name_max_len = max(map(len, CLASSIFIER_NAMES))

for j, category_name in enumerate(CATEGORY_NAMES):
    classifier_window_metrics = category_classifier_window_metrics[j]

    # Plot: one group of W bars per classifier, centered on the classifier's integer x position.
    plt.figure(figsize=(12, 7.2))
    for i in range(len(CLASSIFIER_NAMES)):
        window_metrics = classifier_window_metrics[i]
        x = [i + w*bar_width - group_width/2 + bar_width/2 for w in range(W)]
        y = [metrics[M] for metrics in window_metrics]
        plt.bar(x, y, width=bar_width)
    plt.title(evaluation.METRIC_ABBREVIATIONS[M] +
              ' of Books Over Paragraph Window Sizes ' +
              ', '.join([str(size) for size in WINDOW_SIZES]) +
              ' - ' + category_name)
    # Fall back to the abbreviation already used in the title for any other metric index.
    plt.ylabel(METRIC_AXIS_LABELS.get(M, evaluation.METRIC_ABBREVIATIONS[M]))
    plt.xticks(list(range(len(CLASSIFIER_NAMES))), CLASSIFIER_NAMES, ha='left', rotation=-22.5)
    plt.show()

    # Print the same numbers as a text table: one row per classifier, one column per window size.
    header = '{:>{w}}'.format('CLASSIFIER', w=classifier_name_max_len)
    header += ''.join(' | {:^5d}'.format(size) for size in WINDOW_SIZES)
    print(header)
    print('-'*classifier_name_max_len + ('-+-' + '-'*5)*len(WINDOW_SIZES))
    for classifier_name, window_metrics in zip(CLASSIFIER_NAMES, classifier_window_metrics):
        row = '{:>{w}}'.format(classifier_name, w=classifier_name_max_len)
        row += ''.join(' | {:.3f}'.format(metrics[M]) for metrics in window_metrics)
        print(row)