# predict_paragraphs_baselines

In [None]:
import os

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from classification import baselines, evaluation, ordinal, shared_parameters
import folders
import predict_paragraphs
from sites.bookcave import bookcave

## Load paragraphs

In [None]:
# Data-loading configuration, taken from the project-wide shared parameters.
source = 'paragraph_tokens'
subset_ratio = shared_parameters.DATA_SUBSET_RATIO
subset_seed = shared_parameters.DATA_SUBSET_SEED
min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
min_tokens = shared_parameters.DATA_MIN_TOKENS
categories_mode = shared_parameters.DATA_CATEGORIES_MODE
return_overall = shared_parameters.DATA_RETURN_OVERALL

# Request only the `source` input; `return_meta=True` makes the call return
# the nine values unpacked below.
data = bookcave.get_data(
    {source},
    subset_ratio=subset_ratio,
    subset_seed=subset_seed,
    min_len=min_len,
    max_len=max_len,
    min_tokens=min_tokens,
    categories_mode=categories_mode,
    return_overall=return_overall,
    return_meta=True,
)
inputs, Y, categories, category_levels, book_ids, books_df, _, _, categories_df = data

# Transpose the per-text entries of `inputs[source]` and keep the first
# component of each entry (one sequence of paragraph token lists per text).
source_columns = list(zip(*inputs[source]))
text_source_tokens = source_columns[0]
len(text_source_tokens)

## Paragraph labels

In [None]:
# Parallel lists, one entry per paragraph that is labeled in every category:
#   predict_locations      -> (text index, paragraph index)
#   predict_tokens         -> the paragraph's tokens
#   predict_source_labels  -> one label per category
predict_locations = []
predict_tokens = []
predict_source_labels = []
for text_i, source_tokens in enumerate(text_source_tokens):
    # Look up the book's ASIN, which is the key used to fetch paragraph labels.
    book_id = book_ids[text_i]
    book_rows = books_df[books_df['id'] == book_id]
    asin = book_rows.iloc[0]['asin']
    category_labels = [bookcave.get_labels(asin, category) for category in categories]

    # Skip the whole text when any category has no labels for it.
    has_all_categories = all(labels is not None for labels in category_labels)
    if not has_all_categories:
        continue

    for source_i, tokens in enumerate(source_tokens):
        source_labels = [labels[source_i] for labels in category_labels]
        # A label of -1 marks a paragraph to exclude; keep only paragraphs
        # without any -1 label.
        if all(label != -1 for label in source_labels):
            predict_locations.append((text_i, source_i))
            predict_tokens.append(tokens)
            predict_source_labels.append(source_labels)

In [None]:
# `predict_source_labels` holds one row per paragraph with one label per
# category; transposing gives one row per category.
Q_true = np.asarray(predict_source_labels).T
Q_true.shape

In [None]:
# Fixed seed so the selected indices are the same on every run.
seed = 1

# One index selection per category, computed from that category's true labels.
# NOTE(review): presumably a class-balanced subsample of the paragraphs --
# confirm in `predict_paragraphs.get_balanced_indices`.
category_balanced_indices = []
for j, q_true in enumerate(Q_true):
    n_levels = len(category_levels[j])
    balanced_indices = predict_paragraphs.get_balanced_indices(q_true, minlength=n_levels, seed=seed)
    category_balanced_indices.append(balanced_indices)

### Zero Rule

In [None]:
def predict_zero_r(category_indices=None, overall_last=None):
    """Evaluate a Zero Rule baseline on the paragraph labels of each category.

    For every category, the most frequent true label is predicted for all
    paragraphs. The confusion matrix and the first metric are printed per
    category, followed by the first metric averaged over the categories.

    Args:
        category_indices: Optional sequence with one index collection per
            category. When given, category `j` is evaluated only on
            `Q_true[j][category_indices[j]]`.
        overall_last: Whether the last category is the aggregated "overall"
            category, which is then left out of the average. Defaults to the
            module-level `return_overall` flag that was passed to
            `bookcave.get_data`.
    """
    if overall_last is None:
        overall_last = return_overall

    category_metrics_zero = []
    print('ZeroR')
    for j, category in enumerate(categories):
        print()
        print(category)
        q_true = Q_true[j]
        if category_indices is not None:
            q_true = q_true[category_indices[j]]
        # `minlength` keeps the count vector one entry per level even when
        # the highest levels do not occur in `q_true`.
        level_counts = np.bincount(q_true, minlength=len(category_levels[j]))
        q_pred_zero = [np.argmax(level_counts)]*len(q_true)
        confusion_zero, metrics_zero = evaluation.get_confusion_and_metrics(q_true, q_pred_zero)
        print(confusion_zero)
        print(metrics_zero[0])
        category_metrics_zero.append(metrics_zero)

    # Only drop the last category from the average when it really is the
    # "overall" one; otherwise every category counts.
    averaged = category_metrics_zero[:-1] if overall_last else category_metrics_zero
    if not averaged:
        # Nothing to average (avoids dividing by zero).
        return
    print()
    print('Average')
    metrics_avg_zero = [sum(metrics_zero[i] for metrics_zero in averaged)/len(averaged)
                        for i in range(len(averaged[0]))]
    print(metrics_avg_zero[0])

In [None]:
# Zero Rule baseline evaluated on every labeled paragraph.
predict_zero_r()

In [None]:
# Zero Rule baseline evaluated only on the per-category balanced indices.
predict_zero_r(category_indices=category_balanced_indices)

### Bag-of-words (count-based)

In [None]:
def identity(v):
    """Return `v` unchanged.

    Passed to `TfidfVectorizer` as both `preprocessor` and `tokenizer`, since
    the documents given to the vectorizer are already lists of tokens.
    """
    return v

In [None]:
max_words = shared_parameters.TEXT_MAX_WORDS
# The documents are already token lists, so the preprocessing and tokenizing
# steps are replaced with `identity`; `token_pattern=None` because the pattern
# is not used when a custom tokenizer is given.
vectorizer = TfidfVectorizer(
    preprocessor=identity,
    tokenizer=identity,
    analyzer='word',
    token_pattern=None,
    max_features=max_words,
    norm='l2',
    sublinear_tf=True)

# One flat token list per text: the tokens of all its paragraphs, in order.
text_tokens = []
for source_tokens in text_source_tokens:
    text_tokens.append([token for tokens in source_tokens for token in tokens])
X_w = vectorizer.fit_transform(text_tokens)

# Vocabulary size. `vocabulary_` has one entry per feature, so this equals the
# number of feature names; unlike `get_feature_names()` (removed in
# scikit-learn 1.2) it is available in every scikit-learn version.
len(vectorizer.vocabulary_)

In [None]:
# Same split settings as the rest of the project, so the training portion used
# here comes from the shared evaluation parameters.
test_size = shared_parameters.EVAL_TEST_SIZE  # b
test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE

# Samples must be on the first axis for the split.
Y_T = np.transpose(Y)  # (n, c)
split_parts = train_test_split(X_w, Y_T, test_size=test_size, random_state=test_random_state)
# Only the training portions are kept; the test portions are discarded.
X_w_train, _, Y_train_T, _ = split_parts

# Back to one row per category.
Y_train = np.transpose(Y_train_T)  # (c, n * (1 - b))

Fit each baseline on the book-level training split, then predict and evaluate the paragraph-level labels.

In [None]:
def predict_baselines(create_model, P_w_predict, category_indices=None, overall_last=None):
    """Fit one baseline per category on the training split and evaluate it on paragraphs.

    For every category, classifiers are fitted with `baselines.fit_ordinal` on
    `X_w_train` / `Y_train`, paragraph labels are predicted with
    `baselines.predict_ordinal`, and the confusion matrix and first metric are
    printed. Finally the first metric averaged over the categories is printed.

    Args:
        create_model: Model factory passed through to `baselines.fit_ordinal`.
            Its `__name__` (without a leading `create_`) is printed as the
            heading.
        P_w_predict: Vectorized paragraphs to predict on; must support
            indexing with the entries of `category_indices`.
        category_indices: Optional sequence with one index collection per
            category. When given, category `j` is predicted and evaluated only
            on the rows selected by `category_indices[j]`.
        overall_last: Whether the last category is the aggregated "overall"
            category, which is then left out of the average. Defaults to the
            module-level `return_overall` flag that was passed to
            `bookcave.get_data`.
    """
    if overall_last is None:
        overall_last = return_overall

    print()
    # Strip the factory prefix only when it is actually there, instead of
    # blindly cutting the first seven characters.
    model_name = create_model.__name__
    prefix = 'create_'
    if model_name.startswith(prefix):
        model_name = model_name[len(prefix):]
    print(model_name)

    # Fit models.
    category_classifiers = []
    for j, _ in enumerate(categories):
        y_train = Y_train[j]
        k = len(category_levels[j])
        classifiers = baselines.fit_ordinal(create_model, X_w_train, y_train, k)
        category_classifiers.append(classifiers)

    # Predict.
    Q_w_pred = []
    for j, classifiers in enumerate(category_classifiers):
        k = len(category_levels[j])
        if category_indices is None:
            P_w_category = P_w_predict
        else:
            P_w_category = P_w_predict[category_indices[j]]
        Q_w_pred.append(baselines.predict_ordinal(classifiers, P_w_category, k))

    # Evaluate.
    category_metrics = []
    for j, category in enumerate(categories):
        print()
        print(category)
        q_true = Q_true[j]
        if category_indices is not None:
            q_true = q_true[category_indices[j]]
        confusion, metrics = evaluation.get_confusion_and_metrics(q_true, Q_w_pred[j])
        print(confusion)
        print(metrics[0])
        category_metrics.append(metrics)

    # Average. Only drop the last category when it really is the "overall"
    # one; otherwise every category counts.
    averaged = category_metrics[:-1] if overall_last else category_metrics
    if not averaged:
        # Nothing to average (avoids dividing by zero).
        return
    print()
    print('Average')
    metrics_avg = [sum(metrics[i] for metrics in averaged)/len(averaged)
                   for i in range(len(averaged[0]))]
    print(metrics_avg[0])

In [None]:
# Model factories for the baselines; each is passed to `predict_baselines`.
create_models = [
    baselines.create_k_nearest_neighbors,
    baselines.create_linear_regression,
    baselines.create_logistic_regression,
    baselines.create_multinomial_naive_bayes,
    baselines.create_random_forest,
    baselines.create_svm]
# Vectorize the individual paragraphs with the `vectorizer` that was fitted on
# whole texts.
P_w_predict = vectorizer.transform(predict_tokens)
for create_model in create_models:
    # First evaluate on every labeled paragraph...
    predict_baselines(create_model, P_w_predict)
    print()
    print('Balanced')
    # ...then only on the per-category balanced indices.
    predict_baselines(create_model, P_w_predict, category_indices=category_balanced_indices)

### MLP

In [None]:
# Path of a saved MLP model under the project's models folder.
# NOTE(review): the file name '32662682.h5' is hard-coded -- confirm it is the
# intended model before comparing results.
mlp_path = os.path.join(folders.MODELS_PATH, 'multi_layer_perceptron', 'ordinal', '32662682.h5')
# Evaluate on every labeled paragraph. `overall_last` is given the same flag
# that was passed to `bookcave.get_data` as `return_overall`.
predict_paragraphs.load_model_and_evaluate(mlp_path,
                                           P_w_predict,
                                           Q_true,
                                           categories,
                                           overall_last=return_overall)
# Same evaluation, restricted to the per-category balanced indices.
predict_paragraphs.load_model_and_evaluate(mlp_path,
                                           P_w_predict,
                                           Q_true,
                                           categories,
                                           overall_last=return_overall,
                                           category_indices=category_balanced_indices)