# predict_paragraphs_baselines

In [1]:
import os

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from classification import baselines, evaluation, ordinal, shared_parameters
import folders
import predict_paragraphs
from sites.bookcave import bookcave

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Load paragraphs

In [2]:
# Data-loading settings are read from shared_parameters rather than being
# hard-coded in this notebook.
source = 'paragraph_tokens'
subset_ratio = shared_parameters.DATA_SUBSET_RATIO
subset_seed = shared_parameters.DATA_SUBSET_SEED
min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
min_tokens = shared_parameters.DATA_MIN_TOKENS
categories_mode = shared_parameters.DATA_CATEGORIES_MODE
# get_data is given a one-element set of sources. With return_meta=True it
# yields nine values here, two of which are discarded.
inputs, Y, categories, category_levels, book_ids, books_df, _, _, categories_df = \
    bookcave.get_data({source},
                      subset_ratio=subset_ratio,
                      subset_seed=subset_seed,
                      min_len=min_len,
                      max_len=max_len,
                      min_tokens=min_tokens,
                      categories_mode=categories_mode,
                      return_meta=True)
# Transpose the per-text entries and keep only their first component.
# NOTE(review): presumably the paragraph token lists of each text - confirm
# against bookcave.get_data.
text_source_tokens = list(zip(*inputs[source]))[0]
# Displayed as the cell output: the number of texts that were loaded.
len(text_source_tokens)

6393

## Paragraph labels

In [3]:
# Gather every paragraph that carries a usable label in all categories.
# The three lists below are parallel: entry i of each describes the same
# paragraph (its (text, paragraph) position, its tokens, and its labels).
predict_locations = []
predict_tokens = []
predict_source_labels = []
for text_i, source_tokens in enumerate(text_source_tokens):
    # Look up the ASIN of the book this text belongs to.
    book_id = book_ids[text_i]
    asin = books_df[books_df['id'] == book_id].iloc[0]['asin']
    category_labels = [bookcave.get_labels(asin, category) for category in categories]
    # Only books that have paragraph labels for every category are usable.
    if all(labels is not None for labels in category_labels):
        for source_i, tokens in enumerate(source_tokens):
            source_labels = [labels[source_i] for labels in category_labels]
            # A label of -1 in any category excludes the paragraph.
            if all(label != -1 for label in source_labels):
                predict_locations.append((text_i, source_i))
                predict_tokens.append(tokens)
                predict_source_labels.append(source_labels)

In [4]:
# Stack the per-paragraph label lists and transpose, so that each row of
# Q_true holds the labels of one category across all collected paragraphs.
Q_true = np.array(predict_source_labels).transpose()
Q_true.shape

(8, 796)

In [5]:
# NOTE(review): presumably collapses the per-category labels into a single
# overall label per paragraph - confirm against bookcave.get_overall_y.
q_true_overall = bookcave.get_overall_y(Q_true)

In [7]:
# One set of paragraph indices per category, derived from that category's row
# of Q_true with a fixed seed.
# NOTE(review): presumably selects an equal number of paragraphs per level -
# confirm against predict_paragraphs.get_balanced_indices.
seed = 1
category_balanced_indices = [predict_paragraphs.get_balanced_indices(q_true, minlength=len(category_levels[j]), seed=seed)
                             for j, q_true in enumerate(Q_true)]

### Zero Rule

In [12]:
def predict_zero_r(category_indices=None):
    """Evaluate a Zero Rule (majority-level) baseline on the paragraph labels.

    For each category, the most frequent true level is predicted for every
    evaluated paragraph; the confusion matrix and the first metric are printed.
    The metrics are then averaged over the categories, and finally the same
    baseline is evaluated on the overall labels.

    Reads the notebook globals ``categories``, ``category_levels``, ``Q_true``
    and ``q_true_overall``. Results are printed; nothing is returned.

    Parameters
    ----------
    category_indices : sequence, optional
        When given, ``category_indices[j]`` selects the paragraphs of category
        ``j`` that are evaluated. The overall evaluation always uses every
        paragraph, regardless of this argument.
    """
    category_metrics_zero = []
    print('ZeroR')
    for j, category in enumerate(categories):
        print()
        print(category)
        q_true = Q_true[j]
        if category_indices is not None:
            q_true = q_true[category_indices[j]]
        # minlength keeps the count vector as long as the number of levels even
        # when the highest levels never occur among these paragraphs.
        majority_level = np.argmax(np.bincount(q_true, minlength=len(category_levels[j])))
        q_pred_zero = [majority_level]*len(q_true)
        confusion_zero, metrics_zero = evaluation.get_confusion_and_metrics(q_true, q_pred_zero)
        print(confusion_zero)
        print(metrics_zero[0])
        category_metrics_zero.append(metrics_zero)
    print()
    print('Average')
    # Average metric-by-metric across categories. Zipping the per-category
    # metric lists iterates over metric positions; indexing by the number of
    # categories (as before) only worked when there happened to be at least as
    # many metrics as categories, and dropped any metrics beyond that count.
    metrics_avg_zero = [sum(values)/len(values) for values in zip(*category_metrics_zero)]
    print(metrics_avg_zero[0])
    print()
    print('Overall')
    q_pred_overall_zero = [np.argmax(np.bincount(q_true_overall))]*len(q_true_overall)
    confusion_overall_zero, metrics_overall_zero = evaluation.get_confusion_and_metrics(q_true_overall, q_pred_overall_zero)
    print(confusion_overall_zero)
    print(metrics_overall_zero[0])

In [13]:
# Zero Rule baseline over all collected paragraphs.
predict_zero_r()

ZeroR

crude_humor_language
[[414   0   0   0]
 [  4   0   0   0]
 [256   0   0   0]
 [122   0   0   0]]
0.5201005025125628

drug_alcohol_tobacco_use
[[774   0   0   0]
 [ 15   0   0   0]
 [  5   0   0   0]
 [  2   0   0   0]]
0.9723618090452262

kissing
[[729   0]
 [ 67   0]]
0.9158291457286433

profanity
[[402   0   0   0]
 [  6   0   0   0]
 [294   0   0   0]
 [ 94   0   0   0]]
0.5050251256281407

nudity
[[651   0   0   0]
 [  4   0   0   0]
 [ 18   0   0   0]
 [123   0   0   0]]
0.8178391959798995

sex_and_intimacy
[[565   0   0   0]
 [ 26   0   0   0]
 [ 57   0   0   0]
 [148   0   0   0]]
0.7097989949748744

violence_and_horror
[[723   0   0   0]
 [ 47   0   0   0]
 [ 22   0   0   0]
 [  4   0   0   0]]
0.9082914572864321

gay_lesbian_characters
[[778   0   0]
 [  2   0   0]
 [ 16   0   0]]
0.9773869346733668

Average
0.7908291457286432

Overall
[[363   0   0   0]
 [ 30   0   0   0]
 [216   0   0   0]
 [187   0   0   0]]
0.45603015075376885


In [14]:
# Zero Rule baseline restricted, per category, to the paragraphs selected by
# category_balanced_indices.
predict_zero_r(category_indices=category_balanced_indices)

ZeroR

crude_humor_language
[[4 0 0 0]
 [4 0 0 0]
 [4 0 0 0]
 [4 0 0 0]]
0.25

drug_alcohol_tobacco_use
[[2 0 0 0]
 [2 0 0 0]
 [2 0 0 0]
 [2 0 0 0]]
0.25

kissing
[[67  0]
 [67  0]]
0.5

profanity
[[6 0 0 0]
 [6 0 0 0]
 [6 0 0 0]
 [6 0 0 0]]
0.25

nudity
[[4 0 0 0]
 [4 0 0 0]
 [4 0 0 0]
 [4 0 0 0]]
0.25

sex_and_intimacy
[[26  0  0  0]
 [26  0  0  0]
 [26  0  0  0]
 [26  0  0  0]]
0.25

violence_and_horror
[[4 0 0 0]
 [4 0 0 0]
 [4 0 0 0]
 [4 0 0 0]]
0.25

gay_lesbian_characters
[[2 0 0]
 [2 0 0]
 [2 0 0]]
0.3333333333333333

Average
0.2916666666666667

Overall
[[363   0   0   0]
 [ 30   0   0   0]
 [216   0   0   0]
 [187   0   0   0]]
0.45603015075376885


### Bag-of-words (count-based)

In [15]:
def identity(v):
    """Return ``v`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``preprocessor`` and ``tokenizer`` so
    that the already-tokenized input is used as-is.
    """
    return v

In [16]:
max_words = shared_parameters.TEXT_MAX_WORDS
# The input is already tokenized, so preprocessing and tokenizing are replaced
# by the identity function and the regex token pattern is disabled.
vectorizer = TfidfVectorizer(
    preprocessor=identity,
    tokenizer=identity,
    analyzer='word',
    token_pattern=None,
    max_features=max_words,
    norm='l2',
    sublinear_tf=True)
# Concatenate the tokens of all paragraphs of a text into one token list, so
# the vectorizer is fitted on whole texts rather than on single paragraphs.
text_tokens = []
for source_tokens in text_source_tokens:
    all_tokens = []
    for tokens in source_tokens:
        all_tokens.extend(tokens)
    text_tokens.append(all_tokens)
X_w = vectorizer.fit_transform(text_tokens)
# Vocabulary size. `vocabulary_` maps each kept term to its feature index, so
# its length equals the number of features; unlike `get_feature_names()`
# (deprecated, then removed in newer scikit-learn) it exists in every version.
len(vectorizer.vocabulary_)

8192

In [17]:
# Split the text-level features and labels with the shared evaluation settings.
# Only the training portions are kept; the test portions are discarded here.
test_size = shared_parameters.EVAL_TEST_SIZE  # b
test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
# train_test_split splits along the first axis, so the labels are transposed
# for the split and transposed back afterwards.
Y_T = Y.transpose()  # (n, c)
X_w_train, _, Y_train_T, _ = train_test_split(X_w, Y_T, test_size=test_size, random_state=test_random_state)
Y_train = Y_train_T.transpose()  # (c, n * (1 - b))

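Fit each baseline on the text-level training split, then predict and evaluate the labels of individual paragraphs.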

In [20]:
def predict_baselines(create_model, P_w_predict, category_indices=None):
    """Fit one ordinal baseline per category and evaluate it on paragraphs.

    The classifiers are fitted on the notebook globals ``X_w_train`` and
    ``Y_train``, then used to predict labels for the rows of ``P_w_predict``.
    Per-category confusion matrices and the first metric are printed, followed
    by the metrics averaged over categories and, when every paragraph is
    evaluated, the overall result. Nothing is returned.

    Also reads the globals ``categories``, ``category_levels``, ``Q_true`` and
    ``q_true_overall``.

    Parameters
    ----------
    create_model : callable
        Model factory handed to ``baselines.fit_ordinal``. Its ``__name__``
        minus the first seven characters is printed as the heading.
    P_w_predict : matrix
        Feature rows to predict, one per paragraph in ``Q_true``.
    category_indices : sequence, optional
        When given, ``category_indices[j]`` selects the rows (and true labels)
        evaluated for category ``j``; the overall evaluation is then skipped.
    """
    print()
    # Drop the 7-character prefix of the factory name (e.g. 'create_').
    print(create_model.__name__[7:])

    # Fit models.
    category_classifiers = []
    for j, category in enumerate(categories):
        y_train = Y_train[j]
        k = len(category_levels[j])
        classifiers = baselines.fit_ordinal(create_model, X_w_train, y_train, k)
        category_classifiers.append(classifiers)

    # Predict.
    Q_w_pred = []
    for j, classifiers in enumerate(category_classifiers):
        k = len(category_levels[j])
        if category_indices is None:
            P_w_category = P_w_predict
        else:
            P_w_category = P_w_predict[category_indices[j]]
        Q_w_pred.append(baselines.predict_ordinal(classifiers, P_w_category, k))

    # Evaluate.
    category_metrics = []
    for j, category in enumerate(categories):
        print()
        print(category)
        q_true = Q_true[j]
        if category_indices is not None:
            q_true = q_true[category_indices[j]]
        q_w_pred = Q_w_pred[j]
        confusion, metrics = evaluation.get_confusion_and_metrics(q_true, q_w_pred)
        print(confusion)
        print(metrics[0])
        category_metrics.append(metrics)

    # Average.
    print()
    print('Average')
    # Zip the per-category metric lists so the average runs over metric
    # positions. Indexing by the number of categories (as before) only worked
    # when there were at least as many metrics as categories, and dropped any
    # metrics beyond that count.
    metrics_avg = [sum(values)/len(values) for values in zip(*category_metrics)]
    print(metrics_avg[0])

    # Overall.
    # With per-category subsets, each category's predictions cover a different
    # set of paragraphs, so they cannot be combined and compared against
    # q_true_overall.
    if category_indices is not None:
        return
    print()
    print('Overall')
    q_w_pred_overall = bookcave.get_overall_y(Q_w_pred)
    confusion_overall, metrics_overall = evaluation.get_confusion_and_metrics(q_true_overall, q_w_pred_overall)
    print(confusion_overall)
    print(metrics_overall[0])

In [None]:
# Model factories from the baselines module; each one is evaluated in turn.
create_models = [
    baselines.create_k_nearest_neighbors,
    baselines.create_linear_regression,
    baselines.create_logistic_regression,
    baselines.create_multinomial_naive_bayes,
    baselines.create_random_forest,
    baselines.create_svm]
# Paragraph-level features, produced with the vectorizer that was fitted on
# whole texts.
P_w_predict = vectorizer.transform(predict_tokens)
for create_model in create_models:
    # First evaluate on every collected paragraph...
    predict_baselines(create_model, P_w_predict)
    print()
    print('Balanced')
    # ...then on the per-category subsets in category_balanced_indices.
    # Note that the models are fitted again inside this second call.
    predict_baselines(create_model, P_w_predict, category_indices=category_balanced_indices)


k_nearest_neighbors

crude_humor_language
[[365  42   7   0]
 [  4   0   0   0]
 [216  19  17   4]
 [ 95   8  16   3]]
0.4836683417085427

drug_alcohol_tobacco_use
[[737  29   8   0]
 [ 14   1   0   0]
 [  5   0   0   0]
 [  2   0   0   0]]
0.9271356783919598

kissing
[[521 208]
 [ 44  23]]
0.6834170854271356

profanity
[[231 125  46   0]
 [  4   2   0   0]
 [166  85  38   5]
 [ 55  31   5   3]]
0.3442211055276382

nudity
[[607  31  10   3]
 [  4   0   0   0]
 [ 16   0   1   1]
 [115   1   6   1]]
0.7650753768844221

sex_and_intimacy
[[500  26  12  27]
 [ 24   1   0   1]
 [ 47   6   1   3]
 [111  18  11   8]]
0.6407035175879398

violence_and_horror
[[572 133  18   0]
 [ 31  13   3   0]
 [ 14   6   2   0]
 [  3   1   0   0]]
0.7374371859296482

gay_lesbian_characters
[[775   1   2]
 [  2   0   0]
 [ 11   0   5]]
0.9798994974874372

Average
0.6951947236180905

Overall
[[157 163  25  18]
 [ 14  14   2   0]
 [ 90  82  33  11]
 [ 67  88  22  10]]
0.26884422110552764

Balanced

k_nearest_ne

### MLP

In [None]:
# Path of a saved model file under folders.MODELS_PATH.
# NOTE(review): the file name is hard-coded, so this cell depends on that
# exact file being present - confirm before re-running.
mlp_path = os.path.join(folders.MODELS_PATH, 'multi_layer_perceptron', 'ordinal', '32662682.h5')
# Evaluate on the same paragraph features and labels used for the baselines.
# NOTE(review): presumably loads the model at mlp_path and prints metrics -
# confirm against predict_paragraphs.load_model_and_evaluate.
predict_paragraphs.load_model_and_evaluate(mlp_path,
                                           P_w_predict,
                                           Q_true,
                                           categories)
# Same evaluation, restricted to the per-category subsets in
# category_balanced_indices.
predict_paragraphs.load_model_and_evaluate(mlp_path,
                                           P_w_predict,
                                           Q_true,
                                           categories,
                                           category_indices=category_balanced_indices)