# predict_paragraphs

In [None]:
import os

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from classification import baselines, evaluation, ordinal, shared_parameters
import folders
from sites.bookcave import bookcave

## Load paragraphs

In [None]:
# Name of the tokenized representation requested from bookcave.
source = 'paragraph_tokens'

# Data-selection settings come from the shared experiment parameters.
subset_ratio = shared_parameters.DATA_SUBSET_RATIO
subset_seed = shared_parameters.DATA_SUBSET_SEED
min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
min_tokens = shared_parameters.DATA_MIN_TOKENS
categories_mode = shared_parameters.DATA_CATEGORIES_MODE

# The two values bound to `_` are returned by `get_data` but unused here.
(inputs, Y, categories, category_levels,
 book_ids, books_df, _, _, categories_df) = bookcave.get_data(
    {source},
    subset_ratio=subset_ratio,
    subset_seed=subset_seed,
    min_len=min_len,
    max_len=max_len,
    min_tokens=min_tokens,
    categories_mode=categories_mode,
    return_meta=True,
)

# Transpose the per-text entries and keep only their first field; the cells
# below iterate it as, per text, a sequence of paragraph token lists.
text_source_tokens = list(zip(*inputs[source]))[0]
len(text_source_tokens)

## Paragraph labels

In [None]:
# Gather every paragraph that has a usable label in all categories.
predict_locations = []  # (text index, paragraph index) of each kept paragraph
predict_tokens = []  # token list of each kept paragraph
predict_source_labels = []  # per-category labels of each kept paragraph
for text_i, source_tokens in enumerate(text_source_tokens):
    # Look up the book's ASIN through its id; the first matching row is used.
    asin = books_df[books_df['id'] == book_ids[text_i]].iloc[0]['asin']
    category_labels = [bookcave.get_labels(asin, category) for category in categories]
    # A book is skipped entirely when any category returns no labels for it.
    has_all_categories = all(labels is not None for labels in category_labels)
    if not has_all_categories:
        continue
    for source_i, tokens in enumerate(source_tokens):
        source_labels = [labels[source_i] for labels in category_labels]
        # Paragraphs with a -1 in any category are dropped
        # (presumably -1 means "unlabeled"; verify against bookcave.get_labels).
        if all(label != -1 for label in source_labels):
            predict_locations.append((text_i, source_i))
            predict_tokens.append(tokens)
            predict_source_labels.append(source_labels)

In [None]:
# `predict_source_labels` has one row per kept paragraph and one column per
# category; transposing makes `Q_true[j]` the label vector of category j.
Q_true = np.asarray(predict_source_labels).T
Q_true.shape

## Word Vectors (Embedding)

Fit tokenizer.

In [None]:
max_words = shared_parameters.TEXT_MAX_WORDS
# Paragraphs are already tokenized, so tokens are joined with a tab and the
# Tokenizer is told to split on that same character.
split = '\t'
tokenizer = Tokenizer(num_words=max_words, split=split)

# Fit on every paragraph of every text, not only the labeled ones.
all_locations = []  # (text index, paragraph index), parallel to `all_sources`
all_sources = []  # tab-joined paragraph strings
for text_i, source_tokens in enumerate(text_source_tokens):
    all_locations.extend((text_i, source_i) for source_i, _ in enumerate(source_tokens))
    all_sources.extend(split.join(tokens) for tokens in source_tokens)
tokenizer.fit_on_texts(all_sources)

In [None]:
# Sequence-shaping settings for paragraph inputs, taken from the shared
# experiment parameters and forwarded to `get_input_sequence` when predicting.
n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
padding = shared_parameters.TEXT_PADDING
truncating = shared_parameters.TEXT_TRUNCATING


def get_input_sequence(source_tokens, tokenizer, n_tokens, padding='pre', truncating='pre'):
    """Convert token lists into a padded array of word indices.

    Each token list is joined into one string using the tokenizer's own
    ``split`` separator, so the text is always joined on the same character
    the tokenizer splits on, independent of any module-level variable.

    :param source_tokens: Iterable of token lists (one per paragraph).
    :param tokenizer: A fitted Keras ``Tokenizer``.
    :param n_tokens: Passed to ``pad_sequences`` as ``maxlen``.
    :param padding: Passed to ``pad_sequences`` as ``padding``.
    :param truncating: Passed to ``pad_sequences`` as ``truncating``.
    :return: Array holding one padded index sequence per token list.
    """
    separator = tokenizer.split
    texts = [separator.join(tokens) for tokens in source_tokens]
    sequences = tokenizer.texts_to_sequences(texts)
    return np.array(pad_sequences(sequences,
                                  maxlen=n_tokens,
                                  padding=padding,
                                  truncating=truncating))

In [None]:
# Load the saved Keras model from the models folder and print its layers.
model_path = os.path.join(
    folders.MODELS_PATH,
    'paragraph_cnn',
    'ordinal',
    '32648156_glove300-emb.h5',
)
model = load_model(model_path)
model.summary()

Predict ordinal labels for each labeled paragraph with the loaded model.

In [None]:
# Each labeled paragraph is wrapped in a one-element list, so every entry of
# `P_predict` holds exactly one padded sequence.
P_predict = np.array([
    get_input_sequence([paragraph_tokens], tokenizer, n_tokens, padding=padding, truncating=truncating)
    for paragraph_tokens in predict_tokens
])
Q_pred_ordinal = model.predict(P_predict)
# Decode each model output from its multi-hot ordinal form at a 0.5 threshold.
Q_pred = [
    ordinal.from_multi_hot_ordinal(q, threshold=.5)
    for q in Q_pred_ordinal
]
len(Q_pred), len(Q_pred[0])

In [None]:
# Per category: print its name (after a blank line), the confusion matrix,
# and the first entry of the returned metrics.
for j, category in enumerate(categories):
    print('\n{}'.format(category))
    confusion, metrics = evaluation.get_confusion_and_metrics(Q_true[j], Q_pred[j])
    print(confusion)
    print(metrics[0])

## Bag-of-words (count-based)

In [None]:
def identity(v):
    """Return `v` unchanged.

    Passed to `TfidfVectorizer` as both `preprocessor` and `tokenizer` so
    that input which is already a list of tokens is used as-is.
    """
    return v

In [None]:
# TF-IDF over pre-tokenized input: `identity` is used for both preprocessing
# and tokenizing, so each document is consumed as a ready-made token list.
vectorizer = TfidfVectorizer(
    preprocessor=identity,
    tokenizer=identity,
    analyzer='word',
    token_pattern=None,
    max_features=max_words,
    norm='l2',
    sublinear_tf=True)
# Concatenate the paragraphs of each text into one flat token list, so the
# vectorizer is fitted on whole texts rather than on single paragraphs.
text_tokens = []
for source_tokens in text_source_tokens:
    text_tokens.append([token for tokens in source_tokens for token in tokens])
X_w = vectorizer.fit_transform(text_tokens)
# Number of features kept. `get_feature_names()` was removed in
# scikit-learn 1.2; the fitted `vocabulary_` mapping has one entry per
# feature and is available in every version.
len(vectorizer.vocabulary_)

In [None]:
test_size = shared_parameters.EVAL_TEST_SIZE  # b
test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
# Labels are transposed so that texts lie along the first axis, matching the
# rows of `X_w`, and transposed back after the split.
Y_T = Y.T  # (n, c)
X_w_train, X_w_test, Y_train_T, Y_test_T = train_test_split(
    X_w,
    Y_T,
    test_size=test_size,
    random_state=test_random_state,
)
Y_train = Y_train_T.T  # (c, n * (1 - b))
Y_test = Y_test_T.T  # (c, n * b)

In [None]:
# Fit one set of ordinal SVM classifiers per category; the number of levels
# in the category is passed along as `k`.
category_classifiers = [
    baselines.fit_ordinal(baselines.create_svm, X_w_train, Y_train[j], len(category_levels[j]))
    for j, _ in enumerate(categories)
]

Predict paragraph labels with the bag-of-words classifiers.

In [None]:
# Vectorize every labeled paragraph with the vectorizer fitted above.
P_w_predict = vectorizer.transform(predict_tokens)
Q_w_pred = []
for j, classifiers in enumerate(category_classifiers):
    k = len(category_levels[j])
    # Every category predicts on the full paragraph matrix. Indexing
    # `P_w_predict[j]` would pick a single paragraph (row j) using the
    # category index, which cannot be compared against `Q_true[j]`.
    q_w_pred = baselines.predict_ordinal(classifiers, P_w_predict, k)
    Q_w_pred.append(q_w_pred)
len(Q_w_pred), len(Q_w_pred[0])

In [None]:
# Per category: print its name (after a blank line), the confusion matrix,
# and the first entry of the returned metrics for the bag-of-words predictions.
for j, category in enumerate(categories):
    print('\n{}'.format(category))
    confusion, metrics = evaluation.get_confusion_and_metrics(Q_true[j], Q_w_pred[j])
    print(confusion)
    print(metrics[0])