In [None]:
# Change the working directory two levels up from the notebook's location.
# NOTE(review): presumably this is the repository root, so that `from python import ...`
# and the relative 'models/...' paths used later resolve — confirm.
%cd ../..

# window

In [None]:
import os
import pickle

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

from python import folders
from python.classifiers import baselines as base
from python.sites.bookcave import bookcave
from python.text import tokenizers
from python.util import ordinal, shared_parameters

In [None]:
# Show the TensorFlow version this kernel is running in the cell output.
tf.__version__

In [None]:
# Names of the entries in the Kindle labels folder; they are matched against the `asin` column.
asins = set(os.listdir(folders.AMAZON_KINDLE_LABELS_PATH))
all_books_df = pd.read_csv(folders.CONTENT_BOOKCAVE_BOOKS_CSV_PATH, encoding='utf-8')

# Keep only the books whose ASIN has an entry in the labels folder.
has_labels = all_books_df['asin'].isin(asins)
labeled_books_df = all_books_df.loc[has_labels]
len(labeled_books_df)

In [None]:
# Show the `id` values of the books that have labels.
labeled_books_df['id'].values

In [None]:
# Which text representation to load and how to filter it. Apart from the source names and
# `remove_stopwords`, every setting is read from `shared_parameters`.
source_mode = 'paragraph'
source = 'paragraph_tokens'
remove_stopwords = False
subset_ratio = shared_parameters.DATA_SUBSET_RATIO
subset_seed = shared_parameters.DATA_SUBSET_SEED
min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
min_tokens = shared_parameters.DATA_MIN_TOKENS
categories_mode = shared_parameters.DATA_CATEGORIES_MODE
return_overall = shared_parameters.DATA_RETURN_OVERALL

# Load only the books whose ids appear in `labeled_books_df`.
get_data_kwargs = dict(
    only_ids=set(labeled_books_df['id'].values),
    subset_ratio=subset_ratio,
    subset_seed=subset_seed,
    min_len=min_len,
    max_len=max_len,
    min_tokens=min_tokens,
    remove_stopwords=remove_stopwords,
    categories_mode=categories_mode,
    return_overall=return_overall,
    return_meta=True,
)
data = bookcave.get_data({source}, **get_data_kwargs)
# The last three returned values are not used in this notebook section.
inputs, Y, categories, category_levels, book_ids, books_df, _, _, _ = data

# Transpose the entries of `inputs[source]` and keep the first component of each entry.
source_columns = list(zip(*inputs[source]))
text_source_tokens = source_columns[0]
len(text_source_tokens)

In [None]:
# Gather every (text, source item) pair for which each non-overall category has a label
# other than -1, together with its tokens and its per-category labels.
predict_locations, predict_tokens, predict_source_labels = [], [], []
label_categories = categories[:bookcave.CATEGORY_INDEX_OVERALL]
for text_i, source_tokens in enumerate(text_source_tokens):
    book_id = book_ids[text_i]
    book_rows = books_df[books_df['id'] == book_id]
    asin = book_rows.iloc[0]['asin']
    category_labels = [bookcave.get_labels(asin, category) for category in label_categories]
    # Skip the whole text when any category returned no labels for it.
    if any(labels is None for labels in category_labels):
        continue
    for source_i, tokens in enumerate(source_tokens):
        source_labels = [labels[source_i] for labels in category_labels]
        # Items with a label of -1 in any category are left out.
        if all(label != -1 for label in source_labels):
            predict_locations.append((text_i, source_i))
            predict_tokens.append(tokens)
            predict_source_labels.append(source_labels)

# One row per category, one column per collected item.
Q_true = np.zeros((len(categories), len(predict_source_labels)), dtype=np.int32)
for i, source_labels in enumerate(predict_source_labels):
    Q_true[:len(source_labels), i] = source_labels
if return_overall:
    # The overall row is derived from the matrix by `bookcave.get_y_overall`.
    overall_row = bookcave.get_y_overall(Q_true, categories_mode=categories_mode)
    Q_true[bookcave.CATEGORY_INDEX_OVERALL] = overall_row
Q_true.shape

In [None]:
# Spot check: number of source items in the text at index 43.
# NOTE(review): the index 43 looks arbitrary — confirm or replace with a named constant.
len(text_source_tokens[43])

In [None]:
max_words = shared_parameters.TEXT_MAX_WORDS

# `tokenizer` turns texts into sequences for the networks; `vectorizer` produces the
# features for the baseline models.
# NOTE(review): per the helper names, each call presumably loads an existing object or
# fits a new one — confirm in `python.text.tokenizers`.
tokenizer = tokenizers.get_tokenizer_or_fit(max_words, source_mode, remove_stopwords)
vectorizer = tokenizers.get_vectorizer_or_fit(max_words, remove_stopwords)

In [None]:
# Recorded metrics, one row per category.
# NOTE(review): the column headers of this table were not recorded here.
#     crude_humor_language | 0.6130  | 0.4623  | 0.4525  | 0.4416  | 0.6938  | 0.6130  | 0.6386  | 0.4386  | 0.5418  |
# drug_alcohol_tobacco_use | 0.5254  | 0.3789  | 0.3565  | 0.3424  | 0.6236  | 0.5254  | 0.5276  | 0.4988  | 0.5489  |
#                  kissing | 0.7803  | 0.7447  | 0.7953  | 0.7540  | 0.8249  | 0.7803  | 0.7904  | 0.2197  | 0.2197  |
#                profanity | 0.6302  | 0.5871  | 0.6152  | 0.5891  | 0.6746  | 0.6302  | 0.6427  | 0.3987  | 0.4582  |
#                   nudity | 0.6708  | 0.4492  | 0.5029  | 0.4631  | 0.7159  | 0.6708  | 0.6787  | 0.3448  | 0.3761  |
#         sex_and_intimacy | 0.5059  | 0.5575  | 0.5324  | 0.5281  | 0.5736  | 0.5059  | 0.5214  | 0.5512  | 0.6732  |
#      violence_and_horror | 0.5762  | 0.5391  | 0.5585  | 0.5448  | 0.5858  | 0.5762  | 0.5796  | 0.4543  | 0.5152  |
#                  overall | 0.6317  | 0.6089  | 0.6342  | 0.6164  | 0.6356  | 0.6317  | 0.6267  | 0.3870  | 0.4277  |

# One saved network per category, indexed the same way as `Y` when predicting.
# NOTE(review): presumably listed in the same order as the rows above — confirm.
category_net_paths = [
    'models/paragraph_max_ordinal/35082769_0.h5',  # remove 3
    'models/paragraph_max_ordinal/35082760_trainemb_1.h5',  # remove 3
    'models/paragraph_max_ordinal/35082771_2.h5',
    'models/paragraph_max_ordinal/35082762_trainemb_3.h5',
    'models/paragraph_max_ordinal/35082763_trainemb_4.h5',  # remove 3
    'models/paragraph_max_ordinal/35082764_trainemb_5.h5',
    'models/paragraph_max_ordinal/35082765_trainemb_6.h5',
    'models/paragraph_max_ordinal/35082776_7.h5',
]
category_nets = list(map(tf.keras.models.load_model, category_net_paths))

In [None]:
baselines = [
    'k_nearest_neighbors',
    'logistic_regression',
    'multi_layer_perceptron',
    'multinomial_naive_bayes',
    'random_forest',
    'svm'
]

# baseline_category_models[m][j] is the list of pickled models for baseline `m` and category `j`;
# a category with `len(levels)` levels has `len(levels) - 1` model files.
# NOTE: `pickle.load` can execute arbitrary code; only load model files from a trusted source.
baseline_category_models = []
for baseline in baselines:
    category_models = []
    for j, levels in enumerate(category_levels):
        model_dir = os.path.join(folders.MODELS_PATH, baseline, '36100418_{:d}'.format(j))
        models = []
        for k in range(len(levels) - 1):
            with open(os.path.join(model_dir, 'model{:d}.pickle'.format(k)), 'rb') as fd:
                models.append(pickle.load(fd))
        category_models.append(models)
    baseline_category_models.append(category_models)

In [None]:
split = '\t'
padding = shared_parameters.TEXT_PADDING
truncating = shared_parameters.TEXT_TRUNCATING

# X[i] holds one padded/truncated token-id sequence per source item of text `i`.
X = []
for source_tokens in text_source_tokens:
    # Join each item's tokens with tabs so the tokenizer receives one string per item.
    joined_items = [split.join(tokens) for tokens in source_tokens]
    sequences = tokenizer.texts_to_sequences(joined_items)
    padded = pad_sequences(sequences,
                           maxlen=shared_parameters.TEXT_N_PARAGRAPH_TOKENS,
                           padding=padding,
                           truncating=truncating)
    X.append(np.array(padded))

In [None]:
def get_P(x, window=1):
    """Stack every run of `window` consecutive rows of `x` into a single array.

    :param x: NumPy array; the windows slide along its first axis with a step of one.
    :param window: Number of consecutive rows in each window.
    :return: Array of shape `(n_windows, window, *x.shape[1:])` where
        `n_windows = max(len(x) - window + 1, 0)`. The dtype is NumPy's default for
        `np.zeros` (float64), regardless of the dtype of `x`.
    """
    # Clamp at zero so that an `x` shorter than `window` yields an empty result instead of a
    # negative-dimension error; `get_P_b` likewise produces no windows in that case.
    n_windows = max(len(x) - window + 1, 0)
    P = np.zeros((n_windows, window, *x.shape[1:]))
    for i in range(n_windows):
        P[i] = x[i:i + window]
    return P


def get_P_b(source_tokens, vectorizer, window=1):
    """Vectorize the concatenated tokens of every run of `window` consecutive items.

    :param source_tokens: Sequence of token sequences, one per source item.
    :param vectorizer: Object whose `transform` method accepts a list of token lists.
    :param window: Number of consecutive items whose tokens are merged into one document.
    :return: Whatever `vectorizer.transform` returns for the list of merged token lists.
    """
    n_windows = len(source_tokens) - window + 1
    token_windows = [
        [token for tokens in source_tokens[start:start + window] for token in tokens]
        for start in range(n_windows)
    ]
    return vectorizer.transform(token_windows)

In [None]:
# Spot check for the networks: run the network at index 5 on the first text,
# using windows of a single row each.
y = Y[5]  # not used later in this cell
net = category_nets[5]
x = X[0]
P = get_P(x, window=1)
q_pred_transform = net.predict(P)
q_pred_transform.shape

In [None]:
# Element-wise maximum of the network outputs over the first axis (one entry per window).
np.max(q_pred_transform, axis=0)

In [None]:
# Window size (number of consecutive source items) used by the prediction loops below.
window = 1

In [None]:
# For every category network and every text, predict on each window and print two reductions:
#   label_pred     - convert each window's output at threshold .5, then take the largest result;
#   label_pred_hat - take the element-wise max of the raw outputs over windows, then convert once.
for j, y in enumerate(Y):
    net = category_nets[j]
    for i in range(len(y)):
        q_pred_transform = net.predict(get_P(X[i], window=window))
        window_labels = ordinal.from_multi_hot_ordinal(q_pred_transform, threshold=.5)
        pooled_output = np.max(q_pred_transform, axis=0)
        label_pred = max(window_labels)
        label_pred_hat = ordinal.from_multi_hot_ordinal([pooled_output])[0]
        message = 'j={:d} i={:d} label={:d} label_pred={:d} label_pred_hat={:d}'.format(
            j, i, y[i], label_pred, label_pred_hat)
        print(message)

In [None]:
# Spot check for the baselines: the first index selects the baseline (index 5 of `baselines`
# is 'svm') and the second selects the category (index 5).
y = Y[5]  # not used later in this cell
models = baseline_category_models[5][5]
source_tokens = text_source_tokens[0]
# Single-item windows over the first text.
P_b = get_P_b(source_tokens, vectorizer, window=1)
q_pred = base.predict_ordinal(models, P_b, len(category_levels[5]))
q_pred.shape

In [None]:
# First ten per-window predictions from the baseline spot check.
q_pred[:10]

In [None]:
# Largest per-window prediction; the loop below uses the same reduction as the text's label.
max(q_pred)

In [None]:
# For every baseline, category and text, predict a label per window and print the largest one
# next to the true label.
#
# The window features depend only on the text index `i` (not on the baseline `m` or the
# category `j`), so each text is vectorized once and the result is reused for every
# (baseline, category) pair instead of being recomputed each time.
# NOTE(review): this assumes `base.predict_ordinal` does not modify the feature matrix it is
# given — confirm.
P_b_by_text = dict()
for m, category_models in enumerate(baseline_category_models):
    for j, y in enumerate(Y):
        k = len(category_levels[j])
        models = category_models[j]
        for i in range(len(y)):
            if i not in P_b_by_text:
                P_b_by_text[i] = get_P_b(text_source_tokens[i], vectorizer, window=window)
            q_pred = base.predict_ordinal(models, P_b_by_text[i], k)
            label_pred = max(q_pred)
            print('m={:d} j={:d} i={:d} label={:d} label_pred={:d}'
                  .format(m, j, i, y[i], label_pred))