# paragraph_rnn

In [None]:
import os
import keras
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import tensorflow as tf

from classification import ordinal, paragraph_rnn
from sites.bookcave import bookcave

In [None]:
# Show which TensorFlow version this notebook is running against.
print('TensorFlow version: {}'.format(tf.__version__))

In [None]:
# MODELS_PATH is not referenced in the cells shown here
# (presumably the directory for saved model weights -- confirm against `paragraph_rnn`).
MODELS_PATH = os.path.join('models')
# Embedding file handed to `paragraph_rnn.get_embedding` below
# (the file name suggests 100-dimensional GloVe vectors).
GLOVE_100_PATH = os.path.join('..', '..', 'embeddings', 'glove.6B.100d.txt')

Load data.

In [None]:
# Length bounds forwarded to `bookcave.get_data` as `text_min_len` / `text_max_len`.
# NOTE(review): the unit of these lengths is defined by `bookcave.get_data` -- confirm there.
text_min_len = 250
text_max_len = 7500

In [None]:
# Fetch the tokenized book texts together with labels and metadata.
# The seventh and eighth return values are not needed and are discarded.
(token_inputs,
 Y,
 categories,
 category_levels,
 book_ids,
 books_df,
 _,
 _,
 categories_df) = bookcave.get_data({'text'},
                                    text_source='tokens',
                                    text_min_len=text_min_len,
                                    text_max_len=text_max_len,
                                    return_meta=True)
# Every entry of token_inputs['text'] is a pair; only its first element
# (the per-paragraph token lists of one book) is kept.
text_paragraph_tokens = [paragraphs for paragraphs, _ in token_inputs['text']]

Load embedding matrix.

In [None]:
# Vocabulary cap: used as `num_words` for the Keras `Tokenizer` and passed on
# to `paragraph_rnn.get_embedding`.
max_words = 4096

In [None]:
# Flatten the per-book paragraph lists into two parallel lists: the
# (book index, paragraph index) of each paragraph and that paragraph's tokens.
all_locations = [
    (book_idx, para_idx)
    for book_idx, paragraphs in enumerate(text_paragraph_tokens)
    for para_idx, _ in enumerate(paragraphs)
]
all_tokens = [
    para_tokens
    for paragraphs in text_paragraph_tokens
    for para_tokens in paragraphs
]
# Number of paragraphs across all books (shown as the cell output).
len(all_locations)

In [None]:
# Fit a Keras Tokenizer on the token lists of every paragraph. `max_words` is
# given as `num_words`, and '__UNKNOWN__' is the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=max_words, oov_token='__UNKNOWN__')
tokenizer.fit_on_texts(all_tokens)

In [None]:
# Build the embedding matrix for the fitted tokenizer from the GloVe file.
# NOTE(review): the matrix layout is defined by `paragraph_rnn.get_embedding`
# (presumably (max_words, embed_size)) -- confirm there.
embed_size, embedding_matrix = paragraph_rnn.get_embedding(tokenizer, GLOVE_100_PATH, max_words)
# Shown as the cell output.
embedding_matrix.shape

Load labels and flatten.

In [None]:
# Paragraphs with fewer tokens than this are left out of the training set.
tokens_min_len = 3

In [None]:
# Collect every paragraph that has a label in all categories and is long
# enough to train on, remembering where it came from.
train_locations, train_tokens, train_paragraph_labels = [], [], []
for book_idx, paragraphs in enumerate(text_paragraph_tokens):
    # Resolve the book's ASIN through its row in the books table.
    book_rows = books_df[books_df['id'] == book_ids[book_idx]]
    book_asin = book_rows.iloc[0]['asin']
    labels_per_category = [bookcave.get_labels(book_asin, category) for category in categories]
    # A book is only usable when paragraph labels exist for every category.
    if not all(labels is not None for labels in labels_per_category):
        continue
    for para_idx, para_tokens in enumerate(paragraphs):
        para_labels = [labels[para_idx] for labels in labels_per_category]
        # A label of -1 in any category excludes the paragraph.
        fully_labeled = not any(label == -1 for label in para_labels)
        if fully_labeled and len(para_tokens) >= tokens_min_len:
            train_locations.append((book_idx, para_idx))
            train_tokens.append(para_tokens)
            train_paragraph_labels.append(para_labels)
train_locations = np.array(train_locations)
train_paragraph_labels = np.array(train_paragraph_labels)
# Number of usable training paragraphs (shown as the cell output).
len(train_locations)

Split data.

In [None]:
# Fixed length of every paragraph input array built by `get_input_array`.
n_tokens = 160
# Fraction of the labeled paragraphs held out by `train_test_split`, and the
# seed passed to it as `random_state`.
test_size = .25
random_state = 1

In [None]:
def get_input_array(sequence, max_len=None):
    """Convert a sequence of token indices into a fixed-length int32 array.

    Sequences longer than the target length keep their beginning and their
    end (the center is dropped). Shorter sequences are left-padded with zeros.
    An empty sequence yields an all-zero array.

    :param sequence: Sequence of integer token indices; it must support `len()` and slicing.
    :param max_len: Length of the returned array. Defaults to the
        module-level `n_tokens` when omitted.
    :return: 1-D `np.int32` array of length `max_len`.
    """
    if max_len is None:
        max_len = n_tokens
    x = np.zeros((max_len,), dtype=np.int32)
    seq_len = len(sequence)
    if seq_len == 0:
        # `x[-0:]` addresses the whole array, so assigning an empty sequence
        # to it would raise a broadcasting error; there is nothing to copy.
        return x
    if seq_len > max_len:
        # Truncate center: the first half comes from the start of the
        # sequence, the remaining slots from its end.
        head = max_len // 2
        tail = max_len - head
        x[:head] = sequence[:head]
        x[head:] = sequence[seq_len - tail:]
    else:
        # Pad beginning ('pre').
        x[max_len - seq_len:] = sequence
    return x

In [None]:
# Encode the training paragraphs as fixed-length index arrays.
train_sequences = tokenizer.texts_to_sequences(train_tokens)
P = np.array(list(map(get_input_array, train_sequences)))
# Hold out a share of the paragraphs for testing.
(P_train,
 P_test,
 paragraph_labels_train,
 paragraph_labels_test) = train_test_split(P,
                                           train_paragraph_labels,
                                           test_size=test_size,
                                           random_state=random_state)
# Transpose so that the first axis indexes the category.
Q_train = paragraph_labels_train.T
Q_test = paragraph_labels_test.T
Q_train.shape, Q_test.shape

Create new models.

In [None]:
# Hyperparameters forwarded to `paragraph_rnn.create_model`.
# NOTE(review): presumably the recurrent layer size, the dense layer size and
# whether the embedding weights are trainable -- confirm in `paragraph_rnn`.
hidden_size = 64
dense_size = 32
train_emb = True

In [None]:
# Create one model per category and keep the weights file name that
# `paragraph_rnn.create_model` returns with it.
models, weights_fnames = [], []
for category_i, category in enumerate(categories):
    level_count = len(category_levels[category_i])
    created_model, created_weights_fname = paragraph_rnn.create_model(
        category,
        level_count,
        n_tokens,
        embedding_matrix,
        hidden_size,
        dense_size,
        train_emb=train_emb)
    models.append(created_model)
    weights_fnames.append(created_weights_fname)

In [None]:
# Arguments for `model.fit`: number of epochs, batch size, and the value
# passed as `validation_split`.
epochs = 8
batch_size = 32
validation_split = .25

In [None]:
# Train each category's model on the paragraph training split and report its
# paragraph-level accuracy and confusion matrix on the test split.
for category_i, category in enumerate(categories):
    print()
    print(category)
    model = models[category_i]
    level_count = len(category_levels[category_i])

    # Encode the training labels with `ordinal.to_multi_hot_ordinal`.
    train_targets = ordinal.to_multi_hot_ordinal(Q_train[category_i], num_classes=level_count)

    model.compile(Adam(),
                  loss='binary_crossentropy',
                  metrics=['binary_accuracy', 'categorical_accuracy'])
    _ = model.fit(P_train,
                  train_targets,
                  epochs=epochs,
                  batch_size=batch_size,
                  validation_split=validation_split)

    # Decode the model output back to labels before scoring.
    predicted = ordinal.from_multi_hot_ordinal(model.predict(P_test))
    expected = Q_test[category_i]
    print('Accuracy: {:.4%}'.format(accuracy_score(expected, predicted)))
    print(confusion_matrix(expected, predicted))

## Paragraphs

In [None]:
def get_label_from_paragraph_labels(q_pred):
    """Reduce the predicted labels of one book's paragraphs to a book label.

    :param q_pred: Non-empty iterable of predicted paragraph labels.
    :return: The largest paragraph label.
    """
    book_label = max(q_pred)
    return book_label

In [None]:
def predict_book_labels(X, locations, Y, verbose=0):
    """Predict one label per category and book from paragraph predictions.

    For every category, the corresponding model in the module-level `models`
    list predicts a label for each row of `X`. Predictions that share a text
    index are then reduced to one book label with
    `get_label_from_paragraph_labels`.

    :param X: Paragraph inputs, one row per entry of `locations`.
    :param locations: Sequence of `(text_i, paragraph_i)` pairs aligned with
        the rows of `X`. Entries of the same text must be contiguous, and
        `text_i` indexes the second axis of `Y`.
    :param Y: Book labels indexed as `Y[category_i, text_i]`. Only its shape
        and length are used here.
    :param verbose: When truthy, print progress messages.
    :return: `np.int32` array with the shape of `Y` holding the predicted book
        labels. Books without any entry in `locations` keep the value 0.
    """
    Y_pred = np.zeros(Y.shape, dtype=np.int32)
    if len(locations) == 0:
        # Nothing to predict; avoids indexing into an empty `locations`.
        return Y_pred
    for category_i in range(len(Y)):
        if verbose:
            print('Predicting labels for category {}...'.format(categories[category_i]))
        model = models[category_i]
        q_pred_ordinal = model.predict(X)
        q_pred = ordinal.from_multi_hot_ordinal(q_pred_ordinal)
        if verbose:
            print('Done.')

        # Calculate label for each text.
        if verbose:
            print('Calculating book labels...')
        text_i = locations[0][0]
        text_pred = []
        for location, pred in zip(locations, q_pred):
            if location[0] != text_i:
                # A new text starts: store the label of the finished one.
                Y_pred[category_i, text_i] = get_label_from_paragraph_labels(text_pred)
                text_i = location[0]
                text_pred = []
            text_pred.append(pred)
        # Store the final text at its own index. Writing to column -1 would
        # be wrong whenever the last located text is not the last column.
        Y_pred[category_i, text_i] = get_label_from_paragraph_labels(text_pred)
        if verbose:
            print('Done.')
    return Y_pred

In [None]:
def print_results(Y, Y_pred):
    """Print accuracy and confusion matrix for every category.

    :param Y: True labels, indexed by category first.
    :param Y_pred: Predicted labels, indexed the same way as `Y`.
    """
    for category_i, y_test in enumerate(Y):
        y_pred = Y_pred[category_i]
        print()
        print(categories[category_i])
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy: {:.4%}'.format(accuracy))
        print(confusion_matrix(y_test, y_pred))

Predict book ratings for the subset of books that have labeled training paragraphs.

In [None]:
# Books that contributed at least one labeled training paragraph.
test_text_indices = {text_i for text_i, _ in train_locations}
# Select the matching label columns. The set is iterated in the same order
# below, so column i of Y_test belongs to the i-th text visited.
Y_test = Y[:, list(test_text_indices)]
# Locations use the column position `i` (not the original text index), so they
# line up with Y_test.
test_locations = [
    (i, paragraph_i)
    for i, text_i in enumerate(test_text_indices)
    for paragraph_i, _ in enumerate(text_paragraph_tokens[text_i])
]
test_tokens = [
    tokens
    for text_i in test_text_indices
    for tokens in text_paragraph_tokens[text_i]
]
test_sequences = tokenizer.texts_to_sequences(test_tokens)
X_test = np.array(list(map(get_input_array, test_sequences)))
Y_pred_test = predict_book_labels(X_test, test_locations, Y_test)

In [None]:
# Report per-category accuracy and confusion matrix for the labeled subset.
print_results(Y_test, Y_pred_test)

Predict book ratings for all books.

In [None]:
# Encode every paragraph of every book and predict book-level labels.
all_sequences = tokenizer.texts_to_sequences(all_tokens)
X_all = np.array(list(map(get_input_array, all_sequences)))
Y_pred_all = predict_book_labels(X_all, all_locations, Y)

In [None]:
# Report per-category accuracy and confusion matrix over all books.
print_results(Y, Y_pred_all)