# label_texts

In [None]:
import os
import numpy as np
import keras
import IPython
import ipywidgets as widgets

from classification import ordinal, paragraph_rnn
import folders
from sites.bookcave import bookcave
from text import paragraph_io

In [None]:
# Directory, relative to the working directory, where model weights are stored.
MODELS_PATH = 'models'
# GloVe embedding text file, located two directory levels above the working directory.
GLOVE_100_PATH = os.path.join(os.path.join('..', '..'), 'embeddings', 'glove.6B.100d.txt')

## Data

In [None]:
# Load the tokenized book texts together with labels and metadata, restricted
# to the three categories listed in `only_categories`.
token_inputs, Y, categories, levels, book_ids, books_df, _, _, categories_df = bookcave.get_data(
    {'text'},
    text_source='tokens',
    only_categories={
        bookcave.CATEGORY_INDEX_DRUG_ALCOHOL_TOBACCO_USE,
        bookcave.CATEGORY_INDEX_SEX_AND_INTIMACY,
        bookcave.CATEGORY_INDEX_VIOLENCE_AND_HORROR,
    },
    return_meta=True,
)
token_texts = token_inputs['text']
# Notebook output: how many tokenized texts were loaded.
len(token_texts)

In [None]:
# The rating names are taken from the rows of the first category; the rest of
# the notebook indexes `rating_names` by level index for every category.
first_category_rows = categories_df[categories_df['category'] == categories[0]]
rating_names = list(first_category_rows['rating'])
# One list of level descriptions per category, in `categories` order.
category_descriptions = [
    list(categories_df.loc[categories_df['category'] == category, 'description'])
    for category in categories
]

In [None]:
# Load the paragraph-level version of the texts (no category filter is passed
# here, so the result is narrowed to the token-data books below).
paragraph_inputs, _, _, _,\
paragraph_book_ids, _, _, _, _ =\
bookcave.get_data({'text'},
                  text_source='paragraphs',
                  return_meta=True)
# Keep only the books that are also present in the token data. Membership is
# tested against `book_ids` from the token load rather than against
# `book_id_to_index`, which is not defined until a later cell; this lets the
# notebook run top to bottom.
selected_book_ids = set(book_ids)
# NOTE(review): later cells index `paragraph_texts` with positions taken from
# `book_ids`, which assumes both loads return books in the same relative
# order — confirm against `bookcave.get_data`.
paragraph_texts = [text
                   for text, paragraph_book_id in zip(paragraph_inputs['text'], paragraph_book_ids)
                   if paragraph_book_id in selected_book_ids]
# Notebook output: number of paragraph texts kept.
len(paragraph_texts)

In [None]:
# Map each book id to its position in `book_ids`.
book_id_to_index = dict((book_id, index) for index, book_id in enumerate(book_ids))

In [None]:
# Books that will be labeled paragraph by paragraph.
train_book_ids = ['torture-mom', 'devitt', 'secrets-5', 'circus-of-horror']
# Token and paragraph views of those books, in `train_book_ids` order.
train_tokens = [token_texts[book_id_to_index[book_id]] for book_id in train_book_ids]
train_paragraphs = [paragraph_texts[book_id_to_index[book_id]] for book_id in train_book_ids]

In [None]:
# Preview the first ten tokens of the first paragraph in the first section of
# the first training book.
# Dimensions: [text_i], [section_i], [paragraph_i], [token_i] = str
train_tokens[0][0][0][:10]

In [None]:
# Preview the first paragraph of the first section of the first training book.
# Index 1 on the second axis selects the per-section paragraph lists; index 0
# selects the sections themselves.
# Dimensions: [text_i], (sections=0, paragraphs=1), [section_i or section_paragraphs_i], [paragraph_i] = str
train_paragraphs[0][1][0][0]

## Model

Set hyper-parameters.

In [None]:
# Vocabulary cap: passed as `num_words` to the tokenizer and as the last
# argument to the embedding loader.
max_words = 40000

# Fixed length of every model input; longer paragraphs are truncated and
# shorter ones are zero-padded by `get_input_array`.
# Less than 3% of paragraphs contain more than 120 tokens.
# NOTE(review): the statistic above cites 120 tokens while the value below is
# 160 — confirm which number is intended.
n_tokens = 160

Tokenize the text.

In [None]:
# Flatten book -> section -> paragraph into a single list of token lists,
# one entry per paragraph.
all_token_paragraphs = [
    tokens
    for section_paragraphs_tokens in token_texts
    for paragraphs_tokens in section_paragraphs_tokens
    for tokens in paragraphs_tokens
]
# Notebook output: first ten tokens of the first paragraph.
all_token_paragraphs[0][:10]

In [None]:
# Fit a Keras tokenizer, capped at `max_words`, on every paragraph's token list.
# Each entry of `all_token_paragraphs` is already a list of tokens rather than
# a raw string.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(all_token_paragraphs)
# Notebook output: size of the fitted tokenizer's word index.
len(tokenizer.word_index)

Load the pre-trained GloVe word embeddings.

In [None]:
# Build the embedding matrix for the fitted tokenizer from the GloVe file.
# NOTE(review): presumably rows line up with tokenizer word indices, limited
# to `max_words` — confirm in `paragraph_rnn.get_embedding`.
embed_size, embedding_matrix = paragraph_rnn.get_embedding(tokenizer, GLOVE_100_PATH, max_words)
# Notebook output: shape of the embedding matrix.
embedding_matrix.shape

Create one model per category, loading previously saved weights when they exist.

In [None]:
# Architecture settings shared by every per-category model.
hidden_size = 128
dense_size = 64
train_emb = True

# One compiled model per category, plus the weights file name that
# `create_model` returned for it (same order as `categories`).
models = []
model_weights_fnames = []
for category_index, category_levels in enumerate(levels):
    category = categories[category_index]
    n_classes = len(category_levels)
    model, weights_fname = paragraph_rnn.create_model(category,
                                                      n_classes,
                                                      n_tokens,
                                                      embedding_matrix,
                                                      hidden_size,
                                                      dense_size,
                                                      train_emb=train_emb)
    optimizer = keras.optimizers.Adam()
    model.compile(optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])

    # Resume from previously saved weights when a weights file exists.
    path = os.path.join(MODELS_PATH, weights_fname)
    if os.path.exists(path):
        model.load_weights(path)

    models.append(model)
    # Remember the file name so the weights can be saved back later; this list
    # was previously created but never filled.
    model_weights_fnames.append(weights_fname)

# The "Save model weights" cell looks the names up as `model_fnames`; bind that
# name to the same list so saving no longer fails with a NameError.
model_fnames = model_weights_fnames

In [None]:
def get_input_array(sequence, length=None):
    """Convert a sequence of token ids into a fixed-length int32 array.

    Sequences longer than the target length keep their first ``length // 2``
    ids and their last ``length - length // 2`` ids (the center is dropped).
    Shorter sequences are left-padded with zeros. An empty sequence yields an
    all-zero array.

    :param sequence: Sized, sliceable sequence of integer token ids.
    :param length: Target length; defaults to the notebook-level ``n_tokens``.
    :return: 1-D ``numpy.int32`` array with ``length`` entries.
    """
    if length is None:
        length = n_tokens
    x = np.zeros((length,), dtype=np.int32)
    n = len(sequence)
    if n == 0 or length == 0:
        # Nothing to copy. Without this guard, `x[-0:]` would address the whole
        # array and the assignment of an empty sequence would raise.
        return x
    if n > length:
        # Truncate center.
        head = length // 2
        tail = length - head
        x[:head] = sequence[:head]
        x[head:] = sequence[n - tail:]
    else:
        # Pad beginning ('pre').
        x[length - n:] = sequence
    return x


def get_predictions(x):
    """Run every category model on a single input array.

    :param x: Model input for one paragraph (as built by `get_input_array`).
    :return: Tuple `(x, y_preds)`: the input unchanged, followed by a list
        with one decoded prediction per model, in `models` order.
    """
    y_preds = []
    for model in models:
        # Each model emits a multi-hot ordinal vector; decode it.
        y_pred_ordinal = model.predict([[x]], batch_size=1)
        y_preds.append(ordinal.from_multi_hot_ordinal(y_pred_ordinal))
    return x, y_preds

## Label

Initialize the text (book), section, and paragraph indices that track the current labeling position.

In [None]:
# Labeling cursor: current book, section within the book, and paragraph within
# the section. All start at the first position.
text_i = section_i = paragraph_i = 0

In [None]:
def move_to_next_paragraph():
    """Advance the global labeling cursor by one paragraph.

    Rolls over into the next section and the next book as needed; sections
    without paragraphs are stepped over.

    :return: `True` when the cursor ran past the last book (it is then reset
        to the very first paragraph), `False` otherwise.
    """
    global text_i, section_i, paragraph_i

    paragraph_i += 1
    # Keep rolling over for as long as the cursor sits past the end of a section.
    while len(train_paragraphs[text_i][1][section_i]) == paragraph_i:
        paragraph_i = 0
        section_i += 1
        if section_i != len(train_paragraphs[text_i][1]):
            continue
        # Past the last section: go to the next book.
        section_i = 0
        text_i += 1
        if text_i != len(train_paragraphs):
            continue
        # Past the last book: wrap around and report the end.
        text_i = 0
        return True

    return False

Create widgets.

In [None]:
# HTML widget that shows the paragraph currently being labeled.
paragraph_html = widgets.HTML(value='')

# One row of toggle buttons per category: one button per level, labeled with
# the rating name and carrying the level text and its description as tooltip.
category_toggle_buttons = []
for category_index, category_levels in enumerate(levels):
    level_descriptions = category_descriptions[category_index]
    description = categories[category_index]
    options = [(rating_names[level_index], level_index)
               for level_index in range(len(category_levels))]
    # '|' separates alternatives in the level text and its description; show
    # each alternative on its own tooltip line.
    tooltips = ['{}\n\n{}'.format(level.replace('|', '\n'),
                                  level_descriptions[level_index].replace('|', '\n'))
                for level_index, level in enumerate(category_levels)]
    category_toggle_buttons.append(widgets.ToggleButtons(
        options=options,
        description=description,
        disabled=False,
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltips=tooltips
    ))


def on_submit_button_clicked(button):
    """Store the selected levels for the current paragraph and move on.

    For every category the selected toggle value is written into
    `train_labels` at the current cursor position. When `do_train_inline` is
    set, the category's model is also fitted on that single example. The
    cursor then advances; unless `do_overwrite` is set, paragraphs that are
    already labeled are skipped. Finally the next paragraph is displayed, or
    a completion message is printed once every book has been visited.

    :param button: The clicked button (supplied by ipywidgets, unused).
    """
    # Collect the answers.
    for category_index, category_levels in enumerate(levels):
        y = category_toggle_buttons[category_index].value
        if do_train_inline:
            # Fit the models to these answers.
            y_ordinal = ordinal.to_multi_hot_ordinal([y], num_classes=len(category_levels))
            models[category_index].fit([[x_train]], y_ordinal, batch_size=1, epochs=1, verbose=0)
        train_labels[text_i][category_index][section_i][paragraph_i] = y

    has_ended = move_to_next_paragraph()
    if not do_overwrite:
        # Skip paragraphs that already carry a label in every category
        # (-1 marks "not labeled yet"). The previous test was inverted: it
        # skipped the unlabeled paragraphs and ran straight to the end.
        while not has_ended and all(category_labels[section_i][paragraph_i] != -1
                                    for category_labels in train_labels[text_i]):
            has_ended = move_to_next_paragraph()

    if not has_ended:
        display_paragraph_interface()
    else:
        print('Finished training on {:d} books.'.format(len(train_paragraphs)))


# Button that stores the current selection and advances to the next paragraph.
submit_button = widgets.Button(
    description='Submit',
    tooltip=('Submit the above values as the categorical maturity rating levels for this paragraph.\n'
             'Then move on to the next paragraph.'),
    icon='check',
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    disabled=False
)
submit_button.on_click(on_submit_button_clicked)

In [None]:
def display_paragraph_interface():
    """Render the labeling UI for the paragraph at the current cursor.

    Clears the cell output, prints the book title with its actual book-level
    ratings, presets the toggle buttons (to model predictions when
    `do_predict` is set, otherwise to level 0), and shows the toggle buttons,
    the submit button and the paragraph text.

    Side effect: when `do_predict` or `do_train_inline` is set, the global
    `x_train` is replaced by the model input for this paragraph, so the
    submit handler can train on it.
    """
    global x_train

    # Calculate meta data.
    book_id = train_book_ids[text_i]
    book_index = book_id_to_index[book_id]
    book_title = books_df.iloc[book_index]['title']
    book_y = Y[:, book_index]

    section = train_paragraphs[text_i][0][section_i]
    paragraph = train_paragraphs[text_i][1][section_i][paragraph_i]

    # Clear any previous output in this cell.
    IPython.display.clear_output(wait=True)

    # Print meta data.
    c_width = 120
    print('-'*c_width)
    print('{} (book {:d} of {:d})'.format(book_title, text_i + 1, len(train_book_ids)))
    print()
    print('Actual categorical rating levels:')
    for category_index, level_index in enumerate(book_y):
        category = categories[category_index]
        level = levels[category_index][level_index]
        print('  {:28}: {} ({})'.format(category, rating_names[level_index], level))
    print('-'*c_width)
    print()
    print('What are the categorical maturity rating levels for this paragraph?')

    # Populate the input to the models.
    if do_predict or do_train_inline:
        train_sequence = tokenizer.texts_to_sequences([train_tokens[text_i][section_i][paragraph_i]])[0]
        x_train = get_input_array(train_sequence)

    # Collect predictions from models. This is gated on `do_predict`; it was
    # previously gated on `do_train_inline`, which left `do_predict` without
    # any effect.
    if do_predict:
        # `get_predictions` returns `(x, y_preds)`; only the predictions are
        # needed here. Indexing the tuple directly mixed the input array into
        # the per-category values.
        _, y_preds = get_predictions(x_train)
        for toggle_buttons, y_pred in zip(category_toggle_buttons, y_preds):
            # The batch holds a single paragraph; take its predicted level.
            # NOTE(review): assumes `y_pred[0]` is a scalar level index —
            # confirm against `ordinal.from_multi_hot_ordinal`.
            toggle_buttons.value = int(y_pred[0])
    else:
        for toggle_buttons in category_toggle_buttons:
            toggle_buttons.value = 0

    # Display toggle buttons.
    for toggle_buttons in category_toggle_buttons:
        IPython.display.display(toggle_buttons)

    # Display button.
    IPython.display.display(submit_button)

    print()
    print('{} (section {:d} of {:d})'.format(section, section_i + 1, len(train_paragraphs[text_i][0])))
    print()
    print('(paragraph {:d} of {:d})'.format(paragraph_i + 1, len(train_paragraphs[text_i][1][section_i])))
    print('='*c_width)
    paragraph_html.value = '<p style="font-size:large;margin-left:8em;max-width:36em;">{}</p>'.format(paragraph)
    IPython.display.display(paragraph_html)
    print('='*c_width)

Reset training labels.

In [None]:
# Label store indexed as [text_i][category_index][section_i][paragraph_i].
# Every entry starts at -1, meaning "not labeled yet".
train_labels = [
    [
        [[-1] * len(section_paragraphs) for section_paragraphs in text_paragraphs[1]]
        for _ in categories
    ]
    for text_paragraphs in train_paragraphs
]

Start labeling.

In [None]:
# Flags read by `display_paragraph_interface` and `on_submit_button_clicked`.
# When either `do_predict` or `do_train_inline` is set, the displayed paragraph
# is converted into a model input.
do_predict = False
# When set, every submit also fits each category's model on the submitted label.
do_train_inline = False
# When not set, the submit handler runs its skip loop after advancing, instead
# of stopping at the very next paragraph.
do_overwrite = False

In [None]:
# Show the labeling interface for the paragraph at the current cursor position.
display_paragraph_interface()

In [None]:
# Inspect the first five labels of the first section, for the first category
# of the first training book (-1 means "not labeled yet").
train_labels[0][0][0][:5]

Save training labels.

In [None]:
# Write one label file per (book, category) into the book's text folder.
# The loop variable is deliberately not named `text_i`: that name is the
# global labeling cursor, and a top-level loop over it would overwrite the
# current position.
for book_i, book_labels in enumerate(train_labels):
    book_id = train_book_ids[book_i]
    # The book's folder is named by its ASIN; look it up by book id.
    asin = books_df[books_df['id'] == book_id].iloc[0]['asin']

    for category_index, category in enumerate(categories):
        section_paragraph_labels = book_labels[category_index]
        fname = folders.FNAME_TEXT_PARAGRAPHS_LABELS_FORMAT.format(category)
        path = os.path.join(folders.AMAZON_KINDLE_TEXT_PATH, asin, fname)
        paragraph_io.write_formatted_section_paragraph_labels(section_paragraph_labels, path)

Save model weights.

In [None]:
# Write each category model's weights to its file under MODELS_PATH.
# NOTE(review): `model_fnames` must hold one weights file name per model, in
# `models` order — confirm it is defined before running this cell.
for category_index in range(len(models)):
    weights_path = os.path.join(MODELS_PATH, model_fnames[category_index])
    models[category_index].save_weights(weights_path)