# label_texts

In [None]:
import os
import numpy as np
import keras
import IPython
import ipywidgets as widgets

from classification import ordinal, paragraph_rnn
import folders
from sites.bookcave import bookcave
from text import paragraph_io

In [None]:
# Directory (relative to the working directory) where model weight files are read from and written to.
MODELS_PATH = 'models'
# GloVe vectors file, located two directories above the working directory.
GLOVE_100_PATH = os.path.join(os.pardir, os.pardir, 'embeddings', 'glove.6B.100d.txt')

## Data

In [None]:
# Passed to every `bookcave.get_data` call below as `text_min_len`.
text_min_len = 250
# Optional iterable of category indices to restrict to; `None` keeps every category.
only_categories = None
category_names = (
    [bookcave.CATEGORY_NAMES[category_i] for category_i in only_categories]
    if only_categories
    else bookcave.CATEGORY_NAMES
)

### Paragraphs

In [None]:
# Load the paragraph-level text of every book along with the labels `Y` and metadata frames.
# `Y` is later indexed as `Y[:, book_index]`, and `books_df` is row-aligned with `book_ids`
# (it is read with `books_df.iloc[book_index]` further down).
paragraph_inputs, Y, categories, category_levels, book_ids, books_df, _, _, categories_df =\
    bookcave.get_data({'text'},
                      text_source='paragraphs',
                      text_min_len=text_min_len,
                      only_categories=only_categories,
                      return_meta=True)
paragraph_texts = paragraph_inputs['text']
# Split each book's (paragraphs, section_ids, sections) triple into three parallel lists indexed by book.
text_paragraphs, text_section_ids, text_sections = [], [], []
for paragraphs, section_ids, sections in paragraph_texts:
    text_paragraphs.append(paragraphs)
    text_section_ids.append(section_ids)
    text_sections.append(sections)
# Sanity check: the first paragraph of the first book and the section it belongs to.
# NOTE(review): this is not the last expression of the cell, so a notebook will presumably not display it.
text_paragraphs[0][0], text_sections[0][text_section_ids[0][0]]
# Number of books loaded.
len(text_paragraphs)

In [None]:
# Reverse lookup: book id -> position of that book in `book_ids` (and in the parallel `text_*` lists).
book_id_to_index = {
    known_book_id: position
    for position, known_book_id in enumerate(book_ids)
}

In [None]:
# The rows for the first category supply the rating names, which are indexed by level position below.
first_category_rows = categories_df[categories_df['category'] == categories[0]]
rating_names = list(first_category_rows['rating'])
rating_names

In [None]:
# For every category, the 'description' values of its rows, in row order.
category_descriptions = [
    list(categories_df.loc[categories_df['category'] == category, 'description'])
    for category in categories
]
category_descriptions[0]

### Labels

In [None]:
# Load previously saved paragraph labels. Only the inputs and the ids of the books that
# have labels are kept; the remaining return values are discarded.
label_inputs, _, _, _, label_book_ids, _, _, _, _ = \
    bookcave.get_data({'text'},
                      text_source='labels',
                      text_min_len=text_min_len,
                      only_categories=only_categories,
                      return_meta=True)
# Indexed as [label_book_index][category_i][paragraph_i] (see the "Usage" cells below).
text_category_labels = label_inputs['text']
# Number of books that already have saved labels.
len(text_category_labels)

In [None]:
# Reverse lookup: book id -> position of that book in `label_book_ids` / `text_category_labels`.
label_book_id_to_index = {
    labeled_book_id: position
    for position, labeled_book_id in enumerate(label_book_ids)
}

In [None]:
def create_category_labels(paragraphs):
    """Return blank labels for a book: one list per category, with -1 for every paragraph.

    -1 marks a paragraph as not yet labeled. Each category gets its own
    independent list, so labeling one category never affects another.
    """
    n_paragraphs = len(paragraphs)
    return [[-1] * n_paragraphs for _ in categories]

### Training Set

In [None]:
# Books selected for manual paragraph labeling, in the order they are presented.
train_book_ids = [
    'borderline',
    'ellies-encounter',
    'cyborg-awakenings',
    'pleasuring-lady-pennington',
    'torture-mom',
    'gettin-lucky',
    'kismet',
    'winter-thrillz',
    'sprite-night',
    'lustful-lies',
    'skyline-the-dragon-commander',
    'werecat-the-rearing',
    'ember-of-war-2',
    'diamond-hustle',
    'ahrions-minions',
    'sweet-melissa-destination-unknown',
    'prayers-for-the-soul-of-a-dying-star',
    'the-wall',
    'come-away-with-me',
    'theirs-for-the-night',
]

# Parallel lists, one entry per training book.
train_text_paragraphs = []
train_text_section_ids = []
train_text_sections = []
train_text_category_labels = []
for book_id in train_book_ids:
    i = book_id_to_index[book_id]
    train_text_paragraphs.append(text_paragraphs[i])
    train_text_section_ids.append(text_section_ids[i])
    train_text_sections.append(text_sections[i])
    if book_id not in label_book_id_to_index:
        # No saved labels for this book: start with every paragraph unlabeled (-1).
        book_labels = create_category_labels(text_paragraphs[i])
    else:
        # Reuse the labels that were loaded from disk.
        book_labels = text_category_labels[label_book_id_to_index[book_id]]
    train_text_category_labels.append(book_labels)

### Usage

In [None]:
# Indexing: train_text_paragraphs[text_i][paragraph_i] -> str (the paragraph text).
train_text_paragraphs[0][0]

In [None]:
# Indexing: train_text_section_ids[text_i][paragraph_i] -> int (index of the section containing the paragraph).
train_text_section_ids[0][0]

In [None]:
# Indexing: train_text_sections[text_i][section_id] -> str (the section text shown above each paragraph).
train_text_sections[0][train_text_section_ids[0][0]]

In [None]:
# Indexing: train_text_category_labels[text_i][category_i][paragraph_i] -> int (level index, or -1 when unlabeled).
train_text_category_labels[0][0][:10]

## Model

In [None]:
# When True, `display_paragraph_interface` pre-selects the toggle buttons of unlabeled
# paragraphs from model predictions; when False, they default to level 0.
do_predict = False

**The rest of the cells in this section are only necessary if `do_predict == True`.**

### Tokens

In [None]:
# Load the tokenized form of every book; only the inputs and the book ids are kept.
token_inputs, _, _, _, token_book_ids, _, _, _, _ = bookcave.get_data(
    {'text'},
    text_source='tokens',
    text_min_len=text_min_len,
    only_categories=only_categories,
    return_meta=True,
)
token_texts = token_inputs['text']
# Keep the first element of each book's pair (its per-paragraph token lists); drop the second.
text_paragraph_tokens = [paragraph_tokens for paragraph_tokens, _ in token_texts]
len(text_paragraph_tokens)

In [None]:
# Reverse lookup: book id -> position of that book in `token_book_ids` / `text_paragraph_tokens`.
token_book_id_to_index = {
    tokenized_book_id: position
    for position, tokenized_book_id in enumerate(token_book_ids)
}

Set hyper-parameters.

In [None]:
# Vocabulary cap: passed to the Keras tokenizer as `num_words` and to `paragraph_rnn.get_embedding`.
max_words = 40000

# Fixed model input length; `get_input_array` pads or truncates every paragraph to this many tokens.
# Less than 1% of paragraphs contain more than 160 tokens.
n_tokens = 160

Tokenize the text.

In [None]:
# Flatten book -> paragraph -> tokens into a single list holding every paragraph's tokens.
all_paragraph_tokens = [
    tokens
    for paragraph_tokens in text_paragraph_tokens
    for tokens in paragraph_tokens
]
len(all_paragraph_tokens), all_paragraph_tokens[42]

In [None]:
# Fit a Keras tokenizer on the pre-tokenized paragraphs. `num_words` limits the indices
# produced by `texts_to_sequences`; `word_index` itself still lists every distinct token seen.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(all_paragraph_tokens)
# Number of distinct tokens seen while fitting.
len(tokenizer.word_index)

Load the word embeddings.

In [None]:
# Build the embedding matrix from the GloVe file for the tokenizer's vocabulary.
# NOTE(review): presumably rows are word indices capped at `max_words` and `embed_size` is the
# vector width; confirm against `paragraph_rnn.get_embedding`.
embed_size, embedding_matrix = paragraph_rnn.get_embedding(tokenizer, GLOVE_100_PATH, max_words)
embedding_matrix.shape

Create model.

In [None]:
# Architecture hyper-parameters handed to `paragraph_rnn.create_model`.
hidden_size = 128
dense_size = 64
train_emb = True

# One compiled model per category, in the same order as `categories`.
models = []
# Weights file name of each model, parallel to `models`.
model_weights_fnames = []
for category_i, levels in enumerate(category_levels):
    category = categories[category_i]
    n_classes = len(levels)
    model, weights_fname = paragraph_rnn.create_model(category,
                                                      n_classes,
                                                      n_tokens,
                                                      embedding_matrix,
                                                      hidden_size,
                                                      dense_size,
                                                      train_emb=train_emb)
    optimizer = keras.optimizers.Adam()
    model.compile(optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])

    # Resume from previously saved weights when a weights file exists.
    path = os.path.join(MODELS_PATH, weights_fname)
    if os.path.exists(path):
        model.load_weights(path)

    models.append(model)
    # Remember the file name so the weights can be written back later;
    # previously this list was created but never filled.
    model_weights_fnames.append(weights_fname)

# The "Save model weights" cell reads the file names through the name `model_fnames`,
# which was never defined; bind it to the same list so that cell works.
model_fnames = model_weights_fnames

In [None]:
def get_input_array(sequence, length=None):
    """Convert a sequence of token indices into a fixed-length int32 array.

    Sequences longer than `length` keep their beginning and end (the center
    is dropped); shorter sequences are left-padded with zeros. An empty
    sequence yields an all-zero array (previously it raised a NumPy
    broadcast error because `x[-0:]` addresses the whole array).

    :param sequence: Sequence of integer token indices.
    :param length: Output length; defaults to the global `n_tokens`.
    :return: 1-D `np.int32` array with `length` elements.
    """
    if length is None:
        length = n_tokens
    x = np.zeros((length,), dtype=np.int32)
    n_sequence = len(sequence)
    if n_sequence > length:
        # Truncate center: keep the first half and fill the rest from the end.
        head = length // 2
        tail = length - head
        x[:head] = sequence[:head]
        if tail:
            x[-tail:] = sequence[-tail:]
    elif n_sequence > 0:
        # Pad beginning ('pre').
        x[-n_sequence:] = sequence
    return x


def get_predictions(x):
    """Run every category model on one input array.

    :param x: Input array for a single paragraph (see `get_input_array`).
    :return: The tuple `(x, y_preds)`, where `y_preds` holds one decoded
        prediction per model, in the order of `models`.
    """
    y_preds = []
    for model in models:
        y_pred_ordinal = model.predict([[x]], batch_size=1)
        # Decode the model's multi-hot ordinal output.
        y_preds.append(ordinal.from_multi_hot_ordinal(y_pred_ordinal))
    return x, y_preds

## Label

In [None]:
# When False, submitting skips ahead past paragraphs that already have a label in every
# category; when True, every paragraph is shown in order, labeled or not.
do_overwrite = True

Initialize the text (book) and paragraph indices.

In [None]:
# Labeling cursor: zero-based index of the current book in the `train_text_*` lists ...
text_i = 2
# ... and zero-based index of the current paragraph within that book.
paragraph_i = 84

In [None]:
def move_to_next_paragraph():
    """Advance the global cursor (`text_i`, `paragraph_i`) to the next paragraph.

    Books without paragraphs are skipped. The cursor is only updated when a
    next paragraph exists; on failure it stays where it was, so it never
    points past the last book.

    :return: True if the cursor moved, False if it was already on the last paragraph.
    """
    global text_i
    global paragraph_i

    # Work on copies so a failed move leaves the cursor untouched.
    next_text_i = text_i
    next_paragraph_i = paragraph_i + 1
    while next_paragraph_i >= len(train_text_paragraphs[next_text_i]):
        next_text_i += 1
        if next_text_i == len(train_text_paragraphs):
            return False
        next_paragraph_i = 0

    text_i, paragraph_i = next_text_i, next_paragraph_i
    return True


def move_to_previous_paragraph():
    """Move the global cursor (`text_i`, `paragraph_i`) back to the previous paragraph.

    Books without paragraphs are skipped. The cursor is only updated when a
    previous paragraph exists; on failure it stays where it was. Previously a
    failed move left both indices at -1, which Python resolves to the last
    paragraph of the last book on the next access.

    :return: True if the cursor moved, False if it was already on the first paragraph.
    """
    global text_i
    global paragraph_i

    # Work on copies so a failed move leaves the cursor untouched.
    previous_text_i = text_i
    previous_paragraph_i = paragraph_i - 1
    while previous_paragraph_i < 0:
        previous_text_i -= 1
        if previous_text_i < 0:
            return False
        previous_paragraph_i = len(train_text_paragraphs[previous_text_i]) - 1

    text_i, paragraph_i = previous_text_i, previous_paragraph_i
    return True

Create widgets.

In [None]:
# Holds the HTML of the paragraph currently being labeled.
paragraph_html = widgets.HTML(value='')

# One row of toggle buttons per category, with one button per rating level.
category_toggle_buttons = []
for category_i, levels in enumerate(category_levels):
    level_descriptions = category_descriptions[category_i]
    # Each option is shown as the rating name and carries the level index as its value.
    options = [(rating_names[level_i], level_i) for level_i in range(len(levels))]
    # Tooltip: the level's '|'-separated items, a blank line, then its description items,
    # each item on its own line.
    tooltips = [
        '{}\n\n{}'.format(level.replace('|', '\n'), level_descriptions[level_i].replace('|', '\n'))
        for level_i, level in enumerate(levels)
    ]
    category_toggle_buttons.append(widgets.ToggleButtons(
        options=options,
        description=categories[category_i],
        disabled=False,
        button_style='',  # 'success', 'info', 'warning', 'danger' or ''
        tooltips=tooltips,
    ))


def on_submit_button_clicked(button):
    """Store the selected levels for the current paragraph, then show the next one.

    When `do_overwrite` is False, paragraphs that already have a label in
    every category are skipped. When no paragraph is left, a completion
    message is printed instead of redrawing the interface.

    :param button: The clicked button (unused; required by the `on_click` callback signature).
    """
    # Collect the answers.
    for category_i, toggle_buttons in enumerate(category_toggle_buttons):
        train_text_category_labels[text_i][category_i][paragraph_i] = toggle_buttons.value

    moved = move_to_next_paragraph()
    if not do_overwrite:
        # Skip paragraphs that are already fully labeled.
        while moved and all(train_text_category_labels[text_i][category_i][paragraph_i] != -1
                            for category_i in range(len(categories))):
            moved = move_to_next_paragraph()

    if moved:
        display_paragraph_interface()
    else:
        # `train_paragraphs` was an undefined name here; the book count is the
        # length of `train_text_paragraphs`.
        print('Finished training on {:d} books.'.format(len(train_text_paragraphs)))


# Button that records the current selections and advances to the next paragraph.
submit_button = widgets.Button(
    icon='check',
    tooltip=('Submit the above values as the categorical maturity rating levels for this paragraph.\n'
             'Then move on to the next paragraph.'),
    button_style='success',  # 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
    description='Submit',
)
submit_button.on_click(on_submit_button_clicked)


def on_back_button_clicked(button):
    """Step back one paragraph and redraw, or report that there is no previous paragraph.

    :param button: The clicked button (unused; required by the `on_click` callback signature).
    """
    if not move_to_previous_paragraph():
        print('Cannot move to previous paragraph.')
        return
    display_paragraph_interface()


# Button that returns to the previous paragraph without recording anything.
back_button = widgets.Button(
    tooltip='Move to the previous paragraph.',
    button_style='warning',
    disabled=False,
    description='Back',
)
back_button.on_click(on_back_button_clicked)

In [None]:
def display_paragraph_interface():
    """Render the labeling interface for the paragraph at the global cursor.

    Clears the cell output, prints the book's title, authors and actual
    rating levels, sets each category's toggle buttons (from existing labels,
    from model predictions when `do_predict` is set, or to level 0), and then
    displays the toggle buttons, the submit button, the section and paragraph
    text, and the back button.
    """
    # Calculate meta data.
    book_id = train_book_ids[text_i]
    book_index = book_id_to_index[book_id]
    book_title = books_df.iloc[book_index]['title']
    book_authors = books_df.iloc[book_index]['authors']
    book_y = Y[:, book_index]

    section_i = train_text_section_ids[text_i][paragraph_i]
    sections = train_text_sections[text_i]
    section = sections[section_i]
    paragraph = train_text_paragraphs[text_i][paragraph_i]

    # Clear any previous output in this cell.
    IPython.display.clear_output(wait=True)

    # Print meta data.
    c_width = 114
    print('-'*c_width)
    print('{} (book {:d} of {:d})'.format(book_title, text_i + 1, len(train_book_ids)))
    print('{}'.format(book_authors))
    print()
    print('Actual categorical rating levels:')
    for category_i, level_i in enumerate(book_y):
        category = categories[category_i]
        level = category_levels[category_i][level_i]
        print('  {:28}: {} ({})'.format(category, rating_names[level_i], level))
    print('-'*c_width)
    print()
    print('What are the categorical maturity rating levels for this paragraph?')

    # Update toggle buttons from existing labels or predictions.
    if all(train_text_category_labels[text_i][category_i][paragraph_i] != -1
           for category_i in range(len(categories))):
        # Already labeled in every category: show the stored labels.
        for category_i, toggle_buttons in enumerate(category_toggle_buttons):
            toggle_buttons.value = train_text_category_labels[text_i][category_i][paragraph_i]
    elif do_predict and book_id in token_book_id_to_index:
        token_book_index = token_book_id_to_index[book_id]
        tokens = text_paragraph_tokens[token_book_index][paragraph_i]
        train_sequence = tokenizer.texts_to_sequences([tokens])[0]
        x_train = get_input_array(train_sequence)
        # `get_predictions` returns the pair `(x, y_preds)`; only the predictions
        # are needed. Binding the whole pair made the lookup below read the wrong data.
        _, y_preds = get_predictions(x_train)
        for category_i, toggle_buttons in enumerate(category_toggle_buttons):
            toggle_buttons.value = y_preds[category_i][0]
    else:
        # Nothing stored and no prediction available: default to the lowest level.
        for toggle_buttons in category_toggle_buttons:
            toggle_buttons.value = 0

    # Display toggle buttons.
    for toggle_buttons in category_toggle_buttons:
        IPython.display.display(toggle_buttons)

    # Display submit button.
    IPython.display.display(submit_button)

    print()
    print('{} (section {:d} of {:d})'.format(section, section_i + 1, len(sections)))
    print()
    print('(paragraph {:d} of {:d})'.format(paragraph_i + 1, len(train_text_paragraphs[text_i])))
    print('='*c_width)
    # NOTE(review): the paragraph text is inserted into the HTML without escaping; confirm
    # whether paragraphs may contain '<' or '&' that should be shown literally.
    paragraph_html.value = '<p style="font-size:large;margin-left:8em;max-width:36em;">{}</p>'.format(paragraph)
    IPython.display.display(paragraph_html)
    print('='*c_width)

    # Display back button.
    IPython.display.display(back_button)

Start labeling.

In [None]:
# Render the interface for the current (text_i, paragraph_i); the Submit and Back
# button callbacks redraw it from then on.
display_paragraph_interface()

Save training labels.

In [None]:
# Write every training book's labels to disk, one file per category, grouped by section.
# The loop variables are deliberately NOT named `text_i` / `paragraph_i`: those globals are
# the labeling cursor, and reusing them here moved the cursor to the last paragraph of the
# last book whenever labels were saved mid-session.
for save_text_i, category_labels in enumerate(train_text_category_labels):
    book_id = train_book_ids[save_text_i]
    asin = books_df[books_df['id'] == book_id].iloc[0]['asin']
    save_section_ids = train_text_section_ids[save_text_i]
    n_sections = len(train_text_sections[save_text_i])

    for category_i, category in enumerate(categories):
        # Regroup the flat per-paragraph labels into one list per section.
        section_paragraph_labels = [[] for _ in range(n_sections)]
        for save_paragraph_i, section_i in enumerate(save_section_ids):
            label = category_labels[category_i][save_paragraph_i]
            section_paragraph_labels[section_i].append(label)
        fname = folders.FNAME_TEXT_PARAGRAPHS_LABELS_FORMAT.format(category)
        path = os.path.join(folders.AMAZON_KINDLE_TEXT_PATH, asin, fname)
        paragraph_io.write_formatted_section_paragraph_labels(section_paragraph_labels, path)

Save model weights.

In [None]:
# Write each category model's weights under MODELS_PATH, using the file name at the same index.
for category_i, model in enumerate(models):
    model.save_weights(os.path.join(MODELS_PATH, model_fnames[category_i]))