# label_texts

In [None]:
import os
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import keras
import IPython
import ipywidgets as widgets

from classification import ordinal, paragraph_rnn
import folders
from sites.bookcave import bookcave
from text import paragraph_io

In [None]:
# Directory (relative to the working directory) that holds saved model weights.
MODELS_PATH = 'models'

# Pre-trained 100-dimensional GloVe vectors, two directory levels up.
_EMBEDDINGS_DIR = os.path.join(os.pardir, os.pardir, 'embeddings')
GLOVE_100_PATH = os.path.join(_EMBEDDINGS_DIR, 'glove.6B.100d.txt')

## Data

In [None]:
# Text length bounds forwarded to `bookcave.get_data`.
text_min_len, text_max_len = 250, 7500

# Optional collection of category indices to restrict to; a falsy value
# (None or empty) selects every category.
only_categories = None

# Display names for the selected categories.
category_names = (
    [bookcave.CATEGORY_NAMES[category_i] for category_i in only_categories]
    if only_categories
    else bookcave.CATEGORY_NAMES
)

### Paragraphs

In [None]:
# Load the paragraph-level texts together with labels and metadata.
(paragraph_inputs, Y, categories, category_levels,
 book_ids, books_df, _, _, categories_df) = bookcave.get_data(
    {'text'},
    text_source='paragraphs',
    text_min_len=text_min_len,
    text_max_len=text_max_len,
    only_categories=only_categories,
    return_meta=True)
paragraph_texts = paragraph_inputs['text']

# Split each book's (paragraphs, section_ids, sections) triple into three
# parallel lists indexed by book.
text_paragraphs = []
text_section_ids = []
text_sections = []
for book_paragraphs, book_section_ids, book_sections in paragraph_texts:
    text_paragraphs.append(book_paragraphs)
    text_section_ids.append(book_section_ids)
    text_sections.append(book_sections)
len(text_paragraphs)

In [None]:
# Reverse lookup: book id -> position of that book in `book_ids`.
book_id_to_index = dict(zip(book_ids, range(len(book_ids))))

In [None]:
# The rating names are read from the rows of the first category.
is_first_category = categories_df['category'] == categories[0]
first_category_rows = categories_df[is_first_category]
rating_names = list(first_category_rows['rating'])
rating_names

In [None]:
# Dimensions: [category_i][level_i] = description of that rating level.
category_descriptions = []
for category in categories:
    category_rows = categories_df[categories_df['category'] == category]
    category_descriptions.append(list(category_rows['description']))
category_descriptions[0]

### Tokens

In [None]:
# Load the tokenized version of the texts.
token_inputs, token_Y, _, _, token_book_ids, _, _, _, _ =\
    bookcave.get_data({'text'},
                      text_source='tokens',
                      text_min_len=text_min_len,
                      text_max_len=text_max_len,
                      only_categories=only_categories,
                      return_meta=True)

# Keep only the books that also appear in the paragraph data. The ids and the
# labels are filtered with the same selection as the texts so that positions in
# `token_book_ids`, `token_Y` and `text_paragraph_tokens` all refer to the same
# book; otherwise any index derived from `token_book_ids` would point at the
# wrong entry of the filtered token lists.
paragraph_book_id_set = set(book_ids)  # O(1) membership tests
kept_token_indices = [i for i, book_id in enumerate(token_book_ids)
                      if book_id in paragraph_book_id_set]
token_texts = [text for i, text in enumerate(token_inputs['text'])
               if token_book_ids[i] in paragraph_book_id_set]
token_book_ids = [token_book_ids[i] for i in kept_token_indices]
# assumes token_Y is indexed [category_i][book_i], like Y -- TODO confirm
token_Y = np.asarray(token_Y)[:, kept_token_indices]

# Dimensions: [text_i][paragraph_i] = list of tokens.
text_paragraph_tokens = [paragraph_tokens for paragraph_tokens, _ in token_texts]
len(text_paragraph_tokens)

In [None]:
# Reverse lookup: book id -> position of that book in `token_book_ids`.
token_book_id_to_index = dict(zip(token_book_ids, range(len(token_book_ids))))

In [None]:
# Concatenate the paragraphs of each book into one flat token list per book.
text_all_tokens = [
    [token for tokens in paragraph_tokens for token in tokens]
    for paragraph_tokens in text_paragraph_tokens
]
len(text_all_tokens)

In [None]:
def identity(v):
    """Return `v` unchanged.

    Used as a pass-through preprocessor/tokenizer for the TF-IDF vectorizer,
    because the texts fed to it are already lists of tokens.
    """
    return v

In [None]:
# The documents are already tokenized, so preprocessing and tokenizing are
# pass-through and no token pattern is supplied.
tfidf_options = dict(
    preprocessor=identity,
    tokenizer=identity,
    analyzer='word',
    token_pattern=None,
    max_features=4096,
    norm='l2',
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)

In [None]:
# Fit the TF-IDF vocabulary on the per-book token lists and vectorize them:
# one row of X per entry of `text_all_tokens`.
X = vectorizer.fit_transform(text_all_tokens)

In [None]:
# Number of distinct tokens kept by the vectorizer.
len(vectorizer.vocabulary_)

In [None]:
# Build a token -> maturity score lookup from chi-squared feature selection.
# For each category, the `top_n` features with the highest chi2 statistic are
# kept and given a rank-based score (rather than the raw chi2 value): the
# strongest feature gets `top_n`, the next `top_n - 1`, and so on. A token that
# is selected for several categories keeps its best score.
token_to_chi2 = dict()
top_n = 80

# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; support both.
get_feature_names = getattr(vectorizer, 'get_feature_names_out', None)
if get_feature_names is None:
    get_feature_names = vectorizer.get_feature_names
features = np.array(get_feature_names())

for category_i in range(len(categories)):
    y = token_Y[category_i]
    scores, pvals = chi2(X, y)
    # Feature indices ordered from the highest chi2 statistic to the lowest.
    ranked_indices = np.argsort(scores)[::-1][:top_n]
    for rank, index in enumerate(ranked_indices):
        feature = features[index]
        score = top_n - rank
        token_to_chi2[feature] = max(token_to_chi2.get(feature, score), score)

# Sanity check; `.get` avoids a KeyError when the token was not selected.
token_to_chi2.get('shit')

### Training Set

In [None]:
# Books selected for manual paragraph labeling.
train_book_ids = [
    # By total rating:
    'temptation-island',
    'forever-road',
    'never-letting-go',
    'collateral',
    'dark-warrior-alliance-boxset-books-5-8',
    'hell-on-earth',
    'orb-station-zero',
    'dette-chambers-death-journal',
    'book-of-shadows',
    'lustful-letters',
    'masks-of-betrayal-2',
    'in-pain-and-blood-2',
    'werecat-the-rearing',
    'hologram-the-seduction-of-samantha-bowman',
    'highland-wolf-clan-the-reluctant-alpha',
    'the-olympus-killer',
    'collapse',
    'torture-mom',
    'more-than-friends-collection',
    'to-betray-a-master',
    # By ratio of total rating to # of paragraphs:
    'theirs-for-the-night',
    'come-away-with-me',
    'the-wall',
    'prayers-for-the-soul-of-a-dying-star',
    'sweet-melissa-destination-unknown',
    'ahrions-minions',
    'diamond-hustle',
    'ember-of-war-2',
    'werecat-the-rearing',
    'skyline-the-dragon-commander',
    'lustful-lies',
    'sprite-night',
    'winter-thrillz',
    'kismet',
    'gettin-lucky',
    'torture-mom',
    'pleasuring-lady-pennington',
    'cyborg-awakenings',
    'ellies-encounter',
    'borderline'
]
# Some books appear in both selections above. Drop the repeats (keeping the
# first occurrence and the original order) so that no book is loaded and
# labeled twice, and so that a second copy cannot overwrite the first copy's
# labels when they are saved.
train_book_ids = list(dict.fromkeys(train_book_ids))
# Per-book training data; every list below is parallel to `train_book_ids`.
train_text_paragraphs = []
train_text_section_ids = []
train_text_sections = []
train_text_category_labels = []
train_text_paragraph_tokens = []
train_text_paragraph_h = []
for book_id in train_book_ids:
    book_index = book_id_to_index[book_id]
    book_paragraphs = text_paragraphs[book_index]
    train_text_paragraphs.append(book_paragraphs)
    train_text_section_ids.append(text_section_ids[book_index])
    train_text_sections.append(text_sections[book_index])

    # Load previously saved labels; where none exist for a category, start
    # with -1 ("unlabeled") for every paragraph.
    asin = books_df[books_df['id'] == book_id].iloc[0]['asin']
    book_category_labels = []
    for category in categories:
        saved_labels = bookcave.get_labels(asin, category)
        if saved_labels is None:
            saved_labels = [-1]*len(book_paragraphs)
        book_category_labels.append(saved_labels)
    train_text_category_labels.append(book_category_labels)

    # Maturity heuristic of a paragraph: the sum of the chi2-based scores of
    # its distinct tokens (each token counts once per paragraph).
    book_paragraph_tokens = text_paragraph_tokens[token_book_id_to_index[book_id]]
    train_text_paragraph_tokens.append(book_paragraph_tokens)
    train_text_paragraph_h.append([
        sum(token_to_chi2[token] for token in set(tokens) if token in token_to_chi2)
        for tokens in book_paragraph_tokens
    ])

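Flatten the per-book heuristic scores into a single array, recording the `(text_i, paragraph_i)` location of each score.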

In [None]:
# train_locations[k] = (text_i, paragraph_i) of the k-th paragraph overall;
# train_paragraph_h[k] = heuristic score of that same paragraph.
train_locations = np.array([
    (book_i, par_i)
    for book_i, book_scores in enumerate(train_text_paragraph_h)
    for par_i in range(len(book_scores))
])
train_paragraph_h = np.array([
    score
    for book_scores in train_text_paragraph_h
    for score in book_scores
])

In [None]:
# Location indices ordered from the highest heuristic score to the lowest.
sorted_train_indices = np.argsort(train_paragraph_h)[::-1]

# Show the paragraph with the highest score.
top_text_i, top_paragraph_i = train_locations[sorted_train_indices[0]]
train_text_paragraphs[top_text_i][top_paragraph_i]

### Usage

In [None]:
# Dimensions: [text_i], [paragraph_i] = str
# (the text of one paragraph of one training book)
train_text_paragraphs[0][0]

In [None]:
# Dimensions: [text_i], [paragraph_i] = int
# (index into that book's `train_text_sections` entry)
train_text_section_ids[0][0]

In [None]:
# Dimensions: [text_i], [section_id] = str
# (the section that the book's first paragraph belongs to)
train_text_sections[0][train_text_section_ids[0][0]]

In [None]:
# Dimensions: [text_i], [category_i], [paragraph_i] = int
# (-1 marks a paragraph that has not been labeled yet)
train_text_category_labels[0][0][:10]

In [None]:
# Dimensions: [location_i] = (text_i, paragraph_i)
train_locations[0]

In [None]:
# Dimensions: [train_i] = location_i
# (positions into `train_locations`, ordered by descending heuristic score)
sorted_train_indices[0]

## Model

In [None]:
# When True, the toggle buttons of an unlabeled paragraph are pre-filled with
# model predictions instead of defaulting to level 0.
do_predict = False

**The rest of the cells in this section are only necessary if `do_predict == True`.**

Set hyper-parameters.

In [None]:
# Vocabulary size cap: passed as `num_words` to the Keras tokenizer and to
# `paragraph_rnn.get_embedding`.
max_words = 40000

# Less than 1% of paragraphs contain more than 160 tokens.
# Model inputs are padded or center-truncated to this length.
n_tokens = 160

Tokenize the text.

In [None]:
# Every paragraph of every book as its own token list (book boundaries dropped).
all_paragraph_tokens = [
    tokens
    for paragraph_tokens in text_paragraph_tokens
    for tokens in paragraph_tokens
]
len(all_paragraph_tokens), all_paragraph_tokens[42]

In [None]:
# Fit the word index on the pre-tokenized paragraphs (each "text" is a list of
# tokens rather than a raw string).
tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(all_paragraph_tokens)
len(tokenizer.word_index)

Load word embedding.

In [None]:
# Build the embedding matrix for the tokenizer's vocabulary from the GloVe file.
# NOTE(review): presumably limited to `max_words` rows -- confirm in
# `paragraph_rnn.get_embedding`.
embed_size, embedding_matrix = paragraph_rnn.get_embedding(tokenizer, GLOVE_100_PATH, max_words)
embedding_matrix.shape

Create model.

In [None]:
# Architecture hyper-parameters forwarded to `paragraph_rnn.create_model`.
hidden_size = 128
dense_size = 64
train_emb = True

# One model per category, parallel to `categories`; `model_weights_fnames`
# holds the weights file name reported for each model.
models = []
model_weights_fnames = []
for category_i, levels in enumerate(category_levels):
    category = categories[category_i]
    n_classes = len(levels)
    model, weights_fname = paragraph_rnn.create_model(category,
                                                      n_classes,
                                                      n_tokens,
                                                      embedding_matrix,
                                                      hidden_size,
                                                      dense_size,
                                                      train_emb=train_emb)
    optimizer = keras.optimizers.Adam()
    model.compile(optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])

    # Reuse previously trained weights when they are available.
    path = os.path.join(MODELS_PATH, weights_fname)
    if os.path.exists(path):
        model.load_weights(path)

    models.append(model)
    # This list was declared but never filled; record the name so that it
    # stays parallel to `models`.
    model_weights_fnames.append(weights_fname)

In [None]:
def get_input_array(sequence):
    """Convert a sequence of token indices into a fixed-length model input.

    Sequences longer than `n_tokens` are truncated in the center (the head and
    the tail are kept); shorter ones are left-padded with zeros. An empty
    sequence yields an all-zero array.

    :param sequence: Sequence of integer token indices.
    :return: `np.int32` array of shape `(n_tokens,)`.
    """
    x = np.zeros((n_tokens,), dtype=np.int32)
    length = len(sequence)
    if length > n_tokens:
        # Truncate center.
        head = n_tokens // 2
        tail = n_tokens - head
        x[:head] = sequence[:head]
        x[head:] = sequence[length - tail:]
    elif length > 0:
        # Pad beginning ('pre').
        # The guard matters: `x[-0:]` is the whole array, so assigning an
        # empty sequence to it would raise a broadcast error.
        x[-length:] = sequence
    return x


def get_predictions(x):
    """Predict the rating level of one input with every category model.

    :param x: A single fixed-length input array (see `get_input_array`).
    :return: The tuple `(x, y_preds)`, where `y_preds` holds one decoded
        prediction per model, in the order of `models`.
    """
    y_preds = []
    for category_model in models:
        ordinal_output = category_model.predict([[x]], batch_size=1)
        y_preds.append(ordinal.from_multi_hot_ordinal(ordinal_output))
    return x, y_preds

## Label

In [None]:
# When False, submitting a paragraph skips ahead past paragraphs that already
# have a label in every category.
do_overwrite = False

Initialize the training index at the paragraph with the highest heuristic score.

In [None]:
# Labeling cursor: `train_i` indexes `sorted_train_indices`; `text_i` and
# `paragraph_i` identify the paragraph currently being labeled. These globals
# are read and updated by the navigation functions and widget callbacks.
train_i = 0
text_i, paragraph_i = train_locations[sorted_train_indices[train_i]]

In [None]:
def move_to_next_paragraph():
    """Advance the labeling cursor by one position.

    Updates the globals `train_i`, `text_i` and `paragraph_i`.

    :return: `True` if the cursor moved, `False` if it was already at the last
        position.
    """
    global train_i, text_i, paragraph_i

    last_train_i = len(sorted_train_indices) - 1
    if train_i != last_train_i:
        train_i += 1
        text_i, paragraph_i = train_locations[sorted_train_indices[train_i]]
        return True
    return False


def move_to_previous_paragraph():
    """Move the labeling cursor back by one position.

    Updates the globals `train_i`, `text_i` and `paragraph_i`.

    :return: `True` if the cursor moved, `False` if it was already at the
        first position.
    """
    global train_i, text_i, paragraph_i

    if train_i != 0:
        train_i -= 1
        text_i, paragraph_i = train_locations[sorted_train_indices[train_i]]
        return True
    return False

Create widgets.

In [None]:
# Widget that shows the text of the current paragraph.
paragraph_html = widgets.HTML(value='')

# One row of toggle buttons per category, with one button per rating level.
# Each tooltip shows the level name and its description, with '|'-separated
# parts placed on separate lines.
category_toggle_buttons = []
for category_i, levels in enumerate(category_levels):
    level_descriptions = category_descriptions[category_i]
    level_indices = range(len(levels))
    category_toggle_buttons.append(widgets.ToggleButtons(
        options=[(rating_names[level_i], level_i) for level_i in level_indices],
        description=category_names[category_i],
        disabled=False,
        button_style='',  # 'success', 'info', 'warning', 'danger' or ''
        tooltips=[
            '{}\n\n{}'.format(levels[level_i].replace('|', '\n'),
                              level_descriptions[level_i].replace('|', '\n'))
            for level_i in level_indices
        ]
    ))


def on_submit_button_clicked(button):
    """Store the selected levels for the current paragraph, then advance.

    Unless `do_overwrite` is set, paragraphs that already have a label in every
    category are skipped. When no paragraph is left, a completion message is
    printed instead of redrawing the interface.

    :param button: The clicked button widget (unused).
    """
    # Collect the answers.
    for category_i, toggle_buttons in enumerate(category_toggle_buttons):
        train_text_category_labels[text_i][category_i][paragraph_i] = toggle_buttons.value

    moved = move_to_next_paragraph()
    if not do_overwrite:
        # Skip paragraphs that are already fully labeled.
        while moved and all(train_text_category_labels[text_i][category_i][paragraph_i] != -1
                            for category_i in range(len(categories))):
            moved = move_to_next_paragraph()

    if moved:
        display_paragraph_interface()
    else:
        # Report the number of training books. The name previously used here
        # (`train_paragraphs`) is not defined anywhere, so reaching the end
        # raised a NameError.
        print('Finished training on {:d} books.'.format(len(train_book_ids)))


# Button that stores the selected levels and moves on to the next paragraph.
submit_button = widgets.Button(
    description='Submit',
    disabled=False,
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Submit the above values as the categorical maturity rating levels for this paragraph.\nThen move on to the next paragraph.',
    icon='check'
)
submit_button.on_click(on_submit_button_clicked)


def on_back_button_clicked(button):
    """Step back to the previous paragraph and redraw the interface.

    Prints a notice instead when the cursor is already at the first paragraph.

    :param button: The clicked button widget (unused).
    """
    if not move_to_previous_paragraph():
        print('Cannot move to previous paragraph.')
        return
    display_paragraph_interface()


# Button that returns to the previously shown paragraph.
back_button = widgets.Button(
    description='Back',
    disabled=False,
    button_style='warning',
    tooltip='Move to the previous paragraph.'
)
back_button.on_click(on_back_button_clicked)

In [None]:
def display_paragraph_interface():
    """Render the labeling interface for the paragraph at the current cursor.

    Reads the globals `text_i` and `paragraph_i`. Prints the book's metadata
    and its book-level rating levels, presets the toggle buttons (from existing
    labels, else from model predictions when `do_predict` is set, else level
    0), and displays the buttons and the paragraph text.
    """
    # Calculate meta data.
    book_id = train_book_ids[text_i]
    book_index = book_id_to_index[book_id]
    # Look the book up by id, as the rest of this notebook does, rather than
    # assuming that the rows of `books_df` are in the same order as `book_ids`.
    book_row = books_df[books_df['id'] == book_id].iloc[0]
    book_title = book_row['title']
    book_authors = book_row['authors']
    book_y = Y[:, book_index]

    section_i = train_text_section_ids[text_i][paragraph_i]
    sections = train_text_sections[text_i]
    section = sections[section_i]
    paragraph = train_text_paragraphs[text_i][paragraph_i]

    # Clear any previous output in this cell.
    IPython.display.clear_output(wait=True)

    # Print meta data.
    c_width = 114
    print('-'*c_width)
    print('{} [book {:d} of {:d}]'.format(book_title, text_i + 1, len(train_book_ids)))
    print('{}'.format(book_authors))
    print()
    print('Actual categorical rating levels:')
    for category_i, level_i in enumerate(book_y):
        category = categories[category_i]
        level = category_levels[category_i][level_i]
        print('  {:28}: {} ({})'.format(category, rating_names[level_i], level))
    print('-'*c_width)
    print()
    print('What are the categorical maturity rating levels for this paragraph?')

    # Update toggle buttons from existing labels or predictions.
    paragraph_labels = [train_text_category_labels[text_i][category_i][paragraph_i]
                        for category_i in range(len(categories))]
    if all(label != -1 for label in paragraph_labels):
        for category_i, toggle_buttons in enumerate(category_toggle_buttons):
            toggle_buttons.value = paragraph_labels[category_i]
    elif do_predict and book_id in token_book_id_to_index:
        token_book_index = token_book_id_to_index[book_id]
        tokens = text_paragraph_tokens[token_book_index][paragraph_i]
        train_sequence = tokenizer.texts_to_sequences([tokens])[0]
        x_train = get_input_array(train_sequence)
        # `get_predictions` returns `(x, y_preds)`; only the predictions are
        # needed here. Indexing the tuple directly would read the input array
        # instead of the first category's prediction.
        _, y_preds = get_predictions(x_train)
        for category_i, toggle_buttons in enumerate(category_toggle_buttons):
            toggle_buttons.value = y_preds[category_i][0]
    else:
        for toggle_buttons in category_toggle_buttons:
            toggle_buttons.value = 0

    # Display toggle buttons.
    for toggle_buttons in category_toggle_buttons:
        IPython.display.display(toggle_buttons)

    # Display submit button.
    IPython.display.display(submit_button)

    print()
    print('{} [section {:d} of {:d}]'.format(section, section_i + 1, len(sections)))
    print()
    print('[paragraph {:d} of {:d}]'.format(paragraph_i + 1, len(train_text_paragraphs[text_i])))
    print('='*c_width)
    # NOTE(review): the paragraph text is inserted into the HTML unescaped;
    # confirm that paragraphs cannot contain '<' or '&'.
    paragraph_html.value = '<p style="font-size:large;margin-left:8em;max-width:36em;">{}</p>'.format(paragraph)
    IPython.display.display(paragraph_html)
    print('='*c_width)

    # Display back button.
    IPython.display.display(back_button)

Start labeling.

In [None]:
# Show the interface for the paragraph at the current cursor; the button
# callbacks redraw it after every submit/back click.
display_paragraph_interface()

Save training labels.

In [None]:
# Write the labels of every training book and category to disk.
# The loop variable is deliberately NOT named `text_i`: that global is the
# labeling cursor read by the widget callbacks, and overwriting it here would
# make any later "Submit" click store labels under the wrong book.
for saved_text_i, book_category_labels in enumerate(train_text_category_labels):
    book_id = train_book_ids[saved_text_i]
    asin = books_df[books_df['id'] == book_id].iloc[0]['asin']
    sections = train_text_sections[saved_text_i]
    section_ids = train_text_section_ids[saved_text_i]

    for category_i, category in enumerate(categories):
        labels = book_category_labels[category_i]
        bookcave.save_labels(asin, category, sections, section_ids, labels, force=True, verbose=1)