# label_texts

In [None]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import IPython
import ipywidgets as widgets

from classification import ordinal
import folders
from sites.bookcave import bookcave
from text import correlated_words, paragraph_io

## Data

In [None]:
# Ids of the books to label, in reverse of the ranking order listed below.
# Materialized as a list: a bare `reversed(...)` iterator can only be consumed once, so
# re-running a cell that reads it (or iterating it twice) would silently see no ids.
train_book_ids = list(reversed([
    # Top 60 books with total 'hard' rating.
    'bound-by-an-echo',
    'betrayal-2',
    'fashionably-dead',
    'maid-for-the-rock-star',
    'thats-a-promise',
    'famished',
    'orb-station-zero',
    'thats-a-lie',
    'gypsy-love',
    'burn-the-dark',
    'a-vampires-thirst-victor',
    'forever-yours-3',
    'wrecked',
    'when-its-right',
    'ariels-super-power-of-love',
    'never-letting-go',
    'man-eater',
    'flare',
    'the-evil-and-the-pure',
    'zip-pt-zoingzoings-adventures-in-outer-space',
    'there-are-worse-things-i-could-do',
    'endless-evil',
    'the-mistaken',
    'forever-road',
    'rebel',
    'riftkeepers',
    'sacrifices',
    'protection-sex-revenge-romance',
    'tangled-up-in-you',
    'ready-to-were',
    'the-missing-miss-mori',
    'blood-bound-2',
    'suddenly-us',
    'finding-me',
    'hologram-the-seduction-of-samantha-bowman',
    'torture-mom',
    'circus-of-horror',
    'leather-lace',
    'lustful-lies',
    'highland-wolf-clan-the-reluctant-alpha',
    'dark-thoughts',
    'first-love-oakville-seriesbook-one',
    'coming-home-5',
    'saviour',
    'werecat-the-rearing',
    'clocks-watch',
    'seven-eleven-forgotten-and-other-stories',
    'hush-money-2',
    'lux',
    'lustful-letters',
    'the-thousand-scars-series-book-two',
    'free-the-dark',
    'crowleys-cult',
    'rated-r',
    'nothing-but-darkness',
    'more-than-friends-collection',
    'puppet-master',
    'mortal-showdown',
    'unjustly-destroyed',
    'to-betray-a-master'
]))

In [None]:
# Load paragraph texts, paragraph tokens, labels, and metadata for the selected books only.
# Two of the returned values are not needed in this notebook and are discarded.
inputs, Y, categories, category_levels, book_ids, books_df, _, _, categories_df = bookcave.get_data(
    {'paragraphs', 'paragraph_tokens'},
    only_ids=train_book_ids,
    return_meta=True,
)
paragraph_texts = inputs['paragraphs']
token_texts = inputs['paragraph_tokens']
len(paragraph_texts), len(token_texts)

### Paragraphs and Tokens

In [None]:
# Transpose the per-text tuples into parallel sequences that are indexed by text.
# NOTE(review): the three-way unpack presumes each entry of `paragraph_texts` is a
# (paragraphs, section_ids, sections) triple — confirm against `bookcave.get_data`.
text_paragraphs, text_section_ids, text_sections = zip(*paragraph_texts)
# Keep only the first field of each `token_texts` entry; the second field is discarded.
# NOTE(review): presumes each entry of `token_texts` is a pair — confirm.
text_paragraph_tokens, _ = zip(*token_texts)
len(text_paragraphs)

In [None]:
# Reverse lookup from a book id to its position in `book_ids`.
book_id_to_index = dict((book_id, index) for index, book_id in enumerate(book_ids))

In [None]:
# Read the rating names from the rows of the first category.
# The whole 'rating' column is read at once (the same pattern used for the 'description'
# column) instead of building a row Series per index with `iloc`.
first_category_rows = categories_df[categories_df['category'] == categories[0]]
rating_names = list(first_category_rows['rating'])
rating_names

In [None]:
# For each category, the 'description' values of its rows, in row order.
category_descriptions = [
    list(categories_df.loc[categories_df['category'] == category, 'description'])
    for category in categories
]
category_descriptions[0]

### Correlated words

In [None]:
# Number of terms requested per category.
top_n = 64
# Maps a term to a list of (category index, score) pairs; a pair is appended each time the
# term appears in a category's term list.
# NOTE(review): the arguments 6804, 1, 1, 8192 are passed straight through to
# `read_formatted_term_scores`; their meaning is not visible in this notebook.
token_to_category_scores = dict()
for category_i, category in enumerate(categories):
    term_scores = correlated_words.read_formatted_term_scores(category, 6804, 1, 1, 8192, top_n)
    for rank, (term, score) in enumerate(term_scores):
        # The stored score is rank-based (`top_n` for the first term, decreasing by one per
        # term); the raw `score` read from the file is intentionally not used.
        token_to_category_scores.setdefault(term, []).append((category_i, top_n - rank))
token_to_category_scores['shit']

### Training Set

In [None]:
# text_category_labels[text][category][paragraph] -> label (-1 means "not labeled yet").
text_category_labels = []
# text_paragraph_h[text][paragraph] -> maturity heuristic of that paragraph.
text_paragraph_h = []
for book_id in book_ids:
    text_index = book_id_to_index[book_id]

    # Load the saved labels of every category, or start with every paragraph unlabeled.
    asin = books_df[books_df['id'] == book_id].iloc[0]['asin']
    category_labels = []
    for category in categories:
        labels = bookcave.get_labels(asin, category)
        category_labels.append([-1]*len(text_paragraphs[text_index]) if labels is None else labels)
    text_category_labels.append(category_labels)

    # Heuristic per paragraph: for each category take the best score among the paragraph's
    # distinct tokens (0 when none is a known term), then add the categories together.
    paragraph_h = []
    for tokens in text_paragraph_tokens[text_index]:
        best_scores = [0]*len(categories)
        for token in set(tokens):
            for category_i, score in token_to_category_scores.get(token, ()):
                best_scores[category_i] = max(best_scores[category_i], score)
        paragraph_h.append(sum(best_scores))
    text_paragraph_h.append(paragraph_h)

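Flatten the per-text heuristic scores into a single array, keeping the `(text_i, paragraph_i)` location of each paragraph.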

In [None]:
# Flatten the per-text heuristic lists into two parallel arrays:
#   all_locations[k] -> (text index, paragraph index) of the k-th paragraph overall
#   all_h[k]         -> heuristic score of that paragraph
# Comprehensions are used so that no module-level `text_i` / `paragraph_i` loop variables
# are assigned here; those names hold the labeling cursor, which a re-run of this cell
# would otherwise overwrite.
all_locations = np.array([
    (flat_text_i, flat_paragraph_i)
    for flat_text_i, paragraph_h in enumerate(text_paragraph_h)
    for flat_paragraph_i in range(len(paragraph_h))
])
all_h = np.array([h for paragraph_h in text_paragraph_h for h in paragraph_h])
len(all_locations)

In [None]:
# Indices into `all_h` / `all_locations`, ordered from the highest heuristic to the lowest.
sorted_indices = np.flip(np.argsort(all_h))
# Preview the paragraph with the highest heuristic.
text_paragraphs[all_locations[sorted_indices[0], 0]][all_locations[sorted_indices[0], 1]]

### Usage

In [None]:
# `text_paragraphs` is indexed [text_i][paragraph_i]; each element is a paragraph (str).
text_paragraphs[0][0]

In [None]:
# `text_section_ids` is indexed [text_i][paragraph_i]; each element is the id (int) of the
# section that the paragraph belongs to.
text_section_ids[0][0]

In [None]:
# `text_sections` is indexed [text_i][section_id]; each element is a section (str).
# Here the section of the first paragraph of the first text is looked up.
text_sections[0][text_section_ids[0][0]]

In [None]:
# `text_category_labels` is indexed [text_i][category_i][paragraph_i]; each element is a
# label (int), with -1 used for paragraphs that have no label.
text_category_labels[0][0][:10]

In [None]:
# `all_locations` is indexed [location_i]; each row is a (text_i, paragraph_i) pair.
all_locations[0]

In [None]:
# `sorted_indices` is indexed [sort_i]; each element is an index into `all_locations`
# (and `all_h`), ordered from the highest heuristic to the lowest.
sorted_indices[0]

## Label

In [None]:
# When `False`, the interface skips paragraphs that have already been labeled.
# When `True`, every paragraph is presented for labeling, even if it has already been labeled.
do_overwrite = False

Initialize the labeling cursor at the paragraph with the highest heuristic score.

In [None]:
# Position of the labeling cursor within `sorted_indices`; 0 is the paragraph with the
# highest heuristic.
sort_i = 0  # alternative starting point: len(all_locations)//2
# Location (text index, paragraph index) of the paragraph under the cursor.
text_i, paragraph_i = all_locations[sorted_indices[sort_i]]
# Number of positions in `sorted_indices` that the cursor moves per step.
step_size = 10

In [None]:
def move_to_next_paragraph():
    """Advance the labeling cursor by `step_size` positions in `sorted_indices`.

    The cursor is clamped to the last position. Updates the module-level `sort_i`,
    `text_i`, and `paragraph_i`.

    Returns:
        False when the cursor was already at the last position (nothing changes),
        True otherwise.
    """
    global sort_i, text_i, paragraph_i

    last_sort_i = len(sorted_indices) - 1
    if sort_i == last_sort_i:
        return False

    sort_i = min(last_sort_i, sort_i + step_size)
    location = all_locations[sorted_indices[sort_i]]
    text_i, paragraph_i = location
    return True


def move_to_previous_paragraph():
    """Move the labeling cursor back by `step_size` positions in `sorted_indices`.

    The cursor is clamped to the first position. Updates the module-level `sort_i`,
    `text_i`, and `paragraph_i`.

    Returns:
        False when the cursor was already at the first position (nothing changes),
        True otherwise.
    """
    global sort_i, text_i, paragraph_i

    if sort_i == 0:
        return False

    sort_i = max(0, sort_i - step_size)
    location = all_locations[sorted_indices[sort_i]]
    text_i, paragraph_i = location
    return True

Create widgets.

In [None]:
# HTML widget that holds the text of the paragraph being labeled.
paragraph_html = widgets.HTML(value='')

# One group of toggle buttons per category, with one button per level of that category.
category_toggle_buttons = []
for category_i, levels in enumerate(category_levels):
    level_descriptions = category_descriptions[category_i]
    # Each option pairs the rating name shown on the button with the level index it selects.
    options = [(rating_names[level_i], level_i) for level_i in range(len(levels))]
    # Levels and their descriptions are '|'-separated; the tooltip shows one item per line.
    tooltips = [
        '{}\n\n{}'.format(level.replace('|', '\n'), level_descriptions[level_i].replace('|', '\n'))
        for level_i, level in enumerate(levels)
    ]
    category_toggle_buttons.append(widgets.ToggleButtons(
        options=options,
        description=bookcave.CATEGORY_NAMES[categories[category_i]],
        disabled=False,
        button_style='',  # 'success', 'info', 'warning', 'danger' or ''
        tooltips=tooltips
    ))


def on_submit_button_clicked(button):
    """Store the selected levels for the current paragraph, then move to the next one.

    Unless `do_overwrite` is set, paragraphs that already have a label in every category
    are skipped. Re-renders the interface if a paragraph was reached, or prints a
    completion message if the cursor could not move any further.

    Args:
        button: The clicked button (unused).
    """
    def is_fully_labeled():
        # True when the paragraph under the cursor has a label in every category.
        labels = text_category_labels[text_i]
        return all(labels[category_i][paragraph_i] != -1 for category_i in range(len(categories)))

    # Collect the answers.
    for category_i, _levels in enumerate(category_levels):
        text_category_labels[text_i][category_i][paragraph_i] = category_toggle_buttons[category_i].value

    moved = move_to_next_paragraph()
    while moved and not do_overwrite and is_fully_labeled():
        moved = move_to_next_paragraph()

    if not moved:
        print('Finished training on {:d} books.'.format(len(text_paragraphs)))
        return
    display_paragraph_interface()


# Button that stores the selected levels and advances to the next paragraph.
submit_button = widgets.Button(
    description='Submit',
    icon='check',
    button_style='success',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip=(
        'Submit the above values as the categorical maturity rating levels for this paragraph.\n'
        'Then move on to the next paragraph.'
    ),
    disabled=False
)
submit_button.on_click(on_submit_button_clicked)


def on_back_button_clicked(button):
    """Step the cursor back and re-render the interface, if the cursor can move.

    Args:
        button: The clicked button (unused).
    """
    if not move_to_previous_paragraph():
        print('Cannot move to previous paragraph.')
        return
    display_paragraph_interface()


# Button that returns to the previously visited position in the labeling order.
back_button = widgets.Button(
    description='Back',
    button_style='warning',
    tooltip='Move to the previous paragraph.',
    disabled=False
)
back_button.on_click(on_back_button_clicked)

Start labeling.

In [None]:
def display_paragraph_interface():
    """Render the labeling interface for the paragraph under the cursor.

    Reads the module-level cursor (`text_i`, `paragraph_i`), clears the previous cell
    output, prints the book's metadata and its book-level rating levels, presets the toggle
    buttons (existing labels if the paragraph is fully labeled, otherwise the first level),
    and displays the widgets together with the paragraph text.
    """
    # Local import: only needed here, to escape the paragraph text before embedding it.
    import html

    # Calculate meta data.
    book_id = book_ids[text_i]
    book_index = book_id_to_index[book_id]
    # Look the book up by id, as the cells that load and save labels do, rather than
    # relying on the row order of `books_df` matching `book_ids`.
    book_row = books_df[books_df['id'] == book_id].iloc[0]
    book_title = book_row['title']
    book_authors = book_row['authors']
    book_y = Y[:, book_index]

    section_i = text_section_ids[text_i][paragraph_i]
    sections = text_sections[text_i]
    section = sections[section_i]
    paragraph = text_paragraphs[text_i][paragraph_i]

    # Clear any previous output in this cell.
    IPython.display.clear_output(wait=True)

    # Print meta data.
    c_width = 114
    print('-'*c_width)
    print('{} [book {:d} of {:d}]'.format(book_title, text_i + 1, len(book_ids)))
    print('{}'.format(book_authors))
    print()
    print('Actual categorical rating levels:')
    for category_i, level_i in enumerate(book_y):
        category = categories[category_i]
        level = category_levels[category_i][level_i]
        print('  {:28}: {} ({})'.format(category, rating_names[level_i], level))
    print('-'*c_width)
    print()
    print('What are the categorical maturity rating levels for this paragraph?')

    # Preset the toggle buttons: existing labels when every category has one, else level 0.
    is_labeled = all(text_category_labels[text_i][category_i][paragraph_i] != -1
                     for category_i in range(len(categories)))
    for category_i, toggle_buttons in enumerate(category_toggle_buttons):
        if is_labeled:
            toggle_buttons.value = text_category_labels[text_i][category_i][paragraph_i]
        else:
            toggle_buttons.value = 0

    # Display toggle buttons.
    for toggle_buttons in category_toggle_buttons:
        IPython.display.display(toggle_buttons)

    # Display submit button.
    IPython.display.display(submit_button)

    print()
    print('{} [section {:d} of {:d}]'.format(section, section_i + 1, len(sections)))
    print()
    print('[paragraph {:d} of {:d}]'.format(paragraph_i + 1, len(text_paragraphs[text_i])))
    print('='*c_width)
    # Escape the book text so that characters such as '<' and '&' are shown literally
    # instead of being interpreted as markup by the HTML widget.
    paragraph_html.value = '<p style="font-size:large;margin-left:8em;max-width:36em;">{}</p>'.format(
        html.escape(paragraph))
    IPython.display.display(paragraph_html)
    print('='*c_width)

    # Display back button.
    IPython.display.display(back_button)

In [None]:
# Render the interface for the paragraph under the cursor; the Submit and Back buttons
# re-render it from their click handlers.
display_paragraph_interface()

Save training labels.

In [None]:
# Write the labels of every text and category back to disk.
# A dedicated loop variable is used on purpose: `text_i` is the module-level cursor of the
# labeling interface, and reusing it here would leave the cursor pointing at the last text,
# so a later Submit click would store labels under the wrong text.
for save_text_i in range(len(text_category_labels)):
    book_id = book_ids[save_text_i]
    asin = books_df[books_df['id'] == book_id].iloc[0]['asin']
    # These do not depend on the category, so they are looked up once per text.
    sections = text_sections[save_text_i]
    section_ids = text_section_ids[save_text_i]

    for category_i, category in enumerate(categories):
        labels = text_category_labels[save_text_i][category_i]
        bookcave.save_labels(asin, category, sections, section_ids, labels, force=True, verbose=0)