In [1]:
# Move the kernel's working directory up one level (the captured output shows the
# repository root), presumably so the relative `models` path and the
# `litbank_entities` import used below resolve.
# NOTE(review): this is a relative step, so re-running the cell without restarting
# the kernel moves up one more directory each time.
%cd ..

/Users/brewer/Code/UU/CS6390/litbank-entities


# Demo: LitBank entity extraction

In [2]:
import operator
import os

import ipywidgets as widgets
from IPython.display import display

from litbank_entities import extract, linguistics, litbank

2023-04-10 13:20:41.614170: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 The versions of TensorFlow you are currently using is 2.12.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [3]:
# Root directory (relative to the working directory) under which models are saved;
# the train/load cell creates one subdirectory per model class beneath it.
MODELS_DIR = 'models'
# Name of the subdirectory, inside the model-class directory, that holds the model
# saved or loaded by this notebook.
MODEL_NAME = 'demo'

## Load data and language utilities

In [4]:
# Build the training set as two parallel lists: tokens per sentence and labels per sentence.
# Stage names follow the helper names; what each helper does internally is not visible here.

# Stage 1: corpus grouped by text.
text_sentence_tokens, text_sentence_labels = litbank.get_text_sentence_tokens_labels()

# Stage 2: collapse the per-text grouping.
flat_tokens, flat_labels = litbank.flatten_texts(text_sentence_tokens, text_sentence_labels)

# Stage 3: project-specific linguistic processing of tokens and labels.
processed_tokens, processed_labels = linguistics.process(flat_tokens, flat_labels)

# Stage 4: final training sentences (over-long sentences split by the helper).
sentence_tokens, sentence_labels = litbank.split_large_sentences(processed_tokens, processed_labels)

# Tokens and labels must stay aligned sentence-for-sentence.
sentence_count = len(sentence_tokens)
assert sentence_count == len(sentence_labels)
print('Train sentences: {:d}'.format(sentence_count))

Train sentences: 8567


In [5]:
# Language pipeline provided by the project's `linguistics` module. The inference
# cell calls it on each raw input sentence and converts the yielded items to strings
# to obtain that sentence's tokens.
nlp = linguistics.get_nlp()

## Train

In [6]:
# Single source of truth: human-readable dropdown label -> short model identifier.
# The identifier is what the train/load cell passes to `extract.create_model_resources`
# and `extract.create_model`. Keeping label and identifier side by side avoids the two
# going out of sync, which was possible when they lived in separate index-aligned lists.
model_option_to_name = {
    'DistilBERT (Transformer-based, condensed)': 'bert',
    'CRF (Conditional Random Field)': 'crf',
    'HMM (Hidden Markov Model using only tokens)': 'hmm',
    'ZeroR (dummy baseline)': 'zero',
}

# Derived views, kept under their original names for any cell that still refers to
# them. Dicts preserve insertion order, so both lists keep the order written above.
model_options = list(model_option_to_name)
model_names = list(model_option_to_name.values())

# Dropdown whose `.value` is the selected label; the first entry is preselected.
model_dropdown = widgets.Dropdown(
    options=model_options,
    value=model_options[0],
    description='Model:',
    disabled=False,
    layout={'width': '36em'},
)
display(model_dropdown)

Dropdown(description='Model:', layout=Layout(width='36em'), options=('DistilBERT (Transformer-based, condensed…

In [7]:
# Resolve the dropdown selection (a label) to the short model identifier.
# NOTE(review): the model used below therefore depends on interactive widget state at
# the time this cell runs; on a fresh "Run All" it is the dropdown's default entry.
classname = model_option_to_name[model_dropdown.value]
# The demo is restricted to the PER category; the trailing comment names the
# alternative (presumably the full category set -- confirm in `litbank`).
categories = ('PER',)  # litbank.ENTITY_CATEGORIES
resources = extract.create_model_resources(classname)
model = extract.create_model(classname, categories, resources)

# Saved models live at <MODELS_DIR>/<classname>/<MODEL_NAME>.
model_dir = os.path.join(MODELS_DIR, classname)
demo_dir = os.path.join(model_dir, MODEL_NAME)
os.makedirs(demo_dir, exist_ok=True)

# The cell looks for one saved artifact per category at <demo_dir>/model_<category>.
# demo_dir itself is guaranteed to exist here (created just above), so a separate
# "directory missing" test could never be true; only the artifacts decide train vs. load.
missing_categories = [
    category for category in categories
    if not os.path.exists(os.path.join(demo_dir, 'model_{}'.format(category)))
]

if missing_categories:
    # At least one category has no saved artifact: train from scratch and persist.
    print('Training model...')
    model.train(sentence_tokens, sentence_labels)
    print('Done.')
    model.save_model(demo_dir)
    print('Saved model.')
else:
    # Every expected artifact is present: reuse the saved model.
    model.load_model(demo_dir)
    print('Loaded model.')

All model checkpoint layers were used when initializing TFDistilBertForTokenClassification.

All the layers of TFDistilBertForTokenClassification were initialized from the model checkpoint at models/bert/demo/model_PER.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForTokenClassification for predictions without further training.


Loaded model.


## Infer

In [8]:
# Default input for the demo: two sentences, one per line (separated by '\n').
pearl_harbor = (
    'We have witnessed this morning the distant view a brief full battle of Pearl Harbor'
    ' and the severe bombing of Pearl Harbor by enemy planes, undoubtedly Japanese.\n'
    'The city of Honolulu has also been attacked and considerable damage done.'
)

# Free-text input box, pre-filled with the default above; the inference cell reads
# its current `.value`.
textarea_layout = {'height': '18em', 'width': '60em'}
sentence_textarea = widgets.Textarea(
    value=pearl_harbor,
    placeholder='Sentences separated by newline characters.',
    description='Find entities:',
    layout=textarea_layout,
)
display(sentence_textarea)

Textarea(value='We have witnessed this morning the distant view a brief full battle of Pearl Harbor and the se…

In [9]:
# One input sentence per non-blank line of the textarea. `splitlines()` also copes
# with '\r\n' line endings, and blank lines (for example a trailing newline) are
# skipped so the model is not handed an empty token list.
input_sentences = [line for line in sentence_textarea.value.splitlines() if line.strip()]

# Tokenize each sentence with the language pipeline, keeping each token's string form.
test_sentence_tokens = [[str(token) for token in nlp(sentence)] for sentence in input_sentences]

# Predictions are read index-aligned with `test_sentence_tokens` by the display cell.
test_sentence_preds = model.predict(test_sentence_tokens)

2023-04-10 13:20:58.743233: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [2,31]
	 [[{{node Placeholder/_1}}]]




In [10]:
# Phrases extracted from the predictions, indexed as [category index][sentence index].
category_test_sentence_phrases = litbank.get_category_sentence_phrases(test_sentence_preds, categories=categories)

# Report, for every predicted sentence, its tokens and the phrases found per category.
for sentence_index, _ in enumerate(test_sentence_preds):
    tokens = test_sentence_tokens[sentence_index]
    print('({:d}) {}'.format(sentence_index + 1, tokens))
    print()
    for category_index, category in enumerate(categories):
        print('    Phrases ({}):'.format(category))
        sentence_phrases = category_test_sentence_phrases[category_index][sentence_index]
        for phrase_number, phrase in enumerate(sentence_phrases, start=1):
            # The first two items of a phrase are used as a token slice [start:end].
            start, end = phrase[:2]
            print('        [{:d}] ({:d}, {:d}): {}'.format(phrase_number, start, end, tokens[start:end]))
        print()
    print()

(1) ['We', 'have', 'witnessed', 'this', 'morning', 'the', 'distant', 'view', 'a', 'brief', 'full', 'battle', 'of', 'Pearl', 'Harbor', 'and', 'the', 'severe', 'bombing', 'of', 'Pearl', 'Harbor', 'by', 'enemy', 'planes,', 'undoubtedly', 'Japanese.']

    Phrases (PER):


(2) ['The', 'city', 'of', 'Honolulu', 'has', 'also', 'been', 'attacked', 'and', 'considerable', 'damage', 'done.']

    Phrases (PER):


