In [1]:
# Switch the kernel's working directory to the parent directory; the relative
# 'models' path used below is resolved against the working directory.
# NOTE(review): presumably the notebook lives one level below the repository root — confirm.
# NOTE: not idempotent — re-running this cell moves up one more level.
%cd ..

/Users/brewer/Code/UU/CS6390/litbank-entities


# Demo: entity recognition with LitBank models

In [2]:
import operator
import os

import ipywidgets as widgets
from IPython.display import display

from litbank_entities import extract, linguistics, litbank
from litbank_entities.model import hmm_recognizer

2023-03-14 10:10:09.976835: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Root directory (relative to the working directory) under which each model
# class gets its own subdirectory.
MODELS_DIR = 'models'
# Name of the subdirectory, inside the model-class directory, where this
# demo's trained model is saved and loaded from.
MODEL_NAME = 'demo'

## Load data, language utility

In [4]:
# Load the per-text tokens and labels, then run the sentence-level preparation
# stages in order; every stage takes and returns a (tokens, labels) pair.
text_sentence_tokens, text_sentence_labels = litbank.get_text_sentence_tokens_labels()
sentence_tokens, sentence_labels = text_sentence_tokens, text_sentence_labels
for _prepare in (litbank.flatten_texts, linguistics.process, litbank.split_large_sentences):
    sentence_tokens, sentence_labels = _prepare(sentence_tokens, sentence_labels)

# Tokens and labels must stay aligned sentence-for-sentence.
assert len(sentence_tokens) == len(sentence_labels)
print('Train sentences: {:d}'.format(len(sentence_tokens)))

Train sentences: 8567


In [5]:
# Language utility; called on raw sentence strings in the inference cell to
# split them into tokens.
# NOTE(review): presumably a spaCy pipeline — confirm in linguistics.get_nlp.
nlp = linguistics.get_nlp()

## Train

In [6]:
# Dropdown label -> short model name passed to `extract` in the training cell.
model_option_to_name = {
    'CRF (Conditional Random Field)': 'crf',
    'HMM (Hidden Markov Model using only tokens)': 'hmm',
    'ZeroR (dummy baseline)': 'zero',
}
# Keep the parallel lists available; dict insertion order matches the original order.
model_options = list(model_option_to_name)
model_names = list(model_option_to_name.values())

# Let the reader pick which model to train/load; the first option is the default.
model_dropdown = widgets.Dropdown(
    description='Model:',
    options=model_options,
    value=model_options[0],
    disabled=False,
)
display(model_dropdown)

Dropdown(description='Model:', options=('CRF (Conditional Random Field)', 'HMM (Hidden Markov Model using only…

In [7]:
# Build the model selected in the dropdown for every entity category.
classname = model_option_to_name[model_dropdown.value]
categories = litbank.ENTITY_CATEGORIES
resources = extract.create_model_resources(classname)
model = extract.create_model(classname, categories, resources)

# Saved models are looked up at <MODELS_DIR>/<classname>/<MODEL_NAME>/model_<category>.
model_dir = os.path.join(MODELS_DIR, classname)
os.makedirs(model_dir, exist_ok=True)
demo_dir = os.path.join(model_dir, MODEL_NAME)

# Train (and cache) when the demo directory or any per-category entry is
# missing; otherwise reuse the cached model.
if not os.path.exists(demo_dir) or \
        any(not os.path.exists(os.path.join(demo_dir, 'model_{}'.format(category))) for category in categories):
    print('Training model...')
    model.train(sentence_tokens, sentence_labels)
    print('Done.')
    model.save_model(demo_dir)
    print('Saved model.')
else:
    model.load_model(demo_dir)
    # Report success only after the load has completed, so a failed load is
    # not preceded by a misleading "Loaded model." message.
    print('Loaded model.')

Loaded model.


2023-03-14 10:11:02.461145: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Infer

In [8]:
# Default input: two newline-separated sentences used to pre-fill the text area.
pearl_harbor = (
    'We have witnessed this morning the distant view a brief full battle of Pearl Harbor'
    ' and the severe bombing of Pearl Harbor by enemy planes, undoubtedly Japanese.\n'
    'The city of Honolulu has also been attacked and considerable damage done.'
)

# Free-text input; each line is treated as one sentence by the inference cell.
sentence_textarea = widgets.Textarea(
    description='Find entities:',
    placeholder='Sentences separated by newline characters.',
    value=pearl_harbor,
    layout=dict(height='16em'),
)
display(sentence_textarea)

Textarea(value='We have witnessed this morning the distant view a brief full battle of Pearl Harbor and the se…

In [9]:
# Tokenize one sentence per line of the text area. Blank lines (e.g. a trailing
# newline or an empty line between sentences) are skipped so that no empty
# "sentence" is handed to the tokenizer or the model.
test_sentence_tokens = [
    list(map(str, nlp(sentence)))
    for sentence in sentence_textarea.value.split('\n')
    if sentence.strip()
]
# Nothing to predict when the text area holds no non-blank line.
test_sentence_preds = model.predict(test_sentence_tokens) if test_sentence_tokens else []

# Width of the right-aligned token column; label columns are 5 characters wide.
token_spaces = 18

# Header row: one column per entity category.
print(' ' * token_spaces, end='')
for category in categories:
    print(' {:<5}'.format(category), end='')
print()

# One row per token, listing the predicted label in each column.
for s, tokens in enumerate(test_sentence_tokens):
    preds = test_sentence_preds[s]
    for i, token in enumerate(tokens):
        print('{:>{w}}'.format(token, w=token_spaces), end='')
        for nest_pred in preds[i]:
            print(' {:<5}'.format(nest_pred), end='')
        print()

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
                   PER   FAC   GPE   LOC   VEH   ORG  
                We O     O     O     O     O     O    
              have O     O     O     O     O     O    
         witnessed O     O     O     O     O     O    
              this O     O     O     O     O     O    
           morning O     O     O     O     O     O    
               the O     O     O     O     O     O    
           distant O     O     O     O     O     O    
              view O     O     O     O     O     O    
                 a O     O     O     O     O     O    
             brief O     O     O     I-LOC O     O    
              full O     O     O     I-LOC O     O    
            battle O     O     O     I-LOC O     O    
                of O     O     O     I-LOC O     O    
             Pearl B-PER O   