## Load this section first

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from importlib import reload
import evaluation_helper
from loadutils import construct_embedding_matrix
from loadutils import conll2003Data, loadDevPredictionsData

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Paths to the CoNLL-2003 train / dev / test files (all in one directory).
_CONLL_EN_DIR = "../data/CoNLL-2003_NeuroNER/en"
TRAIN_FILE = _CONLL_EN_DIR + "/train.txt"
DEV_FILE = _CONLL_EN_DIR + "/valid.txt"
TEST_FILE = _CONLL_EN_DIR + "/test.txt"

# Saved label arrays for the dev and test splits (read later with np.load).
_EVAL_LABEL_DIR = "eval_labels"
DEV_LABELS = _EVAL_LABEL_DIR + "/devY.npy"
TEST_LABELS = _EVAL_LABEL_DIR + "/testY.npy"

## Demo  - Quick access

In [3]:
from evaluation_helper import get_f1_by_modelName, compare_models_by_f1

# Gold labels previously saved to disk for the dev and test splits.
devY, testY = np.load(DEV_LABELS), np.load(TEST_LABELS)

# Models to rank against each other by F1 on the dev labels.
modelName_list = [
    'glove50_learn_drop50_decode1000_caps3_conv5_cos_win7_pos_caps',
    'glove50_learn_drop50_decode1000_caps3_conv5_cos_win7_base',
    'glove50_learn_drop50_decode1000_caps3_conv3_cos_win7_base',
    'glove50_learn_drop50_decode1000_caps3_conv3_cos_win7_pos_caps',
]
compare_models_by_f1(modelName_list, y_true=devY, return_results=False)

print("------------------------------------------------------------")

# F1 for a single model. Kept as the last expression of the cell so the
# returned score is also echoed as the cell's output.
modelName1 = 'glove_learn_dropout_pos_caps'
get_f1_by_modelName(modelName1, y_true=devY)


rank 1
modelName: glove50_learn_drop50_decode1000_caps3_conv3_cos_win7_base
f1= 0.92111423221

rank 2
modelName: glove50_learn_drop50_decode1000_caps3_conv5_cos_win7_pos_caps
f1= 0.918615639027

rank 3
modelName: glove50_learn_drop50_decode1000_caps3_conv5_cos_win7_base
f1= 0.918493271434

rank 4
modelName: glove50_learn_drop50_decode1000_caps3_conv3_cos_win7_pos_caps
f1= 0.916107777386
------------------------------------------------------------
Model Name : glove_learn_dropout_pos_caps
F1 Score   : 0.9021899533675697


0.90218995336756969

## Demo  - Full Report for one model

In [4]:
# Settings used to rebuild the model inputs for the report below.
global_max_features = 20000  # vocabulary size handed to buildVocab
global_embed_dim = 50        # embedding dimension handed to construct_embedding_matrix
windowLength = 9             # number of tokens in each input window
# NOTE(review): the model name below contains "win7" while windowLength is 9 -- confirm they agree.

# Trained model whose saved dev predictions are loaded for the report.
modelName = 'glove50_learn_drop50_decode1000_caps3_conv3_cos_win7_base'

#### Just load this (30 sec)

In [5]:
# Build the vocabulary from the TRAINING split only, then reuse the same
# vocabData object to format every split so all of them share one vocabulary.
vocabData = conll2003Data(TRAIN_FILE)
vocabData.buildVocab(vocabSize=global_max_features)

# Format training data into fixed-length windows
# (word ids, PoS ids, capitalization ids, labels).
trainX, trainX_pos, trainX_capitals, trainY = vocabData.formatWindowedData(
    vocabData.train_sentences,
    windowLength=windowLength,
    verbose=False)

# Read in and format the dev data.
# NOTE: devY / testY assigned below replace any arrays of the same name that
# were loaded with np.load in the quick-access demo cell.
devSents = vocabData.readFile(DEV_FILE)
devX, devX_pos, devX_capitals, devY = vocabData.formatWindowedData(
    devSents,
    windowLength=windowLength,
    verbose=False)

# Read in and format the test data.
testSents = vocabData.readFile(TEST_FILE)
testX, testX_pos, testX_capitals, testY = vocabData.formatWindowedData(
    testSents,
    windowLength=windowLength,
    verbose=False)

# Load embeddings
embedding_matrix = construct_embedding_matrix(global_embed_dim,
                                              global_max_features, vocabData)

# Get decoder Y -- embedding of the center word of each window.
# The center column is derived from windowLength (it is 4 for the 9-token
# window used here) rather than hard-coded, so changing windowLength cannot
# silently pick an off-center word.
centerIdx = windowLength // 2
train_decoderY = embedding_matrix[trainX[:, centerIdx]]
dev_decoderY = embedding_matrix[devX[:, centerIdx]]
test_decoderY = embedding_matrix[testX[:, centerIdx]]

----------------------------------------------------
reading file from path ../data/CoNLL-2003_NeuroNER/en/train.txt
'readFile'  1207.47 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  1123.93 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  1889.00 ms
----------------------------------------------------
reading file from path ../data/CoNLL-2003_NeuroNER/en/valid.txt
'readFile'  277.68 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  451.96 ms
----------------------------------------------------
reading file from path ../data/CoNLL-2003_NeuroNER/en/test.txt
'readFile'  248.99 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  548.11 ms
Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B

In [6]:
# Load the saved dev-set predictions for the chosen model:
# raw label predictions, decoder embedding predictions, and predicted labels.
(dev_raw_y_pred,
 dev_raw_y_pred_decoder_embeddings,
 dev_y_pred) = loadDevPredictionsData(modelName)

# Construct the report object from gold labels and predictions.
report_obj = evaluation_helper.EvalDev_Report(
    modelName=modelName,
    y_true=devY,
    raw_y_pred=dev_raw_y_pred,
    y_pred=dev_y_pred,
    y_true_decoder=dev_decoderY,
    y_pred_decoder=dev_raw_y_pred_decoder_embeddings,
)

# Attach the vocabulary, the formatted dev inputs and the embedding matrix.
report_obj.connect_to_dataClass(vocabData)
report_obj.connect_to_devData(devData=(devX, devX_pos, devX_capitals, devY))
# (sic) the method name is spelled with three d's in evaluation_helper.
report_obj.connect_to_embeddding_matrix(embedding_matrix)

#### print full report here

In [8]:
# Print every section of the evaluation report, showing k examples per ranking.
report_obj.print_whole_report(k=2, print_window=True,
                              gold_to_pred_counts=True, brief_summary=True,
                              worst_overall=True, worst_ner_mismatch=True,
                              worst_by_label=True,
                              worst_hallucinations=True, worst_missed_ner=True,
                              best_ner_match=True)

# To save this report to a text file (assuming the report is written to
# stdout): start the cell that calls print_whole_report with
#
#     %%capture report_out
#
# and then, in a separate code cell, run
#
#     import os
#     with open(os.path.expanduser('~/xyzzy.txt'), 'w') as f:
#         f.write(report_out.stdout)
#
# Notes:
#   * open() does not expand '~'; os.path.expanduser is needed for that.
#   * IPython's _N / Out[N] hold only the value returned by cell N, not the
#     text it printed, so they cannot be used to recover printed output.


-------------------------BRIEF SUMMARY-------------------------

Model     glove50_learn_drop50_decode1000_caps3_conv3_cos_win7_base
Precision 0.9275191514437242
Recall    0.9147971637800767
f1 score  0.9211142322097379

Gold NER label counts:
42759 : ['O'] (tag3)
1341 : ['B-ORG'] (tag6)
1837 : ['B-LOC'] (tag4)
922 : ['B-MISC'] (tag9)
346 : ['I-MISC'] (tag11)
1842 : ['B-PER'] (tag5)
1307 : ['I-PER'] (tag7)
257 : ['I-LOC'] (tag10)
751 : ['I-ORG'] (tag8)

Predicted NER label counts:
42877 : ['O'] (tag3)
1333 : ['B-ORG'] (tag6)
1891 : ['B-LOC'] (tag4)
866 : ['B-MISC'] (tag9)
272 : ['I-MISC'] (tag11)
1886 : ['B-PER'] (tag5)
1307 : ['I-PER'] (tag7)
273 : ['I-LOC'] (tag10)
657 : ['I-ORG'] (tag8)

----------------GOLD to PREDICTION LABELS COUNTS---------------


Gold label "['O']" (tag3), prediction label counts:
42633 (0.9973%): "['O']" (tag3)
14 (0.0003%): "['B-LOC']" (tag4)
23 (0.0005%): "['B-PER']" (tag5)
33 (0.0008%): "['B-ORG']" (tag6)
5 (0.0001%): "['I-PER']" (tag7)
10 (0.0002%): "['I

top 2 results

ID 40660
KL divergence 5.099338054656982
FEATURES:   "i", PRP, allCaps
Gold NER    ['I-PER']
Pred NER    ['O']
Text window ['-', 'belgian', 'king', '<unk>', 'i', 'born', '.', '</s>', '</s>']
PoS window  [':', 'JJ', 'NNP', 'NNP', 'PRP', 'VBN', '.', '</s>', '</s>']
Caps window ['noinfo', 'upperInitial', 'upperInitial', 'upperInitial', 'allCaps', 'lowercase', 'noinfo', '</s>', '</s>']

ID 1522
KL divergence 5.0842509269714355
FEATURES:   "<unk>", JJ, upperInitial
Gold NER    ['I-PER']
Pred NER    ['O']
Text window ['hands', 'of', 'stone', '"', '<unk>', 'climbs', 'into', 'the', 'ring']
PoS window  ['NNS', 'IN', 'NNP', '"', 'JJ', 'NNS', 'IN', 'DT', 'NN']
Caps window ['upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']
----------------------------
Label I-ORG (tag8)
indices counts = 739
ranked by worst cross-entropy loss
top 2 results

ID 36488
KL divergence 5.876728057861328
FEATURES:   "angeles", NNP, upp

"\nwith open('~/xyzzy.txt', 'w+') as f:\n                        f.write(_14)\n"

#### look at decoder (reconstruction) loss here

In [7]:
# Only run this when the decoder was switched on for the CapsNet model.
report_obj.print_eval_decoder_loss(
    k=3,
    nnk=5,
    print_embeddings=False,
    print_window=True,
)


-------------------------WORST RECONSTRUCTIONS-------------------------

displaying top 3 results




ID 20730
reconstruction cosine proximity loss = 0.4873424584434484
reconstruction GOLD word "['filing']"
reconstruction PRED word "['toured', 'celebrate', 'oasis', 'quartering', 'celebrations']"

word_window: ['<s>', '<s>', '<s>', 'ipo', 'filing', '--', '<unk>', '<unk>', 'inc']




ID 21724
reconstruction cosine proximity loss = 0.4650989156958567
reconstruction GOLD word "['issuer']"
reconstruction PRED word "['toured', 'oasis', 'celebrate', 'depopulated', 'revellers']"

word_window: ['<s>', '<s>', '<s>', '<s>', 'issuer', ':', 'bay', 'co', 'building']




ID 3087
reconstruction cosine proximity loss = 0.45533098166510905
reconstruction GOLD word "['jeff']"
reconstruction PRED word "['keenness', 'peacemaker', 'stabilising', 'u.s.-sponsored', 'arab']"

word_window: ['connection', '--', 'bad', 'boy', 'jeff', 'tarango', '.', '</s>', '</s>']

-------------------------BEST RECONSTRUCTIONS-

## Just FYI -- more attributes and methods

In [12]:
%%capture

# gold labels
print ("devY",devY.shape)
# raw predictions made by a trained model on dev set
print ("dev_raw_y_pred", dev_raw_y_pred.shape)
# dev prediction labels
print ("dev_y_pred", dev_y_pred.shape)
# decoder on, decoder dev predictions
# decoder off, empty
print ("dev_raw_y_pred_decoder_embeddings",dev_raw_y_pred_decoder_embeddings.shape)

report_obj.recall
report_obj.precision
report_obj.f1
report_obj.gold_cts # gold label distribution
report_obj.pred_cts # predicted label distribution
# when gold is "O", but model thinks there is a NER tag
report_obj.hallucination_idx 
# when gold is a NER tag, but model think it is "O"
report_obj.missed_ner_idx
# when both model and gold indicate a NER tag, and the tags matches
report_obj.match_ner_idx
# when both model and gold indicate a NER tag, and the tags mismatch
report_obj.mismatch_ner_idx 
# dictionary[gold_label][prediction_label] --> data indices
report_obj.gold_pred_idx_dict 
# dictionary[gold_label][prediction_label] --> count
report_obj.gold_pred_ct_dict 
# dictionary[gold_label][prediction_label] --> data indices
report_obj.gold_pred_idx_dict 

In [None]:
# Each call below prints one section of the whole report on its own.

# brief_summary
report_obj.print_brief_summary()

# gold_to_pred_counts
report_obj.print_gold_to_pred_counts(return_dict=False)

# worst_overall
report_obj.print_overall_rank(worst=True, k=5, print_window=True)

# worst_ner_mismatch
report_obj.print_idxlist_to_textlists(
    idx_list=report_obj.mismatch_ner_idx,
    k=2,
    return_indices=False,
)

# worst_by_label
report_obj.print_rank_per_gold_label(worst=True, k=2, print_window=True)

# worst_hallucinations: gold is "O", but the model thinks there is a NER tag
report_obj.print_idxlist_to_textlists(
    idx_list=report_obj.hallucination_idx,
    k=2,
    return_indices=False,
)

# worst_missed_ner: gold is a NER tag, but the model thinks it is "O"
report_obj.print_idxlist_to_textlists(
    idx_list=report_obj.missed_ner_idx,
    k=5,
    return_indices=False,
)

# best_ner_match
report_obj.print_idxlist_to_textlists(
    idx_list=report_obj.match_ner_idx,
    k=2,
    worst=False,
)

## Ignore below