## Load this section first

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from importlib import reload
import evaluation_helper

from loadutils import conll2003Data, loadDevPredictionsData

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
TRAIN_FILE = "../data/conll2003/eng.train"
DEV_FILE = "../data/conll2003/eng.testa"
TEST_FILE = "../data/conll2003/eng.testb"

global_max_features = 20000
windowLength = 9
#testNumSents = 20000

# Use training set to build vocab here
vocabData = conll2003Data(TRAIN_FILE)
vocabData.buildVocab( vocabSize=global_max_features)

# Format training data
trainX, trainX_pos, trainX_capitals, trainY  = vocabData.formatWindowedData( 
                                                  vocabData.train_sentences, 
                                                  windowLength=windowLength,
                                                  verbose=False)

# read in dev data
devSents = vocabData.readFile( DEV_FILE)
devX, devX_pos, devX_capitals, devY = vocabData.formatWindowedData( 
                                              devSents, 
                                              windowLength=windowLength,
                                              verbose=False)

# read in the test data
testSents = vocabData.readFile( TEST_FILE)
testX, testX_pos, testX_capitals, testY = vocabData.formatWindowedData( 
                                                testSents, 
                                                windowLength=windowLength,
                                                verbose=False)

----------------------------------------------------
reading file from path ../data/conll2003/eng.train
'readFile'  1391.84 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  1239.63 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  2305.73 ms
----------------------------------------------------
reading file from path ../data/conll2003/eng.testa
'readFile'  298.97 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  531.71 ms
----------------------------------------------------
reading file from path ../data/conll2003/eng.testb
'readFile'  279.19 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  591.56 ms


## Demo  - Quick access

In [4]:
reload(evaluation_helper)
from evaluation_helper import get_f1_by_modelName, compare_models_by_f1

print ("------------------------------------------------------------")
modelName1 = 'encoder_2e_withsaving_again'
get_f1_by_modelName(modelName1, y_true=devY)

------------------------------------------------------------


0.88015113944975798

In [6]:
# Quick access f1
reload(evaluation_helper)
from evaluation_helper import get_f1_by_modelName, compare_models_by_f1

modelName_list = ['encoder_2e_withsaving_again', 'encoder_2e_withsaving_again']
compare_models_by_f1(modelName_list, y_true=devY, return_results=False)

print ("------------------------------------------------------------")
modelName1 = 'encoder_2e_withsaving_again'
get_f1_by_modelName(modelName1, y_true=devY)


rank 1
modelName: encoder_2e_withsaving_again
f1= 0.88015113945
------------------------------------------------------------


0.88015113944975798

## Demo  - report class object

In [72]:
reload(evaluation_helper)
from evaluation_helper import EvalDev_Report

In [73]:
# load model predictions
modelName = 'encoder_2e_withsaving_again'
dev_raw_y_pred, dev_raw_y_pred_decoder_embeddings, dev_y_pred = loadDevPredictionsData(modelName)

# construct report object
report_obj = evaluation_helper.EvalDev_Report(modelName=modelName, y_true=devY, raw_y_pred=dev_raw_y_pred, y_pred=dev_y_pred) 
report_obj.connect_to_dataClass(vocabData)
report_obj.connect_to_devData(devData=(devX, devX_pos, devX_capitals, devY))

In [45]:
%%capture
# just FYI -- can ignore this chunk

# gold labels
print ("devY",devY.shape)
# raw predictions made by a trained model on dev set
print ("dev_raw_y_pred", dev_raw_y_pred.shape)
# dev prediction labels
print ("dev_y_pred", dev_y_pred.shape)
# decoder on, decoder dev predictions
# decoder off, empty
print ("dev_raw_y_pred_decoder_embeddings",dev_raw_y_pred_decoder_embeddings.shape)

report_obj.recall
report_obj.precision
report_obj.f1
report_obj.gold_cts # gold label distribution
report_obj.pred_cts # predicted label distribution
# when gold is "O", but model thinks there is a NER tag
report_obj.hallucination_idx 
# when gold is a NER tag, but model think it is "O"
report_obj.missed_ner_idx
# when both model and gold indicate a NER tag, and the tags matches
report_obj.match_ner_idx
# when both model and gold indicate a NER tag, and the tags mismatch
report_obj.mismatch_ner_idx 
# dictionary[gold_label][prediction_label] --> data indices
report_obj.gold_pred_idx_dict 
# dictionary[gold_label][prediction_label] --> count
report_obj.gold_pred_ct_dict 
# dictionary[gold_label][prediction_label] --> data indices
report_obj.gold_pred_idx_dict 

In [70]:
report_obj.print_overall_rank(worst=True, k=5, print_window=True)

indices counts = 51362
ranked by worst cross-entropy loss


KeyboardInterrupt: 

In [74]:
report_obj.print_rank_per_gold_label(worst=True, k=2, print_window=True)

----------------------------
Label O (tag3)
indices counts = 42759
ranked by worst cross-entropy loss
top 2 results

ID 32072
KL divergence 8.6408109664917
FEATURES:   "jr", NNP, upperInitial
Gold NER    ['O']
Pred NER    ['I-PER']
Text window ['<s>', 'DG.', 'al', '<unk>', 'jr', '(', 'u.s.', ')', ',']
PoS window  ['<s>', 'CD', 'NNP', 'NNP', 'NNP', '(', 'NNP', ')', ',']
Caps window ['<s>', 'noinfo', 'upperInitial', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'noinfo']

ID 48287
KL divergence 6.297656059265137
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['O']
Pred NER    ['I-PER']
Text window ['bag', 'safety', ':', 'everyone', '<unk>', ',', 'kids', 'in', 'back']
PoS window  ['NNP', 'NNP', ':', 'NN', 'NNP', ',', 'NNPS', 'IN', 'RB']
Caps window ['upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'lowercase', 'upperInitial']
----------------------------
Label I-PER (tag4)
indices counts = 3149
ranked by worst cross-en

In [9]:
report_obj.print_brief_summary()

Model     encoder_2e_withsaving_again
Precision 0.8943011397720456
Recall    0.8664419388585377
f1 score  0.880151139449758

Gold NER label counts:
42759 : ['O'] (tag3)
2092 : ['I-ORG'] (tag5)
2094 : ['I-LOC'] (tag6)
1264 : ['I-MISC'] (tag7)
3149 : ['I-PER'] (tag4)
4 : ['B-MISC'] (tag8)

Predicted NER label counts:
43027 : ['O'] (tag3)
2226 : ['I-LOC'] (tag6)
1005 : ['I-MISC'] (tag7)
3241 : ['I-PER'] (tag4)
1863 : ['I-ORG'] (tag5)


In [10]:
print ("these two tags were never used")
print (report_obj.nerTags.ids_to_words([9]))
print (report_obj.nerTags.ids_to_words([10]))

these two tags were never used
['B-ORG']
['B-LOC']


In [116]:
report_obj.print_gold_to_pred_counts(return_dict=False)


Gold label "['O']" (tag3), prediction label counts:
42601 (0.9963%): "['O']" (tag3)
40 (0.0009%): "['I-PER']" (tag4)
75 (0.0018%): "['I-ORG']" (tag5)
11 (0.0003%): "['I-LOC']" (tag6)
32 (0.0007%): "['I-MISC']" (tag7)
0 (0.0%): "['B-MISC']" (tag8)
0 (0.0%): "['B-ORG']" (tag9)
0 (0.0%): "['B-LOC']" (tag10)

Gold label "['I-PER']" (tag4), prediction label counts:
45 (0.0143%): "['O']" (tag3)
3021 (0.9594%): "['I-PER']" (tag4)
64 (0.0203%): "['I-ORG']" (tag5)
17 (0.0054%): "['I-LOC']" (tag6)
2 (0.0006%): "['I-MISC']" (tag7)
0 (0.0%): "['B-MISC']" (tag8)
0 (0.0%): "['B-ORG']" (tag9)
0 (0.0%): "['B-LOC']" (tag10)

Gold label "['I-ORG']" (tag5), prediction label counts:
173 (0.0827%): "['O']" (tag3)
115 (0.055%): "['I-PER']" (tag4)
1579 (0.7548%): "['I-ORG']" (tag5)
186 (0.0889%): "['I-LOC']" (tag6)
39 (0.0186%): "['I-MISC']" (tag7)
0 (0.0%): "['B-MISC']" (tag8)
0 (0.0%): "['B-ORG']" (tag9)
0 (0.0%): "['B-LOC']" (tag10)

Gold label "['I-LOC']" (tag6), prediction label counts:
36 (0.0172%): "

In [140]:
# when gold is "O", but model thinks there is a NER tag
report_obj.print_idxlist_to_textlists(idx_list=report_obj.hallucination_idx, k=2, return_indices=False)

indices counts = 158
ranked by worst cross-entropy loss
top 2 results

ID 32072
KL divergence 8.6408109664917
FEATURES:   "jr", NNP, upperInitial
Gold NER    ['O']
Pred NER    ['I-PER']
Text window ['<s>', 'DG.', 'al', '<unk>', 'jr', '(', 'u.s.', ')', ',']
PoS window  ['<s>', 'CD', 'NNP', 'NNP', 'NNP', '(', 'NNP', ')', ',']
Caps window ['<s>', 'noinfo', 'upperInitial', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'noinfo']

ID 48287
KL divergence 6.297656059265137
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['O']
Pred NER    ['I-PER']
Text window ['bag', 'safety', ':', 'everyone', '<unk>', ',', 'kids', 'in', 'back']
PoS window  ['NNP', 'NNP', ':', 'NN', 'NNP', ',', 'NNPS', 'IN', 'RB']
Caps window ['upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'lowercase', 'upperInitial']


In [12]:
# when gold is a NER tag, but model think it is "O"
report_obj.print_idxlist_to_textlists(idx_list=report_obj.missed_ner_idx, k=5, return_indices=False)

indices counts = 426
ranked by worst cross-entropy loss
top 5 results

ID 40660
KL divergence 8.691455841064453
FEATURES:   "i", PRP, allCaps
Gold NER    ['I-PER']
Pred NER    ['O']
Text window ['-', 'belgian', 'king', '<unk>', 'i', 'born', '.', '</s>', '</s>']
PoS window  [':', 'JJ', 'NNP', 'NNP', 'PRP', 'VBN', '.', '</s>', '</s>']
Caps window ['noinfo', 'upperInitial', 'upperInitial', 'upperInitial', 'allCaps', 'lowercase', 'noinfo', '</s>', '</s>']

ID 45700
KL divergence 7.806224346160889
FEATURES:   "than", IN, upperInitial
Gold NER    ['I-PER']
Pred NER    ['O']
Text window ['<s>', '<s>', '<s>', '<unk>', 'than', ',', 'an', 'elected', 'member']
PoS window  ['<s>', '<s>', '<s>', 'NNP', 'IN', ',', 'DT', 'VBN', 'NN']
Caps window ['<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'noinfo', 'lowercase', 'lowercase', 'lowercase']

ID 1517
KL divergence 7.519722938537598
FEATURES:   """, ", noinfo
Gold NER    ['I-PER']
Pred NER    ['O']
Text window ['<unk>', '<unk>', 'legend', 'robert

In [6]:
# when both model and gold indicate a NER tag, and the tags matches
report_obj.print_idxlist_to_textlists(idx_list=report_obj.match_ner_idx, k=2, return_indices=False, worst=False)

indices counts = 7454
ranked by best cross-entropy loss
top 2 results

ID 35580
FEATURES:   "naoko", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['(', 'switzerland', ')', 'beat', 'naoko', 'kijimuta', '(', 'japan', ')']
PoS window  ['(', 'NNP', ')', 'VB', 'NNP', 'NNP', '(', 'NNP', ')']
Caps window ['noinfo', 'upperInitial', 'noinfo', 'lowercase', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo']

ID 5975
FEATURES:   "sigurd", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', '<s>', 'DG.', 'sigurd', 'njerve', '(', 'norway', ')']
PoS window  ['<s>', '<s>', '<s>', 'NNP', 'NNP', 'NNP', '(', 'NNP', ')']
Caps window ['<s>', '<s>', '<s>', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo']


In [7]:
# when both model and gold indicate a NER tag, and the tags mismatch
report_obj.print_idxlist_to_textlists(idx_list=report_obj.mismatch_ner_idx, k=2, return_indices=False)

indices counts = 723
ranked by worst cross-entropy loss
top 2 results

ID 21875
KL divergence 6.258211135864258
FEATURES:   "j.j.", NNP, noinfo
Gold NER    ['I-ORG']
Pred NER    ['I-PER']
Text window ['competitive', 'pre-sale', 'contributed', 'by', 'j.j.', 'kenny', 'k-sheets', ':', '</s>']
PoS window  ['JJ', 'NN', 'NNP', 'NNP', 'NNP', 'NNP', 'NNS', ':', '</s>']
Caps window ['allCaps', 'noinfo', 'allCaps', 'allCaps', 'noinfo', 'allCaps', 'noinfo', 'noinfo', '</s>']

ID 32032
KL divergence 5.501535415649414
FEATURES:   "<unk>", VB, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-PER']
Text window ['(', 'u.s.', ')', ',', '<unk>', '<unk>', ',', 'DGDG.DGDGDG', '</s>']
PoS window  ['(', 'NNP', ')', ',', 'VB', 'NNP', ',', 'CD', '</s>']
Caps window ['noinfo', 'noinfo', 'noinfo', 'noinfo', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', '</s>']


In [15]:
report_obj.gold_pred_idx_dict

defaultdict(<function evaluation_helper.EvalDev_Report.get_gold_pred_idx_dict.<locals>.<lambda>>,
            {3: defaultdict(list,
                         {3: array([    0,     1,     3, ..., 51357, 51358, 51361]),
                          4: array([ 3768,  4673,  7027,  8687,  9358, 11144, 11393, 11571, 12038,
                                 12877, 13964, 14422, 14693, 17451, 18681, 19431, 20193, 21395,
                                 21735, 23921, 25695, 25829, 28205, 28828, 32072, 32344, 34293,
                                 40273, 40333, 40395, 40434, 40445, 40454, 40580, 40616, 40749,
                                 48087, 48116, 48286, 48287]),
                          5: array([  685,  1506,  2993,  4577,  6147,  7489,  7581,  7607,  8188,
                                  9402,  9406,  9436,  9457, 12996, 12997, 13000, 13154, 13169,
                                 14095, 14475, 15786, 17165, 17431, 20124, 20175, 20212, 20660,
                                 20715, 20

In [17]:
# the most unsure predictions by gold tag
# the most common mistake

In [36]:
report_obj.gold_pred_idx_dict

defaultdict(<function evaluation_helper.EvalDev_Report.get_gold_pred_idx_dict.<locals>.<lambda>>,
            {3: defaultdict(list,
                         {3: array([    0,     1,     3, ..., 51357, 51358, 51361]),
                          4: array([ 3768,  4673,  7027,  8687,  9358, 11144, 11393, 11571, 12038,
                                 12877, 13964, 14422, 14693, 17451, 18681, 19431, 20193, 21395,
                                 21735, 23921, 25695, 25829, 28205, 28828, 32072, 32344, 34293,
                                 40273, 40333, 40395, 40434, 40445, 40454, 40580, 40616, 40749,
                                 48087, 48116, 48286, 48287]),
                          5: array([  685,  1506,  2993,  4577,  6147,  7489,  7581,  7607,  8188,
                                  9402,  9406,  9436,  9457, 12996, 12997, 13000, 13154, 13169,
                                 14095, 14475, 15786, 17165, 17431, 20124, 20175, 20212, 20660,
                                 20715, 20

In [34]:
report_obj.gold_to_pred_counts
report_obj.gold_pred_idx_dict
# get idx out of it


AttributeError: 'EvalDev_Report' object has no attribute 'gold_to_pred_counts'

In [56]:
self = report_obj
from collections import Counter, defaultdict

idx_dict = defaultdict(list)

for gold in self.gold_pred_idx_dict:
    idx_dict[gold] = np.hstack(self.gold_pred_idx_dict[gold].values())


    

In [59]:
idx_dict

defaultdict(list,
            {3: array([    0,     1,     3, ..., 46146, 46779, 49426]),
             4: array([  224,   642,  1517, ..., 50977, 38512, 40185]),
             5: array([  712,   713,   788, ..., 46026, 47373, 50687]),
             6: array([  545,   783,   839, ..., 44993, 45121, 47377]),
             7: array([  350,   421,  1239, ..., 50771, 50774, 51231]),
             8: array([42100, 26895, 42654, 50090]),
             9: array([], dtype=int64),
             10: array([], dtype=int64)})

In [50]:
np.hstack(self.gold_pred_idx_dict[4].values()).shape

(3149,)

## Ignore this section

In [10]:
# !
# Get decoder Y -- 50 dim embedding of center word

train_decoderY = embedding_matrix[trainX[:,4]]
dev_decoderY = embedding_matrix[devX[:,4]]
test_decoderY = embedding_matrix[testX[:,4]]

In [11]:
# Get X pos tags

# encoding 1-hot for pos tags
trainX_pos_cat = to_categorical(trainX_pos.astype('float32'))
devX_pos_cat = to_categorical(devX_pos.astype('float32'), num_classes=trainX_pos_cat.shape[2]) 
testX_pos_cat = to_categorical(testX_pos.astype('float32'), num_classes=trainX_pos_cat.shape[2])

trainX_pos_cat = np.array(list(map( lambda i: np.array(i[:,3:], dtype=np.float), trainX_pos_cat)), dtype=np.float)
devX_pos_cat = np.array(list(map( lambda i: np.array(i[:,3:], dtype=np.float), devX_pos_cat)), dtype=np.float)
testX_pos_cat = np.array(list(map( lambda i: np.array(i[:,3:], dtype=np.float), testX_pos_cat)), dtype=np.float)

In [12]:
# Get X capitlization 

# encoding 1-hot for capitalization info  ("allCaps", "upperInitial", "lowercase", "mixedCaps", "noinfo")
trainX_capitals_cat = to_categorical(trainX_capitals.astype('float32'))
devX_capitals_cat = to_categorical(devX_capitals.astype('float32'), num_classes=trainX_capitals_cat.shape[2]) 
testX_capitals_cat = to_categorical(testX_capitals.astype('float32'), num_classes=trainX_capitals_cat.shape[2])

trainX_capitals_cat = np.array(list(map( lambda i: np.array(i[:,3:], dtype=np.float), trainX_capitals_cat)), dtype=np.float)
devX_capitals_cat = np.array(list(map( lambda i: np.array(i[:,3:], dtype=np.float), devX_capitals_cat)), dtype=np.float)
testX_capitals_cat = np.array(list(map( lambda i: np.array(i[:,3:], dtype=np.float), testX_capitals_cat)), dtype=np.float)