## Load this section first

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from importlib import reload
import evaluation_helper

from loadutils import conll2003Data, loadDevPredictionsData

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# CoNLL-2003 English splits (relative paths): train, dev (testa), test (testb).
TRAIN_FILE, DEV_FILE, TEST_FILE = (
    "../data/conll2003/eng.train",
    "../data/conll2003/eng.testa",
    "../data/conll2003/eng.testb",
)

In [183]:
# Vocabulary size cap and context-window width used for every split below.
global_max_features = 20000
windowLength = 9

# The vocabulary is built from the training file only; dev and test data are
# formatted through the same vocabData object.
vocabData = conll2003Data(TRAIN_FILE)
vocabData.buildVocab(vocabSize=global_max_features)

# Keyword arguments shared by all three formatWindowedData calls.
_window_kwargs = dict(windowLength=windowLength, verbose=False)

# Training split: uses the sentences held on vocabData.train_sentences.
trainX, trainX_pos, trainX_capitals, trainY = vocabData.formatWindowedData(
    vocabData.train_sentences, **_window_kwargs)

# Dev split: read the file, then window it the same way.
devSents = vocabData.readFile(DEV_FILE)
devX, devX_pos, devX_capitals, devY = vocabData.formatWindowedData(
    devSents, **_window_kwargs)

# Test split: read the file, then window it the same way.
testSents = vocabData.readFile(TEST_FILE)
testX, testX_pos, testX_capitals, testY = vocabData.formatWindowedData(
    testSents, **_window_kwargs)

----------------------------------------------------
reading file from path ../data/conll2003/eng.train
'readFile'  1004.59 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  785.00 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  1384.53 ms
----------------------------------------------------
reading file from path ../data/conll2003/eng.testa
'readFile'  195.68 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  254.44 ms
----------------------------------------------------
reading file from path ../data/conll2003/eng.testb
'readFile'  183.70 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  347.89 ms


## Demo  - Quick access

In [83]:
# Quick F1 lookup for saved models, scored against the gold dev labels.
# Reload first so that edits to evaluation_helper.py are picked up, then
# re-bind the two helpers from the reloaded module.
reload(evaluation_helper)
from evaluation_helper import get_f1_by_modelName, compare_models_by_f1

modelName1 = 'encoder_2e_withsaving_again'
modelName_list = [modelName1, 'encoder_2e_withsaving_pos']

# Prints the models ranked by F1 (see recorded output).
compare_models_by_f1(modelName_list, y_true=devY, return_results=False)

print("------------------------------------------------------------")

# Last expression of the cell: the F1 of a single model is displayed.
get_f1_by_modelName(modelName1, y_true=devY)


rank 1
modelName: encoder_2e_withsaving_pos
f1= 0.8859607657764123

rank 2
modelName: encoder_2e_withsaving_again
f1= 0.880151139449758
------------------------------------------------------------


0.880151139449758

## Demo  - report class object

In [228]:
# Re-execute evaluation_helper so the latest edits to the module are used,
# then re-bind EvalDev_Report from the freshly reloaded module.
reload(evaluation_helper)
from evaluation_helper import EvalDev_Report

In [5]:
# Load the saved dev-set predictions of one trained model.
modelName = 'encoder_2e_withsaving_again'
dev_raw_y_pred, dev_raw_y_pred_decoder_embeddings, dev_y_pred = loadDevPredictionsData(modelName)

# Sanity check: print the shape of the gold labels and of each loaded array.
for _label, _array in (
    ("devY", devY),                      # gold labels
    ("dev_raw_y_pred", dev_raw_y_pred),  # raw predictions made by a trained model on dev set
    ("dev_y_pred", dev_y_pred),          # dev prediction labels
    # decoder on: decoder dev predictions; decoder off: empty
    ("dev_raw_y_pred_decoder_embeddings", dev_raw_y_pred_decoder_embeddings),
):
    print(_label, _array.shape)

devY (51362,)
dev_raw_y_pred (51362, 8)
dev_y_pred (51362,)
dev_raw_y_pred_decoder_embeddings (51362, 50)


In [229]:
%%capture
# Construct the report object for `modelName` from the gold dev labels and the
# loaded predictions, then attach the vocabulary object and the dev inputs.
report_obj = evaluation_helper.EvalDev_Report(modelName=modelName, y_true=devY, raw_y_pred=dev_raw_y_pred, y_pred=dev_y_pred) 
report_obj.connect_to_dataClass(vocabData)
report_obj.connect_to_devData(devData=(devX, devX_pos, devX_capitals, devY))

# Just FYI -- can be ignored. Reference list of attributes available on the
# report; nothing is displayed because these are bare expressions and the
# cell runs under %%capture.
report_obj.recall
report_obj.precision
report_obj.f1
report_obj.gold_cts # gold label distribution
report_obj.pred_cts # predicted label distribution
# when gold is "O", but the model thinks there is a NER tag
report_obj.hallucination_idx 
# when gold is a NER tag, but the model thinks it is "O"
report_obj.missed_ner_idx
# when both model and gold indicate a NER tag, and the tags match
report_obj.match_ner_idx
# when both model and gold indicate a NER tag, and the tags mismatch
report_obj.mismatch_ner_idx 
# dictionary[gold_label][prediction_label] --> data indices
report_obj.gold_pred_idx_dict 
# dictionary[gold_label][prediction_label] --> count
report_obj.gold_pred_ct_dict 

In [230]:
# Print the model name, precision / recall / F1 and the gold vs. predicted
# NER label counts.
report_obj.print_brief_summary()

Model     encoder_2e_withsaving_again
Precision 0.8943011397720456
Recall    0.8664419388585377
f1 score  0.880151139449758

Gold NER label counts:
42759 : ['O'] 3
2092 : ['I-ORG'] 5
2094 : ['I-LOC'] 6
1264 : ['I-MISC'] 7
3149 : ['I-PER'] 4
4 : ['B-MISC'] 8

Predicted NER label counts:
43027 : ['O'] 3
2226 : ['I-LOC'] 6
1005 : ['I-MISC'] 7
3241 : ['I-PER'] 4
1863 : ['I-ORG'] 5


In [198]:
# When gold is "O", but the model thinks there is a NER tag.
# NOTE(review): the recorded output of this call ends with
# "NameError: name 'return_list' is not defined" -- presumably raised inside
# print_idxlist_to_textlists when return_indices=False; fix in evaluation_helper.
# NOTE(review): in the recorded output the FEATURES line shows the token at
# window position 5, while the gold tag appears to belong to position 4
# (e.g. ID 11: 'london' is the I-LOC token, FEATURES shows the token after it)
# -- possible off-by-one in the helper; confirm.
report_obj.print_idxlist_to_textlists(idx_list=report_obj.hallucination_idx, return_indices=False)

indices counts = 158

ID 685
FEATURES:   ".", ., noinfo
Gold NER    ['O']
Pred NER    ['I-ORG']
Text window ['cricket', '-', 'DGDGDGDG', '<unk>', '<unk>', '.', '</s>', '</s>', '</s>']
PoS window  ['NNP', ':', 'CD', 'NNP', 'NNP', '.', '</s>', '</s>', '</s>']
Caps window ['allCaps', 'noinfo', 'noinfo', 'allCaps', 'allCaps', 'noinfo', '</s>', '</s>', '</s>']

ID 1270
FEATURES:   "tournament", NNP, allCaps
Gold NER    ['O']
Pred NER    ['I-MISC']
Text window ['<s>', '<s>', 'basketball', '-', 'international', 'tournament', 'result', '.', '</s>']
PoS window  ['<s>', '<s>', 'NNP', ':', 'NNP', 'NNP', 'NNP', '.', '</s>']
Caps window ['<s>', '<s>', 'allCaps', 'noinfo', 'allCaps', 'allCaps', 'allCaps', 'noinfo', '</s>']

ID 1271
FEATURES:   "result", NNP, allCaps
Gold NER    ['O']
Pred NER    ['I-MISC']
Text window ['<s>', 'basketball', '-', 'international', 'tournament', 'result', '.', '</s>', '</s>']
PoS window  ['<s>', 'NNP', ':', 'NNP', 'NNP', 'NNP', '.', '</s>', '</s>']
Caps window ['<s>', '

NameError: name 'return_list' is not defined

In [199]:
# When gold is a NER tag, but the model thinks it is "O".
# NOTE(review): the recorded output of this call ends with
# "NameError: name 'return_list' is not defined" -- presumably raised inside
# print_idxlist_to_textlists when return_indices=False; fix in evaluation_helper.
report_obj.print_idxlist_to_textlists(idx_list=report_obj.missed_ner_idx, return_indices=False)

indices counts = 426

ID 224
FEATURES:   "had", VBD, lowercase
Gold NER    ['I-PER']
Pred NER    ['O']
Text window ['<unk>', 'advantage', 'but', 'off-spinner', 'such', 'had', '<unk>', 'their', 'hopes']
PoS window  ['JJ', 'NN', 'CC', 'JJ', 'JJ', 'VBD', 'VBN', 'PRP$', 'NNS']
Caps window ['noinfo', 'lowercase', 'lowercase', 'noinfo', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 350
FEATURES:   "tom", NNP, upperInitial
Gold NER    ['I-MISC']
Pred NER    ['O']
Text window ['<s>', '<s>', '<s>', '<s>', 'australian', 'tom', 'moody', 'took', 'six']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', 'NNP', 'VBD', 'CD']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase']

ID 421
FEATURES:   "fast", JJ, lowercase
Gold NER    ['I-MISC']
Pred NER    ['O']
Text window ['from', 'paul', 'johnson', 'but', '<unk>', 'fast', 'bowler', 'martin', '<unk>']
PoS window  ['IN', 'NNP', 'NNP', 'CC', 'JJ', 'JJ', 'NN', 'NNP

Text window ['open', 'on', 'road', 'for', 'DGDGDGDG', 'fed', 'cup', '.', '</s>']
PoS window  ['IN', 'IN', 'NNP', 'IN', 'CD', 'NNP', 'NNP', '.', '</s>']
Caps window ['allCaps', 'allCaps', 'allCaps', 'allCaps', 'noinfo', 'allCaps', 'allCaps', 'noinfo', '</s>']

ID 35107
FEATURES:   "cup", NNP, allCaps
Gold NER    ['I-MISC']
Pred NER    ['O']
Text window ['on', 'road', 'for', 'DGDGDGDG', 'fed', 'cup', '.', '</s>', '</s>']
PoS window  ['IN', 'NNP', 'IN', 'CD', 'NNP', 'NNP', '.', '</s>', '</s>']
Caps window ['allCaps', 'allCaps', 'allCaps', 'noinfo', 'allCaps', 'allCaps', 'noinfo', '</s>', '</s>']

ID 35397
FEATURES:   "to", TO, lowercase
Gold NER    ['I-PER']
Pred NER    ['O']
Text window ['joe', 'fernandez', 'have', 'forced', 'king', 'to', 'take', 'a', 'wait']
PoS window  ['NNP', 'NNP', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN']
Caps window ['upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 35629
FEATURES:  

NameError: name 'return_list' is not defined

In [200]:
# When both model and gold indicate a NER tag, and the tags match.
# NOTE(review): this prints one record per index (7454 in the recorded run) and
# the recorded output ends with "NameError: name 'return_list' is not defined"
# -- presumably raised inside print_idxlist_to_textlists when
# return_indices=False; fix in evaluation_helper.
report_obj.print_idxlist_to_textlists(idx_list=report_obj.match_ner_idx, return_indices=False)

indices counts = 7454

ID 11
FEATURES:   "DGDGDGDG-DGDG-DGDG", CD, noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'london', 'DGDGDGDG-DGDG-DGDG', '</s>', '</s>', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'CD', '</s>', '</s>', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'allCaps', 'noinfo', '</s>', '</s>', '</s>']

ID 14
FEATURES:   "all-rounder", NN, noinfo
Gold NER    ['I-MISC']
Pred NER    ['I-MISC']
Text window ['<s>', '<s>', '<s>', 'west', 'indian', 'all-rounder', 'phil', 'simmons', 'took']
PoS window  ['<s>', '<s>', '<s>', 'NNP', 'NNP', 'NN', 'NNP', 'NNP', 'VBD']
Caps window ['<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'lowercase']

ID 16
FEATURES:   "simmons", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', 'west', 'indian', 'all-rounder', 'phil', 'simmons', 'took', 'four', 'for']
PoS window  ['<s>', 'NNP', 'NNP', 'NN', 'NNP', 'NNP',

PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', ':', 'NNP', 'NNP', '(']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo']

ID 1332
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', 'romania', '-', '<unk>', '<unk>', '(', '31st', ')']
PoS window  ['<s>', '<s>', 'NNP', ':', 'NNP', 'NNP', '(', 'CD', ')']
Caps window ['<s>', '<s>', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo']

ID 1333
FEATURES:   "(", (, noinfo
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', 'romania', '-', '<unk>', '<unk>', '(', '31st', ')', ',']
PoS window  ['<s>', 'NNP', ':', 'NNP', 'NNP', '(', 'CD', ')', ',']
Caps window ['<s>', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'noinfo']

ID 1338
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['(', '31st', ')',

PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', 'CD', 'CD', 'CD']
Caps window ['<s>', '<s>', '<s>', '<s>', 'allCaps', 'allCaps', 'noinfo', 'noinfo', 'noinfo']

ID 2874
FEATURES:   "DGDG", CD, noinfo
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', '<s>', 'san', 'francisco', 'DGDG', 'DGDG', '.DGDGDG', 'DGDG']
PoS window  ['<s>', '<s>', '<s>', 'NNP', 'NNP', 'CD', 'CD', 'CD', 'CD']
Caps window ['<s>', '<s>', '<s>', 'allCaps', 'allCaps', 'noinfo', 'noinfo', 'noinfo', 'noinfo']

ID 2884
FEATURES:   "at", NNP, allCaps
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', '<s>', '<s>', 'atlanta', 'at', 'chicago', '</s>', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', 'NNP', '</s>', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'allCaps', 'allCaps', 'allCaps', '</s>', '</s>']

ID 2886
FEATURES:   "</s>", </s>, </s>
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', 'atlanta', 'at', 'chicago', '</s>', '</s>', '

ID 5282
FEATURES:   "(", (, noinfo
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', 'DG.', '<unk>', '<unk>', '(', 'burundi', ')', 'DG:DGDG.DGDG']
PoS window  ['<s>', '<s>', 'CD', 'NNP', 'NNP', '(', 'NNP', ')', 'CD']
Caps window ['<s>', '<s>', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo', 'noinfo']

ID 5284
FEATURES:   ")", ), noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['DG.', '<unk>', '<unk>', '(', 'burundi', ')', 'DG:DGDG.DGDG', '</s>', '</s>']
PoS window  ['CD', 'NNP', 'NNP', '(', 'NNP', ')', 'CD', '</s>', '</s>']
Caps window ['noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo', 'noinfo', '</s>', '</s>']

ID 5288
FEATURES:   "tanui", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', '<s>', 'DG.', 'william', 'tanui', '(', 'kenya', ')']
PoS window  ['<s>', '<s>', '<s>', 'NNP', 'NNP', 'NNP', '(', 'NNP', ')']
Caps window ['<s>', '<s>', '<s>', 'noinfo',

PoS window  ['NN', 'IN', 'NNP', 'CC', 'NNP', 'NNP', 'IN', '</s>', '</s>']
Caps window ['lowercase', 'lowercase', 'upperInitial', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', '</s>', '</s>']

ID 6793
FEATURES:   "on", IN, lowercase
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['between', 'australia', 'and', 'sri', 'lanka', 'on', '</s>', '</s>', '</s>']
PoS window  ['IN', 'NNP', 'CC', 'NNP', 'NNP', 'IN', '</s>', '</s>', '</s>']
Caps window ['lowercase', 'upperInitial', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', '</s>', '</s>', '</s>']

ID 6797
FEATURES:   "</s>", </s>, </s>
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'australia', '</s>', '</s>', '</s>', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', '</s>', '</s>', '</s>', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', '</s>', '</s>', '</s>', '</s>']

ID 6798
FEATURES:   "waugh", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    [

Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['townsend', ',', 'quinn', ',', "o'neill", '.', '</s>', '</s>', '</s>']
PoS window  ['NNP', ',', 'NNP', ',', 'NNP', '.', '</s>', '</s>', '</s>']
Caps window ['upperInitial', 'noinfo', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', '</s>', '</s>', '</s>']

ID 10154
FEATURES:   "newsroom", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', '<s>', '--', 'dublin', 'newsroom', '+DGDGDG', 'DG', 'DGDGDG']
PoS window  ['<s>', '<s>', '<s>', ':', 'NNP', 'NNP', 'CD', 'CD', 'CD']
Caps window ['<s>', '<s>', '<s>', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo']

ID 10155
FEATURES:   "+DGDGDG", CD, noinfo
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', '--', 'dublin', 'newsroom', '+DGDGDG', 'DG', 'DGDGDG', 'DGDGDGDG']
PoS window  ['<s>', '<s>', ':', 'NNP', 'NNP', 'CD', 'CD', 'CD', 'CD']
Caps window ['<s>', '<s>', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo

PoS window  ['CD', 'NN', 'IN', 'DT', 'NNP', 'NNP', 'NN', 'IN', 'NNP']
Caps window ['noinfo', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'upperInitial']

ID 14600
FEATURES:   ",", ,, noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<unk>', 'am', 'flight', 'over', '<unk>', ',', 'scotland', '.', '</s>']
PoS window  ['NNP', 'NNP', 'NN', 'IN', 'NNP', ',', 'NNP', '.', '</s>']
Caps window ['upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo', '</s>']

ID 14602
FEATURES:   ".", ., noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['flight', 'over', '<unk>', ',', 'scotland', '.', '</s>', '</s>', '</s>']
PoS window  ['NN', 'IN', 'NNP', ',', 'NNP', '.', '</s>', '</s>', '</s>']
Caps window ['lowercase', 'lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo', '</s>', '</s>', '</s>']

ID 14625
FEATURES:   "airport", NN, lowercase
Gold NER    ['I-LOC']
Pr

Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['france', 'expels', 'african', ',', 'air', 'france', 'unions', 'protest', '.']
PoS window  ['NNP', 'VBZ', 'JJ', ',', 'NNP', 'NNP', 'NNS', 'NN', '.']
Caps window ['upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'noinfo']

ID 19450
FEATURES:   "DGDGDGDG-DGDG-DGDG", CD, noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'paris', 'DGDGDGDG-DGDG-DGDG', '</s>', '</s>', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'CD', '</s>', '</s>', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'allCaps', 'noinfo', '</s>', '</s>', '</s>']

ID 19452
FEATURES:   "on", IN, lowercase
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'france', 'on', 'friday', 'expelled', 'another']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'IN', 'NNP', 'VBD', 'DT']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperIn

Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['europe', 'and', 'the', 'far', 'east', '.', '</s>', '</s>', '</s>']
PoS window  ['NNP', 'CC', 'DT', 'NNP', 'NNP', '.', '</s>', '</s>', '</s>']
Caps window ['upperInitial', 'lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'noinfo', '</s>', '</s>', '</s>']

ID 23574
FEATURES:   "pacific", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', '<s>', '<s>', '<unk>', 'pacific', 'is', 'a', 'major']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', 'VBZ', 'DT', 'JJ']
Caps window ['<s>', '<s>', '<s>', '<s>', 'allCaps', 'upperInitial', 'lowercase', 'lowercase', 'lowercase']

ID 23612
FEATURES:   ".", ., noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['industrial', 'joint', '<unk>', 'in', 'china', '.', '</s>', '</s>', '</s>']
PoS window  ['JJ', 'JJ', 'NNS', 'IN', 'NNP', '.', '</s>', '</s>', '</s>']
Caps window ['lowercase', 'lowercase', 'lowercase', 'lowercase', 'upperInitial

ID 27233
FEATURES:   ",", ,, noinfo
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['minutes', 'through', 'captain', 'andy', 'townsend', ',', '20-year-old', 'norwich', 'striker']
PoS window  ['NNS', 'IN', 'NN', 'NNP', 'NNP', ',', 'JJ', 'NNP', 'NN']
Caps window ['lowercase', 'lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'upperInitial', 'lowercase']

ID 27236
FEATURES:   "striker", NN, lowercase
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['andy', 'townsend', ',', '20-year-old', 'norwich', 'striker', 'keith', "o'neill", ',']
PoS window  ['NNP', 'NNP', ',', 'JJ', 'NNP', 'NN', 'NNP', 'NNP', ',']
Caps window ['upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'noinfo']

ID 27238
FEATURES:   "o'neill", NNP, noinfo
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window [',', '20-year-old', 'norwich', 'striker', 'keith', "o'neill", ',', 'sunderland', 'forward']
PoS window  [',', 'JJ', 

Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['(', 'france', ')', 'beat', 'ainars', 'kiksis', '(', 'latvia', ')']
PoS window  ['(', 'NNP', ')', 'VB', 'NNPS', 'NNP', '(', 'NNP', ')']
Caps window ['noinfo', 'upperInitial', 'noinfo', 'lowercase', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo']

ID 28550
FEATURES:   "(", (, noinfo
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['france', ')', 'beat', 'ainars', 'kiksis', '(', 'latvia', ')', 'DG-DG']
PoS window  ['NNP', ')', 'VB', 'NNPS', 'NNP', '(', 'NNP', ')', 'CD']
Caps window ['upperInitial', 'noinfo', 'lowercase', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo', 'noinfo']

ID 28552
FEATURES:   ")", ), noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['beat', 'ainars', 'kiksis', '(', 'latvia', ')', 'DG-DG', '</s>', '</s>']
PoS window  ['VB', 'NNPS', 'NNP', '(', 'NNP', ')', 'CD', '</s>', '</s>']
Caps window ['lowercase', 'upperInitial', 'upperInitial', 'noinfo', 'upp

FEATURES:   ")", ), noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['frank', 'nobilo', '(', 'new', 'zealand', ')', 'DGDGDGDGDGDG', '</s>', '</s>']
PoS window  ['NNP', 'NNP', '(', 'NNP', 'NNP', ')', 'CD', '</s>', '</s>']
Caps window ['upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', '</s>', '</s>']

ID 30012
FEATURES:   "mcginley", NNP, mixedCaps
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', '<s>', 'DGDG.', 'paul', 'mcginley', '(', 'ireland', ')']
PoS window  ['<s>', '<s>', '<s>', 'CD', 'NNP', 'NNP', '(', 'NNP', ')']
Caps window ['<s>', '<s>', '<s>', 'noinfo', 'upperInitial', 'mixedCaps', 'noinfo', 'upperInitial', 'noinfo']

ID 30013
FEATURES:   "(", (, noinfo
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', 'DGDG.', 'paul', 'mcginley', '(', 'ireland', ')', 'DGDGDGDGDGDG']
PoS window  ['<s>', '<s>', 'CD', 'NNP', 'NNP', '(', 'NNP', ')', 'CD']
Caps window ['<s>', '<s>', 'noinfo', 'up

Caps window ['upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo', 'noinfo', 'upperInitial', 'upperInitial']

ID 32118
FEATURES:   "beat", NN, allCaps
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', 'soccer', '-', 'canada', 'beat', 'panama', 'DG-DG', 'in']
PoS window  ['<s>', '<s>', 'NN', ':', 'NNP', 'NN', 'NNP', 'CD', 'IN']
Caps window ['<s>', '<s>', 'allCaps', 'noinfo', 'allCaps', 'allCaps', 'allCaps', 'noinfo', 'allCaps']

ID 32120
FEATURES:   "DG-DG", CD, noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['soccer', '-', 'canada', 'beat', 'panama', 'DG-DG', 'in', 'world', 'cup']
PoS window  ['NN', ':', 'NNP', 'NN', 'NNP', 'CD', 'IN', 'NN', 'RP']
Caps window ['allCaps', 'noinfo', 'allCaps', 'allCaps', 'allCaps', 'noinfo', 'allCaps', 'allCaps', 'allCaps']

ID 32123
FEATURES:   "cup", RP, allCaps
Gold NER    ['I-MISC']
Pred NER    ['I-MISC']
Text window ['beat', 'panama', 'DG-DG', 'in', 'world', 'cup', 'qualifier', '.', '</

Pred NER    ['I-MISC']
Text window ['semifinals', 'of', 'the', 'portuguese', 'open', 'in', 'DGDGDGDG', ',', 'said']
PoS window  ['NNS', 'IN', 'DT', 'NNP', 'NNP', 'IN', 'CD', ',', 'VBD']
Caps window ['lowercase', 'lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', 'noinfo', 'noinfo', 'lowercase']

ID 34456
FEATURES:   "but", CC, lowercase
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['i', 'have', 'nothing', 'against', 'jansher', 'but', 'it', 'will', 'be']
PoS window  ['PRP', 'VBP', 'NN', 'IN', 'NNP', 'CC', 'PRP', 'MD', 'VB']
Caps window ['allCaps', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 34470
FEATURES:   ".", ., noinfo
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['him', ',', '"', 'said', 'eyles', '.', '"', '</s>', '</s>']
PoS window  ['PRP', ',', '"', 'VBD', 'NNPS', '.', '"', '</s>', '</s>']
Caps window ['lowercase', 'noinfo', 'noinfo', 'lowercase', 'upperInitial', 'noinfo',

FEATURES:   "for", IN, lowercase
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['but', 'struck', 'out', 'brent', '<unk>', 'for', 'his', '31st', 'save']
PoS window  ['CC', 'VBD', 'RP', 'NNP', 'NNP', 'IN', 'PRP$', 'JJ', 'VB']
Caps window ['lowercase', 'lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'noinfo', 'lowercase']

ID 36944
FEATURES:   "'", POS, noinfo
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['the', 'loss', 'was', 'the', 'mets', "'", 'eighth', 'straight', ',']
PoS window  ['DT', 'NN', 'VBD', 'DT', 'NNPS', 'POS', 'JJ', 'JJ', ',']
Caps window ['upperInitial', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'noinfo', 'lowercase', 'lowercase', 'noinfo']

ID 36964
FEATURES:   "valentine", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['DG-DG', 'under', 'new', 'manager', 'bobby', 'valentine', '.', '</s>', '</s>']
PoS window  ['NN', 'IN', 'JJ', 'NN', 'NNP', 'NNP', '.', '</s>', '</s>']
Caps wi

Text window [',', 'DG', '-', '<unk>', '<unk>', '(', 'DGDG', '-', '<unk>']
PoS window  [',', 'CD', ':', 'NNP', 'NNP', '(', 'CD', ':', 'NNP']
Caps window ['noinfo', 'noinfo', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'upperInitial']

ID 39086
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<unk>', '(', 'DGDG', '-', '<unk>', '<unk>', ',', '57th', ')']
PoS window  ['NNP', '(', 'CD', ':', 'NNP', 'NNP', ',', 'NNP', ')']
Caps window ['upperInitial', 'noinfo', 'noinfo', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo']

ID 39087
FEATURES:   ",", ,, noinfo
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['(', 'DGDG', '-', '<unk>', '<unk>', ',', '57th', ')', ',']
PoS window  ['(', 'CD', ':', 'NNP', 'NNP', ',', 'NNP', ')', ',']
Caps window ['noinfo', 'noinfo', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'noinfo']

ID 39094
FEATURES:   "<unk>", NNP, upperInitial


Caps window ['lowercase', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 41845
FEATURES:   "troop", NN, lowercase
Gold NER    ['I-MISC']
Pred NER    ['I-MISC']
Text window ['an', 'escalating', 'crisis', 'over', 'iraqi', 'troop', 'movements', 'in', 'northern']
PoS window  ['DT', 'VBG', 'NN', 'IN', 'JJ', 'NN', 'NNS', 'IN', 'JJ']
Caps window ['lowercase', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 41850
FEATURES:   ".", ., noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['troop', 'movements', 'in', 'northern', 'iraq', '.', '</s>', '</s>', '</s>']
PoS window  ['NN', 'NNS', 'IN', 'JJ', 'NNP', '.', '</s>', '</s>', '</s>']
Caps window ['lowercase', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'noinfo', '</s>', '</s>', '</s>']

ID 41855
FEATURES:   "accused", VBD, lowercase
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', 'o

Pred NER    ['I-LOC']
Text window ['of', 'kim', '<unk>', 'to', 'north', 'korea', 'where', 'his', 'family']
PoS window  ['IN', 'NNP', 'NNP', 'TO', 'NNP', 'NNP', 'WRB', 'PRP$', 'NN']
Caps window ['lowercase', 'upperInitial', 'noinfo', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'lowercase']

ID 46017
FEATURES:   "where", WRB, lowercase
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['kim', '<unk>', 'to', 'north', 'korea', 'where', 'his', 'family', 'is']
PoS window  ['NNP', 'NNP', 'TO', 'NNP', 'NNP', 'WRB', 'PRP$', 'NN', 'VBZ']
Caps window ['upperInitial', 'noinfo', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 46027
FEATURES:   "cross", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window [',', '"', 'north', 'korean', 'red', 'cross', 'president', 'li', '<unk>']
PoS window  [',', '"', 'NNP', 'NNP', 'NNP', 'NNP', 'NN', 'NNP', 'NNP']
Caps window ['noinfo', 'noinfo', 'upperInitial'

Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['to', 'burundi', ',', 'howard', '<unk>', ',', 'and', 'the', 'sant']
PoS window  ['TO', 'NNP', ',', 'NNP', 'NNP', ',', 'CC', 'DT', 'NNP']
Caps window ['lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'lowercase', 'lowercase', 'upperInitial']

ID 50083
FEATURES:   "'", POS, noinfo
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<unk>', ',', 'and', 'the', 'sant', "'", '<unk>', 'community', ',']
PoS window  ['NNP', ',', 'CC', 'DT', 'NNP', 'POS', 'NNP', 'NNP', ',']
Caps window ['upperInitial', 'noinfo', 'lowercase', 'lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo']

ID 50085
FEATURES:   "community", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['and', 'the', 'sant', "'", '<unk>', 'community', ',', 'an', 'italian']
PoS window  ['CC', 'DT', 'NNP', 'POS', 'NNP', 'NNP', ',', 'DT', 'NNP']
Caps window ['lowercase', 'lowercase', 'upperInitia

NameError: name 'return_list' is not defined

In [201]:
# When both model and gold indicate a NER tag, and the tags mismatch.
# NOTE(review): the recorded output of this call ends with
# "NameError: name 'return_list' is not defined" -- presumably raised inside
# print_idxlist_to_textlists when return_indices=False; fix in evaluation_helper.
report_obj.print_idxlist_to_textlists(idx_list=report_obj.mismatch_ner_idx, return_indices=False)

indices counts = 723

ID 2
FEATURES:   "take", NNP, allCaps
Gold NER    ['I-ORG']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', 'cricket', '-', 'leicestershire', 'take', 'over', 'at', 'top']
PoS window  ['<s>', '<s>', 'NNP', ':', 'NNP', 'NNP', 'IN', 'NNP', 'NNP']
Caps window ['<s>', '<s>', 'allCaps', 'noinfo', 'allCaps', 'allCaps', 'allCaps', 'allCaps', 'allCaps']

ID 13
FEATURES:   "indian", NNP, upperInitial
Gold NER    ['I-MISC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'west', 'indian', 'all-rounder', 'phil', 'simmons']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', 'NN', 'NNP', 'NNP']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial']

ID 27
FEATURES:   "by", IN, lowercase
Gold NER    ['I-ORG']
Pred NER    ['I-LOC']
Text window ['friday', 'as', 'leicestershire', 'beat', 'somerset', 'by', 'an', 'innings', 'and']
PoS window  ['NNP', 'IN', 'NNP', 'VBD', 'NNP', 'IN', 'DT', 'NN', 'CC']
Caps

Text window ['republic', 'of', 'ireland', 'beat', 'liechtenstein', 'DG-DG', '(', 'halftime', 'DG-DG']
PoS window  ['NNP', 'IN', 'NNP', 'VBD', 'NNP', 'CD', '(', 'NN', 'JJ']
Caps window ['upperInitial', 'lowercase', 'upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'noinfo', 'lowercase', 'noinfo']

ID 27629
FEATURES:   "in", IN, allCaps
Gold NER    ['I-LOC']
Pred NER    ['I-ORG']
Text window ['soccer', '-', 'slovakia', 'beat', '<unk>', 'in', 'world', 'cup', 'qualifier']
PoS window  ['NN', ':', 'NNP', 'NN', 'NNS', 'IN', 'NN', 'RP', 'VBN']
Caps window ['allCaps', 'noinfo', 'allCaps', 'allCaps', 'allCaps', 'allCaps', 'allCaps', 'allCaps', 'allCaps']

ID 27780
FEATURES:   "</s>", </s>, </s>
Gold NER    ['I-MISC']
Pred NER    ['I-LOC']
Text window ['the', 'tour', 'of', 'the', 'netherlands', '</s>', '</s>', '</s>', '</s>']
PoS window  ['DT', 'NNP', 'IN', 'DT', 'NNP', '</s>', '</s>', '</s>', '</s>']
Caps window ['lowercase', 'upperInitial', 'lowercase', 'lowercase', 'upperInitial', '</s>',

NameError: name 'return_list' is not defined

In [28]:
# dictionary[gold_label][prediction_label] --> data indices
# NOTE(review): the recorded output comes from execution In [28] and lists
# indices above 200000, more than the 51362 dev rows reported for devY -- it
# looks stale; re-run after report_obj is constructed to confirm.
report_obj.gold_pred_idx_dict

defaultdict(<function evaluation_helper.EvalDev_Report.get_gold_pred_idx_dict.<locals>.<lambda>()>,
            {3: defaultdict(list,
                         {3: array([], dtype=int64),
                          4: array([], dtype=int64),
                          5: array([], dtype=int64),
                          6: array([     1,      3,      4, ..., 203615, 203616, 203618]),
                          7: array([    40,     44,     55, ..., 203525, 203535, 203575]),
                          8: array([     5,     30,     33, ..., 203592, 203594, 203605]),
                          9: array([   125,    148,    150, ..., 203482, 203533, 203602]),
                          10: array([   121,    131,    136, ..., 203566, 203584, 203620])}),
             4: defaultdict(list,
                         {3: array([], dtype=int64),
                          4: array([], dtype=int64),
                          5: array([], dtype=int64),
                          6: array([     9,     10,     

## Ignore this section

In [10]:
# Decoder targets: the embedding vector (50-dim per the original note) of the
# centre word of every input window.
# NOTE(review): `embedding_matrix` is not defined in any cell shown in this
# notebook -- it must be loaded before this cell can run on a fresh kernel.

# Centre column of a window. Derived from windowLength (9 -> 4) rather than
# hard-coded, so the targets stay aligned if the window size is changed.
center_idx = windowLength // 2

train_decoderY = embedding_matrix[trainX[:, center_idx]]
dev_decoderY = embedding_matrix[devX[:, center_idx]]
test_decoderY = embedding_matrix[testX[:, center_idx]]

In [11]:
# Get X pos tags

# One-hot encode the PoS tag ids of every window position. Dev and test reuse
# the class count of the training encoding so all three share one width.
# NOTE(review): `to_categorical` is not imported in any cell shown here
# (presumably keras.utils.to_categorical) -- import it before running.
trainX_pos_cat = to_categorical(trainX_pos.astype('float32'))
devX_pos_cat = to_categorical(devX_pos.astype('float32'), num_classes=trainX_pos_cat.shape[2])
testX_pos_cat = to_categorical(testX_pos.astype('float32'), num_classes=trainX_pos_cat.shape[2])

# Drop the first three one-hot columns (presumably the reserved ids -- TODO
# confirm against the vocabulary) and store the result as float64.
# One slice on the last axis replaces the per-row map/lambda rebuild, and the
# builtin `float` replaces `np.float`, which was removed in NumPy 1.24.
trainX_pos_cat = trainX_pos_cat[:, :, 3:].astype(float)
devX_pos_cat = devX_pos_cat[:, :, 3:].astype(float)
testX_pos_cat = testX_pos_cat[:, :, 3:].astype(float)

In [12]:
# Get X capitalization

# One-hot encode the capitalization ids ("allCaps", "upperInitial",
# "lowercase", "mixedCaps", "noinfo") of every window position. Dev and test
# reuse the class count of the training encoding so all three share one width.
# NOTE(review): `to_categorical` is not imported in any cell shown here
# (presumably keras.utils.to_categorical) -- import it before running.
trainX_capitals_cat = to_categorical(trainX_capitals.astype('float32'))
devX_capitals_cat = to_categorical(devX_capitals.astype('float32'), num_classes=trainX_capitals_cat.shape[2])
testX_capitals_cat = to_categorical(testX_capitals.astype('float32'), num_classes=trainX_capitals_cat.shape[2])

# Drop the first three one-hot columns (presumably the reserved ids -- TODO
# confirm against the vocabulary) and store the result as float64.
# One slice on the last axis replaces the per-row map/lambda rebuild, and the
# builtin `float` replaces `np.float`, which was removed in NumPy 1.24.
trainX_capitals_cat = trainX_capitals_cat[:, :, 3:].astype(float)
devX_capitals_cat = devX_capitals_cat[:, :, 3:].astype(float)
testX_capitals_cat = testX_capitals_cat[:, :, 3:].astype(float)