## Load this section first

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from importlib import reload
import evaluation_helper

from loadutils import conll2003Data, loadDevPredictionsData

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Relative paths to the CoNLL-2003 English data files used by this notebook.
TRAIN_FILE, DEV_FILE, TEST_FILE = (
    "../data/conll2003/eng.train",  # training split
    "../data/conll2003/eng.testa",  # development split
    "../data/conll2003/eng.testb",  # test split
)

In [183]:
# Notebook-wide settings: vocabulary size cap and width of each token window.
global_max_features = 20000
windowLength = 9

# The vocabulary is built from the training file only.
vocabData = conll2003Data(TRAIN_FILE)
vocabData.buildVocab(vocabSize=global_max_features)


def _to_windows(sentences):
    """Format `sentences` via `vocabData` with the notebook-wide `windowLength`, non-verbosely."""
    return vocabData.formatWindowedData(
        sentences, windowLength=windowLength, verbose=False
    )


# Training split: windowed inputs (words, PoS, capitalisation) plus labels.
trainX, trainX_pos, trainX_capitals, trainY = _to_windows(vocabData.train_sentences)

# Dev split: read the file with the same data object, then window it.
devSents = vocabData.readFile(DEV_FILE)
devX, devX_pos, devX_capitals, devY = _to_windows(devSents)

# Test split: same procedure as dev.
testSents = vocabData.readFile(TEST_FILE)
testX, testX_pos, testX_capitals, testY = _to_windows(testSents)

----------------------------------------------------
reading file from path ../data/conll2003/eng.train
'readFile'  1004.59 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  785.00 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  1384.53 ms
----------------------------------------------------
reading file from path ../data/conll2003/eng.testa
'readFile'  195.68 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  254.44 ms
----------------------------------------------------
reading file from path ../data/conll2003/eng.testb
'readFile'  183.70 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  347.89 ms


## Demo  - Quick access

In [83]:
# Quick F1 lookup.
# Re-execute the helper module, then re-import the two convenience functions
# so the names below point at the reloaded definitions.
reload(evaluation_helper)
from evaluation_helper import get_f1_by_modelName, compare_models_by_f1

# Rank the listed models by F1 against the dev gold labels (prints one entry per model).
modelName_list = [
    'encoder_2e_withsaving_again',
    'encoder_2e_withsaving_pos',
]
compare_models_by_f1(modelName_list, y_true=devY, return_results=False)

print("------------------------------------------------------------")

# F1 of a single model (the first one in the list above); being the last
# expression of the cell, its value is shown as the cell result.
modelName1 = modelName_list[0]
get_f1_by_modelName(modelName1, y_true=devY)


rank 1
modelName: encoder_2e_withsaving_pos
f1= 0.8859607657764123

rank 2
modelName: encoder_2e_withsaving_again
f1= 0.880151139449758
------------------------------------------------------------


0.880151139449758

## Demo  - report class object

In [232]:
# Re-execute evaluation_helper so that edits to its source are picked up.
reload(evaluation_helper)
# Must follow the reload: rebinds the name to the class from the reloaded module.
from evaluation_helper import EvalDev_Report

In [233]:
modelName = 'encoder_2e_withsaving_again'

# Dev-set outputs stored for `modelName`, in the order the loader returns them.
(dev_raw_y_pred,
 dev_raw_y_pred_decoder_embeddings,
 dev_y_pred) = loadDevPredictionsData(modelName)

# Print the shape of each array, one per line:
#   devY                               gold labels
#   dev_raw_y_pred                     raw predictions made by a trained model on the dev set
#   dev_y_pred                         dev prediction labels
#   dev_raw_y_pred_decoder_embeddings  decoder dev predictions when the decoder is on;
#                                      empty when the decoder is off
for _array_name, _array in (
    ("devY", devY),
    ("dev_raw_y_pred", dev_raw_y_pred),
    ("dev_y_pred", dev_y_pred),
    ("dev_raw_y_pred_decoder_embeddings", dev_raw_y_pred_decoder_embeddings),
):
    print(_array_name, _array.shape)

devY (51362,)
dev_raw_y_pred (51362, 8)
dev_y_pred (51362,)
dev_raw_y_pred_decoder_embeddings (51362, 50)


In [234]:
%%capture
# construct report object
report_obj = evaluation_helper.EvalDev_Report(modelName=modelName, y_true=devY, raw_y_pred=dev_raw_y_pred, y_pred=dev_y_pred) 
report_obj.connect_to_dataClass(vocabData)
report_obj.connect_to_devData(devData=(devX, devX_pos, devX_capitals, devY))

# just FYI -- can ignore
report_obj.recall
report_obj.precision
report_obj.f1
report_obj.gold_cts # gold label distribution
report_obj.pred_cts # predicted label distribution
# when gold is "O", but model thinks there is a NER tag
report_obj.hallucination_idx 
# when gold is a NER tag, but model think it is "O"
report_obj.missed_ner_idx
# when both model and gold indicate a NER tag, and the tags matches
report_obj.match_ner_idx
# when both model and gold indicate a NER tag, and the tags mismatch
report_obj.mismatch_ner_idx 
# dictionary[gold_label][prediction_label] --> data indices
report_obj.gold_pred_idx_dict 
# dictionary[gold_label][prediction_label] --> count
report_obj.gold_pred_ct_dict 

In [235]:
# Print the model name, precision / recall / f1, and the gold vs. predicted NER label counts.
report_obj.print_brief_summary()

Model     encoder_2e_withsaving_again
Precision 0.8943011397720456
Recall    0.8664419388585377
f1 score  0.880151139449758

Gold NER label counts:
42759 : ['O'] 3
2092 : ['I-ORG'] 5
2094 : ['I-LOC'] 6
1264 : ['I-MISC'] 7
3149 : ['I-PER'] 4
4 : ['B-MISC'] 8

Predicted NER label counts:
43027 : ['O'] 3
2226 : ['I-LOC'] 6
1005 : ['I-MISC'] 7
3241 : ['I-PER'] 4
1863 : ['I-ORG'] 5


In [236]:
# Hallucinations: gold is "O", but the model predicted an NER tag.
# NOTE(review): the recorded run of this cell ends with
# "NameError: name 'return_list' is not defined" -- presumably raised inside
# print_idxlist_to_textlists when return_indices=False; fix it in
# evaluation_helper and re-run so the cell finishes cleanly.
report_obj.print_idxlist_to_textlists(idx_list=report_obj.hallucination_idx, return_indices=False)

indices counts = 158

ID 685
FEATURES:   "<unk>", NNP, allCaps
Gold NER    ['O']
Pred NER    ['I-ORG']
Text window ['cricket', '-', 'DGDGDGDG', '<unk>', '<unk>', '.', '</s>', '</s>', '</s>']
PoS window  ['NNP', ':', 'CD', 'NNP', 'NNP', '.', '</s>', '</s>', '</s>']
Caps window ['allCaps', 'noinfo', 'noinfo', 'allCaps', 'allCaps', 'noinfo', '</s>', '</s>', '</s>']

ID 1270
FEATURES:   "international", NNP, allCaps
Gold NER    ['O']
Pred NER    ['I-MISC']
Text window ['<s>', '<s>', 'basketball', '-', 'international', 'tournament', 'result', '.', '</s>']
PoS window  ['<s>', '<s>', 'NNP', ':', 'NNP', 'NNP', 'NNP', '.', '</s>']
Caps window ['<s>', '<s>', 'allCaps', 'noinfo', 'allCaps', 'allCaps', 'allCaps', 'noinfo', '</s>']

ID 1271
FEATURES:   "tournament", NNP, allCaps
Gold NER    ['O']
Pred NER    ['I-MISC']
Text window ['<s>', 'basketball', '-', 'international', 'tournament', 'result', '.', '</s>', '</s>']
PoS window  ['<s>', 'NNP', ':', 'NNP', 'NNP', 'NNP', '.', '</s>', '</s>']
Caps wi

NameError: name 'return_list' is not defined

In [237]:
# Misses: gold is an NER tag, but the model predicted "O".
# NOTE(review): the recorded run of this cell ends with
# "NameError: name 'return_list' is not defined" -- presumably raised inside
# print_idxlist_to_textlists when return_indices=False; fix it in
# evaluation_helper and re-run so the cell finishes cleanly.
report_obj.print_idxlist_to_textlists(idx_list=report_obj.missed_ner_idx, return_indices=False)

indices counts = 426

ID 224
FEATURES:   "such", JJ, upperInitial
Gold NER    ['I-PER']
Pred NER    ['O']
Text window ['<unk>', 'advantage', 'but', 'off-spinner', 'such', 'had', '<unk>', 'their', 'hopes']
PoS window  ['JJ', 'NN', 'CC', 'JJ', 'JJ', 'VBD', 'VBN', 'PRP$', 'NNS']
Caps window ['noinfo', 'lowercase', 'lowercase', 'noinfo', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 350
FEATURES:   "australian", NNP, upperInitial
Gold NER    ['I-MISC']
Pred NER    ['O']
Text window ['<s>', '<s>', '<s>', '<s>', 'australian', 'tom', 'moody', 'took', 'six']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', 'NNP', 'VBD', 'CD']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase']

ID 421
FEATURES:   "<unk>", JJ, noinfo
Gold NER    ['I-MISC']
Pred NER    ['O']
Text window ['from', 'paul', 'johnson', 'but', '<unk>', 'fast', 'bowler', 'martin', '<unk>']
PoS window  ['IN', 'NNP', 'NNP', 'CC', 'JJ', 'JJ', 'N


ID 46207
FEATURES:   "in", IN, lowercase
Gold NER    ['I-ORG']
Pred NER    ['O']
Text window ['pressure', 'group', 'human', 'rights', 'in', 'china', 'said', 'on', 'saturday']
PoS window  ['NN', 'NN', 'NNP', 'NNPS', 'IN', 'NNP', 'VBD', 'IN', 'NNP']
Caps window ['lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', 'upperInitial', 'lowercase', 'lowercase', 'upperInitial']

ID 46299
FEATURES:   "DGDGDGDG-DGDG", CD, noinfo
Gold NER    ['I-MISC']
Pred NER    ['O']
Text window ['<unk>', 'from', 'the', '<unk>', 'DGDGDGDG-DGDG', 'cultural', 'revolution', '--', 'or']
PoS window  ['NN', 'IN', 'DT', 'JJ', 'CD', 'NNP', 'NNP', ':', 'CC']
Caps window ['lowercase', 'lowercase', 'lowercase', 'lowercase', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'lowercase']

ID 46506
FEATURES:   "york-based", JJ, noinfo
Gold NER    ['I-MISC']
Pred NER    ['O']
Text window ['<unk>', ',', 'the', 'new', 'york-based', 'pressure', 'group', 'human', 'rights']
PoS window  ['NNP', ',', 'DT', 'NNP'

NameError: name 'return_list' is not defined

In [238]:
# Matches: model and gold both indicate an NER tag, and the tags agree.
# NOTE(review): the recorded run of this cell ends with
# "NameError: name 'return_list' is not defined" -- presumably raised inside
# print_idxlist_to_textlists when return_indices=False; fix it in
# evaluation_helper and re-run so the cell finishes cleanly.
# The recorded run reports 7454 indices, so this cell prints a very long listing.
report_obj.print_idxlist_to_textlists(idx_list=report_obj.match_ner_idx, return_indices=False)

indices counts = 7454

ID 11
FEATURES:   "london", NNP, allCaps
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'london', 'DGDGDGDG-DGDG-DGDG', '</s>', '</s>', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'CD', '</s>', '</s>', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'allCaps', 'noinfo', '</s>', '</s>', '</s>']

ID 14
FEATURES:   "indian", NNP, upperInitial
Gold NER    ['I-MISC']
Pred NER    ['I-MISC']
Text window ['<s>', '<s>', '<s>', 'west', 'indian', 'all-rounder', 'phil', 'simmons', 'took']
PoS window  ['<s>', '<s>', '<s>', 'NNP', 'NNP', 'NN', 'NNP', 'NNP', 'VBD']
Caps window ['<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'lowercase']

ID 16
FEATURES:   "phil", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', 'west', 'indian', 'all-rounder', 'phil', 'simmons', 'took', 'four', 'for']
PoS window  ['<s>', 'NNP', 'NNP', 'NN', 'NNP', 'NNP', 'VBD', 'CD

FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<unk>', ',', '"', '<unk>', '<unk>', 'association', 'president', 'ramon', '<unk>']
PoS window  ['NNS', ',', '"', 'JJ', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP']
Caps window ['lowercase', 'noinfo', 'noinfo', 'upperInitial', 'upperInitial', 'upperInitial', 'upperInitial', 'upperInitial', 'upperInitial']

ID 1706
FEATURES:   "ramon", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<unk>', '<unk>', 'association', 'president', 'ramon', '<unk>', 'said', '.', '</s>']
PoS window  ['JJ', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'VBD', '.', '</s>']
Caps window ['upperInitial', 'upperInitial', 'upperInitial', 'upperInitial', 'upperInitial', 'upperInitial', 'lowercase', 'noinfo', '</s>']

ID 1707
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<unk>', 'association', 'president', 'ramon', '<unk>', 'said', '.', '</s>', '</s>']
PoS window  ['N

FEATURES:   "houston", NNP, allCaps
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', 'chicago', 'DG', 'houston', 'DG', '</s>', '</s>', '</s>']
PoS window  ['<s>', '<s>', 'NNP', 'CD', 'NNP', 'CD', '</s>', '</s>', '</s>']
Caps window ['<s>', '<s>', 'upperInitial', 'noinfo', 'allCaps', 'noinfo', '</s>', '</s>', '</s>']

ID 2968
FEATURES:   "cincinnati", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', '<s>', '<s>', 'cincinnati', 'DGDG', 'colorado', 'DG', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'CD', 'NNP', 'CD', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'noinfo', 'allCaps', 'noinfo', '</s>']

ID 2970
FEATURES:   "colorado", NNP, allCaps
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', 'cincinnati', 'DGDG', 'colorado', 'DG', '</s>', '</s>', '</s>']
PoS window  ['<s>', '<s>', 'NNP', 'CD', 'NNP', 'CD', '</s>', '</s>', '</s>']
Caps window ['<s>', '<s>', 'upperInitial', 'n

PoS window  ['<s>', '<s>', 'NNP', 'NNP', 'NNP', '(', 'NNP', ')', 'CD']
Caps window ['<s>', '<s>', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'noinfo']

ID 5412
FEATURES:   "u.s.", NNP, noinfo
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['DG.', 'gwen', 'torrence', '(', 'u.s.', ')', 'DGDG.DGDG', '</s>', '</s>']
PoS window  ['NNP', 'NNP', 'NNP', '(', 'NNP', ')', 'CD', '</s>', '</s>']
Caps window ['noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'noinfo', '</s>', '</s>']

ID 5416
FEATURES:   "mary", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', '<s>', 'DG.', 'mary', 'onyali', '(', 'nigeria', ')']
PoS window  ['<s>', '<s>', '<s>', 'CD', 'NNP', 'NNP', '(', 'NNP', ')']
Caps window ['<s>', '<s>', '<s>', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo']

ID 5417
FEATURES:   "onyali", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>

FEATURES:   "muralitharan", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', 'DG-DG-DGDG-DG', ',', 'muralitharan', 'DGDG-DG-DGDG-DG', ',', 'jayasuriya', 'DGDG-DG-DGDG-DG']
PoS window  ['<s>', '<s>', 'CD', ',', 'NNP', 'CD', ',', 'NNP', 'CD']
Caps window ['<s>', '<s>', 'noinfo', 'noinfo', 'upperInitial', 'noinfo', 'noinfo', 'upperInitial', 'noinfo']

ID 6911
FEATURES:   "jayasuriya", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window [',', 'muralitharan', 'DGDG-DG-DGDG-DG', ',', 'jayasuriya', 'DGDG-DG-DGDG-DG', ',', '<unk>', '</s>']
PoS window  [',', 'NNP', 'CD', ',', 'NNP', 'CD', ',', 'NNP', '</s>']
Caps window ['noinfo', 'upperInitial', 'noinfo', 'noinfo', 'upperInitial', 'noinfo', 'noinfo', 'upperInitial', '</s>']

ID 6914
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window [',', 'jayasuriya', 'DGDG-DG-DGDG-DG', ',', '<unk>', '</s>', '</s>', '</s>', '</s>']
PoS window  [',', 'NNP'

PoS window  ['POS', 'NN', 'NN', 'IN', 'NNP', ',', 'NN', 'NN', 'NN']
Caps window ['noinfo', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'lowercase', 'lowercase']

ID 10580
FEATURES:   "interfax", NN, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['plan', 'for', 'chechnya', ',', 'interfax', 'news', 'agency', 'said', '.']
PoS window  ['NN', 'IN', 'NNP', ',', 'NN', 'NN', 'NN', 'VBD', '.']
Caps window ['lowercase', 'lowercase', 'upperInitial', 'noinfo', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'noinfo']

ID 10586
FEATURES:   "lebed", NN, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', '<s>', '"', 'lebed', 'is', 'now', 'in', 'chechnya']
PoS window  ['<s>', '<s>', '<s>', '"', 'NN', 'VBZ', 'RB', 'IN', 'NNP']
Caps window ['<s>', '<s>', '<s>', 'noinfo', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'upperInitial']

ID 10590
FEATURES:   "chechnya", NNP, upperInitial
Gold NER    [

Caps window ['lowercase', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 13867
FEATURES:   "brunswijk", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', '<s>', '<s>', 'brunswijk', ',', 'who', 'led', 'a']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', ',', 'WP', 'VBD', 'DT']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'noinfo', 'lowercase', 'lowercase', 'lowercase']

ID 13879
FEATURES:   "desi", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['the', 'military', 'regime', 'of', 'desi', 'bouterse', 'in', 'the', 'late']
PoS window  ['DT', 'JJ', 'NN', 'IN', 'NNP', 'NNP', 'IN', 'DT', 'JJ']
Caps window ['lowercase', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'lowercase']

ID 13880
FEATURES:   "bouterse", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['military', 'regime

Pred NER    ['I-ORG']
Text window ['<unk>', '<unk>', 'of', 'boston-based', 'reagan', 'communications', 'that', 'represents', 'two']
PoS window  ['NNP', 'NNP', 'IN', 'JJ', 'NNP', 'NNP', 'WDT', 'VBZ', 'CD']
Caps window ['upperInitial', 'upperInitial', 'lowercase', 'noinfo', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'lowercase']

ID 18961
FEATURES:   "communications", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<unk>', 'of', 'boston-based', 'reagan', 'communications', 'that', 'represents', 'two', 'of']
PoS window  ['NNP', 'IN', 'JJ', 'NNP', 'NNP', 'WDT', 'VBZ', 'CD', 'IN']
Caps window ['upperInitial', 'lowercase', 'noinfo', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 19105
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['port', 'on', 'thursday', ',', '<unk>', 'said', '.', '</s>', '</s>']
PoS window  ['NN', 'IN', 'NNP', ',', 'NNP', 'VBD', '.', '</s>', 

PoS window  ['NNP', 'NNP', ',', 'DT', 'JJ', 'NN', 'WP', 'VBZ', 'DT']
Caps window ['upperInitial', 'upperInitial', 'noinfo', 'lowercase', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 23962
FEATURES:   "osce", RB, allCaps
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['diplomat', 'who', 'heads', 'the', 'osce', 'chechnya', 'mission', ',', 'saying']
PoS window  ['NN', 'WP', 'VBZ', 'DT', 'RB', 'NNP', 'NN', ',', 'VBG']
Caps window ['lowercase', 'lowercase', 'lowercase', 'lowercase', 'allCaps', 'upperInitial', 'lowercase', 'noinfo', 'lowercase']

ID 23963
FEATURES:   "chechnya", NNP, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['who', 'heads', 'the', 'osce', 'chechnya', 'mission', ',', 'saying', 'he']
PoS window  ['WP', 'VBZ', 'DT', 'RB', 'NNP', 'NN', ',', 'VBG', 'PRP']
Caps window ['lowercase', 'lowercase', 'lowercase', 'allCaps', 'upperInitial', 'lowercase', 'noinfo', 'lowercase', 'lowercase']

ID 23972
FEATURES:   "<unk>", NNP, 

Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['DGDGDG', 'jose', 'coceres', '(', 'argentina', ')', 'DGDG', 'DGDG', 'DGDG']
PoS window  ['CD', 'NNP', 'NNPS', '(', 'NNP', ')', 'CD', 'CD', 'CD']
Caps window ['noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'noinfo']

ID 27482
FEATURES:   "joakim", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', '<s>', 'DGDGDG', 'joakim', 'haeggman', '(', 'sweden', ')']
PoS window  ['<s>', '<s>', '<s>', 'CD', 'NNP', 'NNP', '(', 'NNP', ')']
Caps window ['<s>', '<s>', '<s>', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'noinfo']

ID 27483
FEATURES:   "haeggman", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', 'DGDGDG', 'joakim', 'haeggman', '(', 'sweden', ')', 'DGDG']
PoS window  ['<s>', '<s>', 'CD', 'NNP', 'NNP', '(', 'NNP', ')', 'CD']
Caps window ['<s>', '<s>', 'noinfo', 'upperInitial', 'upperIn

PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', ':', 'NNP', 'NNP', '(']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo']

ID 28753
FEATURES:   "mike", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', '<s>', 'england', ':', 'mike', 'atherton', '(', 'captain', ')']
PoS window  ['<s>', '<s>', 'NNP', ':', 'NNP', 'NNP', '(', 'NN', ')']
Caps window ['<s>', '<s>', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'lowercase', 'noinfo']

ID 28754
FEATURES:   "atherton", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<s>', 'england', ':', 'mike', 'atherton', '(', 'captain', ')', ',']
PoS window  ['<s>', 'NNP', ':', 'NNP', 'NNP', '(', 'NN', ')', ',']
Caps window ['<s>', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'lowercase', 'noinfo', 'noinfo']

ID 28759
FEATURES:   "nick", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER'

Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'london', 'DGDGDGDG-DGDG-DGDG', '</s>', '</s>', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', '</s>', '</s>', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'allCaps', 'noinfo', '</s>', '</s>', '</s>']

ID 30786
FEATURES:   "english", JJ, upperInitial
Gold NER    ['I-MISC']
Pred NER    ['I-MISC']
Text window ['<s>', '<s>', '<s>', '<s>', 'english', 'soccer', 'league', 'standings', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'JJ', 'NN', 'NN', 'NNS', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', '</s>']

ID 30816
FEATURES:   "stoke", VB, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['<s>', '<s>', '<s>', '<s>', 'stoke', 'DG', 'DG', 'DG', 'DG']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'VB', 'CD', 'CD', 'CD', 'CD']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'no

PoS window  [':', 'NNP', 'NNP', '(', 'NNP', 'NNP', 'NNP', ',', 'CD']
Caps window ['noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'noinfo']

ID 32770
FEATURES:   "van", NNP, lowercase
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['mark', 'andrews', '(', 'fritz', 'van', '<unk>', ',', 'DGDG', ')']
PoS window  ['NNP', 'NNP', '(', 'NNP', 'NNP', 'NNP', ',', 'CD', ')']
Caps window ['upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'noinfo', 'noinfo']

ID 32771
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['andrews', '(', 'fritz', 'van', '<unk>', ',', 'DGDG', ')', ',']
PoS window  ['NNP', '(', 'NNP', 'NNP', 'NNP', ',', 'CD', ')', ',']
Caps window ['upperInitial', 'noinfo', 'upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'noinfo', 'noinfo', 'noinfo']

ID 32778
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pr

PoS window  ['VBN', 'IN', 'DT', 'NNP', 'NNP', 'IN', 'NNP', ':', '</s>']
Caps window ['lowercase', 'lowercase', 'lowercase', 'noinfo', 'upperInitial', 'lowercase', 'upperInitial', 'noinfo', '</s>']

ID 35671
FEATURES:   "united", NNP, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'united', 'states', 'at', 'netherlands', '</s>']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', 'IN', 'NNPS', '</s>']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'lowercase', 'upperInitial', '</s>']

ID 35672
FEATURES:   "states", NNP, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', 'united', 'states', 'at', 'netherlands', '</s>', '</s>']
PoS window  ['<s>', '<s>', '<s>', 'NNP', 'NNP', 'IN', 'NNPS', '</s>', '</s>']
Caps window ['<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'lowercase', 'upperInitial', '</s>', '</s>']

ID 35674
FEATURES:   "netherlands", NNPS, upperInitial
Go


ID 38140
FEATURES:   "mexico", NNP, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['cup', '<unk>', 'france', 'beat', 'mexico', 'DG-DG', 'in', 'a', 'friendly']
PoS window  ['NNP', 'VBZ', 'NNP', 'VBD', 'NNP', 'CD', 'IN', 'DT', 'JJ']
Caps window ['upperInitial', 'lowercase', 'upperInitial', 'lowercase', 'upperInitial', 'noinfo', 'lowercase', 'lowercase', 'lowercase']

ID 38155
FEATURES:   "france", NNP, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['extended', 'to', 'DGDG', 'matches', 'france', "'s", 'unbeaten', 'run', 'under']
PoS window  ['VBN', 'TO', 'CD', 'NNS', 'NNP', 'POS', 'JJ', 'NN', 'IN']
Caps window ['lowercase', 'lowercase', 'noinfo', 'lowercase', 'upperInitial', 'noinfo', 'lowercase', 'lowercase', 'lowercase']

ID 38161
FEATURES:   "aime", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['unbeaten', 'run', 'under', 'coach', 'aime', 'jacquet', ',', 'their', 'euro']
PoS window  ['JJ', 'NN', 'IN', 'NN', 'NN

FEATURES:   "liberal", NNP, upperInitial
Gold NER    ['I-MISC']
Pred NER    ['I-MISC']
Text window ['<s>', '<s>', '<s>', 'as', 'liberal', 'prime', 'minister', 'DGDGDGDG-DGDGDGDG', 'he']
PoS window  ['<s>', '<s>', '<s>', 'IN', 'NNP', 'JJ', 'NN', 'CD', 'PRP']
Caps window ['<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'noinfo', 'lowercase']

ID 40460
FEATURES:   "transvaal", NNP, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['granted', '<unk>', 'to', 'the', 'transvaal', '.', '</s>', '</s>', '</s>']
PoS window  ['VBD', 'JJ', 'TO', 'DT', 'NNP', '.', '</s>', '</s>', '</s>']
Caps window ['lowercase', 'noinfo', 'lowercase', 'lowercase', 'upperInitial', 'noinfo', '</s>', '</s>', '</s>']

ID 40466
FEATURES:   "house", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-ORG']
Text window ['he', 'also', 'got', 'the', 'house', 'of', 'lords', 'to', 'pass']
PoS window  ['PRP', 'RB', 'VBD', 'DT', 'NNP', 'IN', 'NNPS', 'TO', 'VB']
Caps window 

Pred NER    ['I-PER']
Text window ['as', 'senior', 'leader', '<unk>', '<unk>', '<unk>', 'death', ',', '<unk>']
PoS window  ['IN', 'JJ', 'NN', 'NNP', 'NNP', 'VBZ', 'NN', ',', 'NNP']
Caps window ['lowercase', 'lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'noinfo', 'upperInitial']

ID 44743
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['<unk>', '<unk>', 'death', ',', '<unk>', 'said', 'in', 'an', 'interview']
PoS window  ['NNP', 'VBZ', 'NN', ',', 'NNP', 'VBD', 'IN', 'DT', 'NN']
Caps window ['upperInitial', 'lowercase', 'lowercase', 'noinfo', 'upperInitial', 'lowercase', 'lowercase', 'lowercase', 'lowercase']

ID 44750
FEATURES:   "china", NNP, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '"', 'china', 'is', 'going', 'through', 'this']
PoS window  ['<s>', '<s>', '<s>', '"', 'NNP', 'VBZ', 'VBG', 'IN', 'DT']
Caps window ['<s>', '<s>', '<s>', 'noinfo', 'upperIni

Text window ['<s>', 'more', 'than', 'DGDGDG', 'u.s.', 'warplanes', 'and', 'DGDG', 'ships']
PoS window  ['<s>', 'JJR', 'IN', 'CD', 'NNP', 'NN', 'CC', 'CD', 'NNS']
Caps window ['<s>', 'upperInitial', 'lowercase', 'noinfo', 'noinfo', 'lowercase', 'lowercase', 'noinfo', 'lowercase']

ID 48812
FEATURES:   "bill", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['saturday', 'in', 'case', 'president', 'bill', 'clinton', 'ordered', 'the', 'use']
PoS window  ['NNP', 'IN', 'NN', 'NNP', 'NNP', 'NNP', 'VBD', 'DT', 'NN']
Caps window ['upperInitial', 'lowercase', 'lowercase', 'upperInitial', 'upperInitial', 'upperInitial', 'lowercase', 'lowercase', 'lowercase']

ID 48813
FEATURES:   "clinton", NNP, upperInitial
Gold NER    ['I-PER']
Pred NER    ['I-PER']
Text window ['in', 'case', 'president', 'bill', 'clinton', 'ordered', 'the', 'use', 'of']
PoS window  ['IN', 'NN', 'NNP', 'NNP', 'NNP', 'VBD', 'DT', 'NN', 'IN']
Caps window ['lowercase', 'lowercase', 'upperInitial', 'upperI

NameError: name 'return_list' is not defined

In [239]:
# Mismatches: model and gold both indicate an NER tag, but the tags differ.
# NOTE(review): the recorded run of this cell ends with
# "NameError: name 'return_list' is not defined" -- presumably raised inside
# print_idxlist_to_textlists when return_indices=False; fix it in
# evaluation_helper and re-run so the cell finishes cleanly.
report_obj.print_idxlist_to_textlists(idx_list=report_obj.mismatch_ner_idx, return_indices=False)

indices counts = 723

ID 2
FEATURES:   "leicestershire", NNP, allCaps
Gold NER    ['I-ORG']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', 'cricket', '-', 'leicestershire', 'take', 'over', 'at', 'top']
PoS window  ['<s>', '<s>', 'NNP', ':', 'NNP', 'NNP', 'IN', 'NNP', 'NNP']
Caps window ['<s>', '<s>', 'allCaps', 'noinfo', 'allCaps', 'allCaps', 'allCaps', 'allCaps', 'allCaps']

ID 13
FEATURES:   "west", NNP, upperInitial
Gold NER    ['I-MISC']
Pred NER    ['I-LOC']
Text window ['<s>', '<s>', '<s>', '<s>', 'west', 'indian', 'all-rounder', 'phil', 'simmons']
PoS window  ['<s>', '<s>', '<s>', '<s>', 'NNP', 'NNP', 'NN', 'NNP', 'NNP']
Caps window ['<s>', '<s>', '<s>', '<s>', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial']

ID 27
FEATURES:   "somerset", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-LOC']
Text window ['friday', 'as', 'leicestershire', 'beat', 'somerset', 'by', 'an', 'innings', 'and']
PoS window  ['NNP', 'IN', 'NNP', 'VBD', 'NNP', 'IN', 'DT'

Text window ['liechtenstein', 'by', 'beating', 'the', '<unk>', '<unk>', 'DG-DG', 'in', 'a']
PoS window  ['NNP', 'IN', 'VBG', 'DT', 'NNP', 'NNS', 'CD', 'IN', 'DT']
Caps window ['upperInitial', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'noinfo', 'noinfo', 'lowercase', 'lowercase']

ID 27286
FEATURES:   "<unk>", JJ, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-ORG']
Text window ["'s", 'visit', 'to', 'the', '<unk>', 'stadium', 'DGDG', 'months', 'ago']
PoS window  ['POS', 'NN', 'TO', 'DT', 'JJ', 'NN', 'CD', 'NNS', 'RB']
Caps window ['noinfo', 'lowercase', 'lowercase', 'lowercase', 'upperInitial', 'lowercase', 'noinfo', 'lowercase', 'lowercase']

ID 27334
FEATURES:   "liechtenstein", NNP, upperInitial
Gold NER    ['I-LOC']
Pred NER    ['I-ORG']
Text window ['republic', 'of', 'ireland', 'beat', 'liechtenstein', 'DG-DG', '(', 'halftime', 'DG-DG']
PoS window  ['NNP', 'IN', 'NNP', 'VBD', 'NNP', 'CD', '(', 'NN', 'JJ']
Caps window ['upperInitial', 'lowercase', 'upperInitial', 'l

Text window ['chips', 'like', '<unk>', ',', 'bangladesh', '<unk>', ',', '<unk>', 'cement']
PoS window  ['NNS', 'IN', 'NNP', ',', 'NNP', 'NNP', ',', 'NNP', 'NNP']
Caps window ['lowercase', 'lowercase', 'allCaps', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial']

ID 51299
FEATURES:   "<unk>", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-LOC']
Text window ['like', '<unk>', ',', 'bangladesh', '<unk>', ',', '<unk>', 'cement', 'and']
PoS window  ['IN', 'NNP', ',', 'NNP', 'NNP', ',', 'NNP', 'NNP', 'CC']
Caps window ['lowercase', 'allCaps', 'noinfo', 'upperInitial', 'upperInitial', 'noinfo', 'upperInitial', 'upperInitial', 'lowercase']

ID 51304
FEATURES:   "atlas", NNP, upperInitial
Gold NER    ['I-ORG']
Pred NER    ['I-LOC']
Text window [',', '<unk>', 'cement', 'and', 'atlas', 'bangladesh', 'were', 'expected', 'to']
PoS window  [',', 'NNP', 'NNP', 'CC', 'NNP', 'NNP', 'VBD', 'VBN', 'TO']
Caps window ['noinfo', 'upperInitial', 'upperInitial', 'lowe

NameError: name 'return_list' is not defined

In [240]:
# Display the nested mapping; in the recorded output the keys are integer
# label ids and the leaf values are arrays of data indices.
report_obj.gold_pred_idx_dict # dictionary[gold_label][prediction_label] --> data indices

defaultdict(<function evaluation_helper.EvalDev_Report.get_gold_pred_idx_dict.<locals>.<lambda>()>,
            {3: defaultdict(list,
                         {3: array([    0,     1,     3, ..., 51357, 51358, 51361]),
                          4: array([ 3768,  4673,  7027,  8687,  9358, 11144, 11393, 11571, 12038,
                                 12877, 13964, 14422, 14693, 17451, 18681, 19431, 20193, 21395,
                                 21735, 23921, 25695, 25829, 28205, 28828, 32072, 32344, 34293,
                                 40273, 40333, 40395, 40434, 40445, 40454, 40580, 40616, 40749,
                                 48087, 48116, 48286, 48287]),
                          5: array([  685,  1506,  2993,  4577,  6147,  7489,  7581,  7607,  8188,
                                  9402,  9406,  9436,  9457, 12996, 12997, 13000, 13154, 13169,
                                 14095, 14475, 15786, 17165, 17431, 20124, 20175, 20212, 20660,
                                 20715, 

## Ignore this section

In [10]:
# Decoder targets: the embedding vector of each window's centre word
# (50-dimensional according to the original note on this cell).
# NOTE(review): `embedding_matrix` is not defined in the visible part of this
# notebook -- it must already exist in the kernel before this cell is run.

# Column of the centre word inside a window. Derived from `windowLength`
# instead of the previously hard-coded 4, so it cannot go stale if the window
# width is changed (9 // 2 == 4 for the current setting).
# Assumes an odd window width with the target word in the middle -- TODO
# confirm against loadutils.formatWindowedData.
center_idx = windowLength // 2

train_decoderY = embedding_matrix[trainX[:, center_idx]]
dev_decoderY = embedding_matrix[devX[:, center_idx]]
test_decoderY = embedding_matrix[testX[:, center_idx]]

In [11]:
# One-hot encode the PoS-tag windows.
# NOTE(review): `to_categorical` is not imported in the visible part of this
# notebook (presumably keras.utils.to_categorical) -- confirm it is in scope.

# The class count is taken from the training split and reused for dev/test so
# all three arrays share the same last-axis width.
trainX_pos_cat = to_categorical(trainX_pos.astype('float32'))
num_pos_classes = trainX_pos_cat.shape[2]
devX_pos_cat = to_categorical(devX_pos.astype('float32'), num_classes=num_pos_classes)
testX_pos_cat = to_categorical(testX_pos.astype('float32'), num_classes=num_pos_classes)

# Drop the first three one-hot columns (presumably the reserved ids 0-2 --
# TODO confirm against loadutils) and store the result as float64.
# `np.float` was only an alias of the builtin float and was removed in
# NumPy 1.24, so the explicit np.float64 dtype is used instead; one slice on
# the last axis replaces the per-window map/lambda rebuild.
trainX_pos_cat = trainX_pos_cat[:, :, 3:].astype(np.float64)
devX_pos_cat = devX_pos_cat[:, :, 3:].astype(np.float64)
testX_pos_cat = testX_pos_cat[:, :, 3:].astype(np.float64)

In [12]:
# One-hot encode the capitalization windows
# ("allCaps", "upperInitial", "lowercase", "mixedCaps", "noinfo").
# NOTE(review): `to_categorical` is not imported in the visible part of this
# notebook (presumably keras.utils.to_categorical) -- confirm it is in scope.

# The class count is taken from the training split and reused for dev/test so
# all three arrays share the same last-axis width.
trainX_capitals_cat = to_categorical(trainX_capitals.astype('float32'))
num_capitals_classes = trainX_capitals_cat.shape[2]
devX_capitals_cat = to_categorical(devX_capitals.astype('float32'), num_classes=num_capitals_classes)
testX_capitals_cat = to_categorical(testX_capitals.astype('float32'), num_classes=num_capitals_classes)

# Drop the first three one-hot columns (presumably the reserved ids 0-2 --
# TODO confirm against loadutils) and store the result as float64.
# `np.float` was only an alias of the builtin float and was removed in
# NumPy 1.24, so the explicit np.float64 dtype is used instead; one slice on
# the last axis replaces the per-window map/lambda rebuild.
trainX_capitals_cat = trainX_capitals_cat[:, :, 3:].astype(np.float64)
devX_capitals_cat = devX_capitals_cat[:, :, 3:].astype(np.float64)
testX_capitals_cat = testX_capitals_cat[:, :, 3:].astype(np.float64)