# Evaluation Scripts Development

In [30]:
import spacy
import codecs
import torch 

import numpy as np 

from tokenizations import get_alignments
from transformers import BertTokenizer, BertConfig, BertModel
from tqdm import tqdm
from collections import Counter

import sys 
sys.path.append('..')
from frtorch import LinearChainCRF
from data_utils import News20Data

In [86]:
from sklearn.metrics import v_measure_score

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def read_data(data_path='../../data/news/'):
    """Read 20news data, train only"""
    # use the cased data for NER, otherwise spacy does not work with uncased 
    with codecs.open(data_path + '20news.txt', encoding='utf-8') as fd:
        data = fd.readlines()
    idx = np.load(data_path + 'dev_idx.npy')
    data = [data[i][: -1] for i in idx]
    return data

dev_data = read_data()

## Get tag from Spacy

In [4]:
nlp = spacy.load("en_core_web_sm", disable='parser')

In [7]:
pos_tags = []
ent_tags = []
spacy_tokenized = []

for s in tqdm(dev_data):
    doc = nlp(s)
    tokens = []
    pos = []
    ent = []
    for token in doc:
        tokens.append(token.text)
        pos.append(token.pos)
        ent.append(token.ent_type)
        
    spacy_tokenized.append(tokens)
    pos_tags.append(pos)
    ent_tags.append(ent)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26141/26141 [02:52<00:00, 151.18it/s]


## Load model


In [5]:
ckpt_path = '/home/s1946695/Scale-CRF-Latent-Space/models/bertnet_0.0.6.1/ckpt-e16.pt'
ckpt = torch.load(ckpt_path)
state_matrix = ckpt['state_matrix'].to('cuda')
bert_config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased').to('cuda')
crf = LinearChainCRF()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Use model to infer latent tags

In [6]:
s = dev_data[100]
inputs = tokenizer(s, return_tensors='pt')

In [7]:
tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

['[CLS]',
 '8',
 ',',
 'st',
 'louis',
 ',',
 'shan',
 '##aha',
 '##n',
 '51',
 '(',
 'emerson',
 ')',
 '19',
 ':',
 '38',
 '.',
 '[SEP]']

In [8]:
inputs

{'input_ids': tensor([[  101,  1022,  1010,  2358,  3434,  1010, 17137, 23278,  2078,  4868,
          1006, 12628,  1007,  2539,  1024,  4229,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
x_emb = bert(inputs['input_ids'].to('cuda'), attention_mask=inputs['attention_mask'].to('cuda'))[0]

In [10]:
with torch.no_grad():
    transition = torch.matmul(state_matrix, state_matrix.transpose(1, 0))
    emission = torch.matmul(x_emb, state_matrix.transpose(1, 0))
    lens = inputs['attention_mask'].to('cuda').sum(-1)
    tags, _, s, bp, log_potentials = crf.argmax(transition, emission, lens)
    _, _, _, z_sample, _, _, _ = crf.rsample_approx(state_matrix, emission, lens, sum_size=50, proposal='softmax')

In [52]:
emission[0, 0].max()

tensor(6.2374, device='cuda:0')

In [86]:
s[0, 1]

tensor([12.7840, 13.9860, 12.5139,  ..., 13.6161, 12.7051, 14.9294],
       device='cuda:0')

In [87]:
s[0, 1].argsort(descending=True)

tensor([ 470, 1662,  533,  ..., 1371, 1174,  631], device='cuda:0')

In [88]:
s[0, 1, s[0, 1].argsort(descending=True)]

tensor([77.6142, 50.8637, 49.3307,  ..., 11.1462, 11.1028, 10.7002],
       device='cuda:0')

In [95]:
s_ = (s[:, 0].unsqueeze(2) + log_potentials[:, 1]).max(1)[0]

In [97]:
s_.argmax()

tensor(470, device='cuda:0')

In [79]:
log_potentials[0, 1, :, 470]

tensor([-8.0679, -9.0833, -8.5773,  ..., -8.1764, -8.6579, -5.2877],
       device='cuda:0')

In [35]:
emission.argmax(-1)

tensor([[1565, 1575, 1246,  380,  638, 1246,  648,  346,  752,  896, 1838,  638,
         1924,  443, 1652, 1977, 1879,  184]], device='cuda:0')

In [11]:
tags

tensor([[470, 470, 470, 470, 470, 470, 470, 470, 470, 470, 470, 470, 470, 470,
         470, 470, 470, 470]], device='cuda:0')

In [12]:
z_sample

tensor([[1565,   55, 1246,  736,  638, 1246,  648,  648,  752,  896, 1838, 1565,
         1924,  443, 1857, 1418,  228,  151]], device='cuda:0')

## Align Latent BERT Tokenization with Spacy Tokenization

In [13]:
s = dev_data[100]
doc = nlp(s)
spacy_tokenized = []
bert_tokenized = []
for token in doc:
    spacy_tokenized.append(token.text)
    bert_tokenized = tokenizer.tokenize(s)
    bert2spacy, spacy2bert = get_alignments(bert_tokenized, spacy_tokenized)

In [17]:
bert_tokenized

['8',
 ',',
 'st',
 'louis',
 ',',
 'shan',
 '##aha',
 '##n',
 '51',
 '(',
 'emerson',
 ')',
 '19',
 ':',
 '38',
 '.']

In [18]:
spacy_tokenized

['8',
 ',',
 'st',
 'louis',
 ',',
 'shanahan',
 '51',
 '(',
 'emerson',
 ')',
 '19',
 ':',
 '38',
 '.']

In [16]:
bert2spacy

[[0],
 [1],
 [2],
 [3],
 [4],
 [5],
 [5],
 [5],
 [6],
 [7],
 [8],
 [9],
 [10],
 [11],
 [12],
 [13]]

In [178]:
bert_tokenized_all = []
spacy_tokenized_all = []
ent_tags_all = []
pos_tags_all = []
pos_fine_tags_all = []
bert_to_spacy_all = []
spacy_to_bert_all = []

pos_word_dict = {}
pos_fine_word_dict = {}
ent_word_dict = {}

id_to_pos = {}
id_to_fine_pos = {}
id_to_ent = {}
for s in tqdm(dev_data):
    s_bert = tokenizer(s)
    if(len(s_bert['input_ids']) <= 2): 
        # print('!')
        # print(s)
        continue # pass empty strings
    
    doc = nlp(s)

    # get pos, ner, token with spacy
    # TODO: need to decide if pos_word_dict need to convert tokenization to BERT -- currently do not convert
    spacy_tokenized = []
    pos_tags = []
    pos_fine_tags = []
    ent_tags = []
    for token in doc:
        if(token.pos_ not in pos_word_dict):
            pos_word_dict[token.pos_] = [token.text]
        else:
            pos_word_dict[token.pos_].append(token.text)
        if(token.tag_ not in pos_fine_word_dict):
            pos_fine_word_dict[token.tag_] = [token.text]
        else:
            pos_fine_word_dict[token.tag_].append(token.text)
        spacy_tokenized.append(token.text)
        pos_tags.append(token.pos)
        id_to_pos[token.pos] = token.pos_
        ent_tags.append(token.ent_type)
        id_to_ent[token.ent_type] = token.ent_type_
        pos_fine_tags.append(token.tag)
        id_to_fine_pos[token.tag] = token.tag_
        
    ent_tags_all.append(ent_tags)
    pos_tags_all.append(pos_tags)
    pos_fine_tags_all.append(pos_fine_tags)

    for ent in doc.ents:
        if(ent.label_ not in ent_word_dict): ent_word_dict[ent.label_] = [ent.text]
        else: ent_word_dict[ent.label_].append(ent.text)

    # get bert tokenization
    bert_tokenized = tokenizer.tokenize(s)

    bert2spacy, spacy2bert = get_alignments(bert_tokenized, spacy_tokenized)
    bert_tokenized_all.append(bert_tokenized)
    spacy_tokenized_all.append(spacy_tokenized)
    bert_to_spacy_all.append(bert2spacy)
    spacy_to_bert_all.append(spacy2bert)
    
for k in pos_word_dict:
    pos_word_dict[k] = Counter(pos_word_dict[k])   
for k in pos_fine_word_dict:
    pos_fine_word_dict[k] = Counter(pos_fine_word_dict[k])
for k in ent_word_dict:
    ent_word_dict[k] = Counter(ent_word_dict[k])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26141/26141 [02:55<00:00, 148.74it/s]


In [144]:
def compute_representative_words(tag_word_dict, thres=0.9):
    tag_word_dict_repr = {}
        
    for k in tag_word_dict:
        total_freq = 0.
        for w in tag_word_dict[k]: total_freq += tag_word_dict[k][w]
        cumsum = 0
        tag_word_dict_repr[k] = {}
        for w in tag_word_dict[k]:
            tag_word_dict_repr[k][w] = tag_word_dict[k][w]
            cumsum += tag_word_dict[k][w]
            if(cumsum / total_freq > thres): break
            # print(cumsum / total_freq)
    return tag_word_dict_repr

In [145]:
pos_word_dict_repr = compute_representative_words(pos_word_dict)
pos_fine_word_dict_repr = compute_representative_words(pos_fine_word_dict)
ent_word_dict_repr = compute_representative_words(ent_word_dict)

In [203]:
pos_word_dict_repr.keys()

dict_keys(['PRON', 'AUX', 'ADJ', 'PUNCT', 'DET', 'NOUN', 'ADP', 'CCONJ', 'ADV', 'VERB', 'NUM', 'PART', 'PROPN', 'SCONJ', 'SYM', 'X', 'INTJ'])

In [146]:
ent_word_dict_repr

{'CARDINAL': {'4': 148,
  '3': 281,
  '70': 5,
  '156': 2,
  '804': 1,
  '38': 16,
  '958': 1,
  '300 fifth': 1,
  '5': 154,
  '1': 369,
  '12': 65,
  'n1': 1,
  '8': 112,
  '51': 6,
  '19': 35,
  '73': 2,
  '331 334': 1,
  'two': 268,
  '82': 5,
  '680x0': 3,
  '40': 18,
  'eight': 6,
  '4 11': 1,
  '3b': 2,
  '28': 14,
  '15430': 1,
  'one': 397,
  '216': 2,
  '368': 2,
  'dozens': 3,
  'thousands': 22,
  '10': 62,
  '05': 5,
  '33': 12,
  'half': 27,
  '508': 3,
  '2': 360,
  '11': 56,
  '13': 31,
  '7': 85,
  'as much as 6': 1,
  '0': 73,
  '215': 2,
  '358': 2,
  '800 753': 1,
  '62': 2,
  'more than 1440k': 1,
  '48': 15,
  '15': 55,
  'three': 101,
  '#': 60,
  '1 6': 2,
  '3 2': 1,
  '6 1': 1,
  '1 2': 2,
  '1 4': 3,
  'four': 44,
  '5 million': 6,
  'six': 14,
  '296': 1,
  '350': 5,
  '43': 10,
  '129': 6,
  '89': 6,
  '602': 4,
  '66mhz': 1,
  '6': 138,
  '185 / 65hr390': 1,
  '9': 62,
  'about an 80': 1,
  '145': 2,
  'more than one': 11,
  '37': 8,
  '117': 4,
  '000': 74,

## Decode Latent Tags

Use sample for now. TODO: update the Viterbi algorithm

In [31]:
dataset = News20Data(data_path='/home/s1946695/RDP/data/news/')
dev_loader = dataset.val_dataloader()

Processing dataset ...
Reading data ...
... 0 seconds
Tokenizing and sorting train data ...
... 66 seconds
Tokenizing dev data ...
... 10 seconds
Tokenizing test data ...
... 19 seconds


In [95]:
latent_tags = []
for batch in tqdm(dev_loader):
    batch_tags = []
    with torch.no_grad():
        x_emb = bert(batch['input_ids'].to('cuda'), attention_mask=batch['attention_mask'].to('cuda'))[0]
        transition = torch.matmul(state_matrix, state_matrix.transpose(1, 0))
        emission = torch.matmul(x_emb, state_matrix.transpose(1, 0))
        lens = batch['attention_mask'].to('cuda').sum(-1)
        # tags, _, s, bp, log_potentials = crf.argmax(transition, emission, lens) # TBC
        _, _, _, z_sample, _, _, _ = crf.rsample_approx(state_matrix, emission, lens, sum_size=50, proposal='softmax')
        z_sample = z_sample.cpu().numpy()
        lens = lens.cpu().numpy()
        for li, l in enumerate(lens):
            latent_tags.append(z_sample[li][1:l-1])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2613/2613 [02:57<00:00, 14.72it/s]


In [96]:
latent_tags_spacy = []
for bert2spacy, tags in tqdm(zip(bert_to_spacy_all, latent_tags)):
    prev_spacy_idx = -1
    tags_converted = []
    assert(len(bert2spacy) == len(tags))
    for bi, (si_, t) in enumerate(zip(bert2spacy, tags)):
        for si in si_:
            assert(si == prev_spacy_idx or si == prev_spacy_idx + 1)
            # if many consequtive BERT token correspond to the same spacy token, 
            # then only use the tag for the first bert token
            if(si == prev_spacy_idx + 1): 
                prev_spacy_idx += 1
                tags_converted.append(t)
    latent_tags_spacy.append(tags_converted)

26121it [00:00, 65428.33it/s]


In [152]:
len(spacy_tokenized)

17

In [153]:
latent_word_dict = {}
for latent, tokens in tqdm(zip(latent_tags_spacy, spacy_tokenized_all)):
    for tag, tok in zip(latent, tokens):
        if(tag not in latent_word_dict): latent_word_dict[tag] = [tok]
        else: latent_word_dict[tag].append(tok)
for k in latent_word_dict: latent_word_dict[k] = Counter(latent_word_dict[k])
latent_word_dict_repr = compute_representative_words(latent_word_dict)

26121it [00:00, 118549.50it/s]


In [154]:
latent_word_dict_repr

{335: {'you': 1047},
 110: {"'re": 27, '.': 1794, 'mine': 2, ')': 307},
 1094: {'right': 257,
  'help': 168,
  'service': 4,
  'house': 1,
  'ok': 8,
  'good': 3,
  'advice': 17,
  'backup': 1,
  'front': 3,
  'rights': 56,
  'input': 2,
  'follow': 1,
  'normal': 3,
  'talent': 3,
  'creation': 1,
  'support': 13,
  'proper': 4,
  'assist': 3,
  'save': 5,
  'righthanded': 1,
  'liberties': 3,
  'power': 32,
  'funds': 1,
  'wrong': 66,
  'insight': 1,
  'thanks': 60,
  'helper': 1,
  'contact': 3,
  'protection': 3,
  'faults': 1,
  'rental': 1,
  'helps': 9,
  'helped': 9,
  'to': 2,
  'hospital': 2,
  'fair': 6,
  '.': 4,
  'powers': 3,
  'she': 1,
  'advocacy': 1,
  'resource': 6,
  'charitable': 1,
  'knowledge': 1,
  "devil'and": 1,
  'connections': 1,
  'correctly': 6,
  'means': 2,
  'credit': 5,
  'helpful': 5,
  'forces': 1,
  'safety': 3,
  'reached': 1,
  'fine': 11,
  'tools': 2,
  'option': 1,
  'values': 2,
  'usefulness': 1,
  'ensure': 1,
  'freedom': 7,
  'yeah': 2,


## Compute V measure

### Inferred Tags

In [97]:
latent_tags_spacy_ = []
for l in latent_tags_spacy: latent_tags_spacy_.extend(l)
ent_tags_all_ = []
for l in ent_tags_all: ent_tags_all_.extend(l)
pos_tags_all_ = []
for l in pos_tags_all: pos_tags_all_.extend(l)
pos_fine_tags_all_ = []
for l in pos_fine_tags_all: pos_fine_tags_all_.extend(l)

In [98]:
v_measure_score(np.array(ent_tags_all_), np.array(latent_tags_spacy_))

0.048434564541327774

In [99]:
v_measure_score(np.array(pos_tags_all_), np.array(latent_tags_spacy_))

0.403168838843741

In [117]:
v_measure_score(np.array(pos_fine_tags_all_), np.array(latent_tags_spacy_))



0.4500332788446801

### Random Tags

In [124]:
random_tags = np.random.randint(0, 2000, len(latent_tags_spacy_))

In [219]:
random_word_dict = {}
for tokens in tqdm(spacy_tokenized_all):
    rdm = np.random.randint(0, 2000, len(tokens))
    for tag, tok in zip(rdm, tokens):
        if(tag not in random_word_dict): random_word_dict[tag] = [tok]
        else: random_word_dict[tag].append(tok)
for k in random_word_dict: random_word_dict[k] = Counter(random_word_dict[k])
random_word_dict_repr = compute_representative_words(random_word_dict)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26121/26121 [00:00<00:00, 29321.51it/s]


In [125]:
v_measure_score(np.array(ent_tags_all_), random_tags)

0.007195408111338528

In [126]:
v_measure_score(np.array(pos_tags_all_), random_tags)

0.007622744195982017

In [127]:
v_measure_score(np.array(pos_fine_tags_all_), random_tags)



0.01914281671838898

## Compute Aligned Latent Tags

### Latent

In [166]:
def align_tags(latent_tags, defined_tags, thres=0.9):
    latent_to_defined = {}
    for l in latent_tags:
        l_occ = 0
        for w in latent_tags[l]: l_occ += latent_tags[l][w]
        for d in defined_tags:
            d_repr_words = set(defined_tags[d].keys())
            overlap = 0
            for w in latent_tags[l]:
                if(w in d_repr_words): overlap += latent_tags[l][w]
            # print(overlap, l_occ, l, d)
            if(overlap / l_occ > thres): 
                latent_to_defined[l] = d
                break
    return latent_to_defined

In [205]:
latent_to_pos = align_tags(latent_word_dict_repr, pos_word_dict_repr)

In [206]:
pos_word_dict_repr.keys()

dict_keys(['PRON', 'AUX', 'ADJ', 'PUNCT', 'DET', 'NOUN', 'ADP', 'CCONJ', 'ADV', 'VERB', 'NUM', 'PART', 'PROPN', 'SCONJ', 'SYM', 'X', 'INTJ'])

In [210]:
latent_to_pos

{335: 'PRON',
 110: 'PUNCT',
 867: 'PUNCT',
 1176: 'PUNCT',
 877: 'PRON',
 667: 'AUX',
 1335: 'PUNCT',
 898: 'ADP',
 1284: 'PRON',
 316: 'CCONJ',
 1309: 'PUNCT',
 271: 'NOUN',
 928: 'AUX',
 343: 'PRON',
 1834: 'PUNCT',
 939: 'VERB',
 843: 'DET',
 1821: 'NOUN',
 81: 'PROPN',
 1575: 'NUM',
 498: 'PUNCT',
 563: 'PUNCT',
 34: 'ADP',
 71: 'DET',
 1929: 'VERB',
 184: 'PUNCT',
 1131: 'PUNCT',
 694: 'PRON',
 1173: 'VERB',
 1852: 'PART',
 1684: 'PUNCT',
 948: 'ADP',
 517: 'ADP',
 1784: 'NOUN',
 1050: 'PUNCT',
 788: 'NOUN',
 1838: 'PUNCT',
 1701: 'PROPN',
 121: 'NOUN',
 1322: 'ADP',
 893: 'PRON',
 1687: 'PUNCT',
 1308: 'PUNCT',
 1105: 'NOUN',
 1717: 'PRON',
 1202: 'DET',
 894: 'PRON',
 734: 'PUNCT',
 1234: 'NOUN',
 1765: 'PUNCT',
 718: 'AUX',
 446: 'PRON',
 670: 'PRON',
 569: 'ADP',
 1785: 'CCONJ',
 310: 'SCONJ',
 1933: 'PRON',
 601: 'AUX',
 1045: 'ADP',
 213: 'PUNCT',
 795: 'PRON',
 1102: 'ADV',
 105: 'NOUN',
 1113: 'PUNCT',
 1223: 'NOUN',
 623: 'NOUN',
 75: 'NOUN',
 1400: 'NOUN',
 228: 'PUNCT'

In [211]:
len(latent_to_pos)

756

In [215]:
latent_to_fine_pos = align_tags(latent_word_dict_repr, pos_fine_word_dict_repr)

In [209]:
latent_to_fine_pos

{335: 'PRP',
 867: ',',
 1176: "''",
 877: 'DT',
 667: 'VBZ',
 1335: "''",
 898: 'IN',
 1284: 'PRP',
 316: 'CC',
 1309: 'NN',
 928: 'VBP',
 343: 'DT',
 1834: 'NN',
 843: 'DT',
 1821: 'NN',
 81: 'NNP',
 1575: 'CD',
 498: ',',
 563: ',',
 34: 'IN',
 71: 'DT',
 184: 'NN',
 1131: '.',
 694: 'PRP',
 1852: 'RB',
 1684: 'NN',
 948: 'IN',
 517: 'IN',
 1784: 'NN',
 1050: 'NN',
 1838: '-LRB-',
 1701: 'NNP',
 121: 'NN',
 1322: 'IN',
 893: 'DT',
 1687: '-LRB-',
 1105: 'IN',
 1717: 'PRP',
 1202: 'DT',
 894: 'DT',
 734: ',',
 718: 'NN',
 446: 'DT',
 670: 'PRP$',
 569: 'IN',
 1785: 'CC',
 310: 'WRB',
 1933: 'DT',
 601: 'VBD',
 1045: 'IN',
 213: ',',
 795: 'DT',
 1102: 'RB',
 105: 'NN',
 1113: 'NN',
 1223: 'NN',
 75: 'NN',
 228: 'NN',
 293: 'NFP',
 1601: 'CC',
 608: 'NNS',
 935: ',',
 132: 'PRP',
 926: 'NNP',
 272: 'DT',
 978: 'IN',
 199: 'RB',
 1870: 'IN',
 1557: 'NN',
 1730: ',',
 1779: 'DT',
 339: 'WDT',
 466: 'DT',
 25: 'IN',
 1760: 'IN',
 463: 'VBZ',
 1441: 'DT',
 42: 'RB',
 229: 'IN',
 380: 'NNP

In [216]:
len(latent_to_fine_pos)

708

In [172]:
latent_to_ent = align_tags(latent_word_dict_repr, ent_word_dict_repr)

In [173]:
len(latent_to_ent)

58

### Random

In [220]:
random_to_pos = align_tags(random_word_dict_repr, pos_word_dict_repr)

In [221]:
len(random_to_pos)

0

In [222]:
random_to_fine_pos = align_tags(random_word_dict_repr, pos_fine_word_dict_repr)

In [223]:
len(random_to_fine_pos)

0

In [224]:
random_to_ent = align_tags(random_word_dict_repr, ent_word_dict_repr)

In [225]:
len(random_to_ent)

0

## Compute Recall

In [179]:
id_to_ent

{0: '',
 397: 'CARDINAL',
 380: 'PERSON',
 383: 'ORG',
 384: 'GPE',
 391: 'DATE',
 392: 'TIME',
 396: 'ORDINAL',
 386: 'PRODUCT',
 381: 'NORP',
 394: 'MONEY',
 387: 'EVENT',
 395: 'QUANTITY',
 9191306739292312949: 'FAC',
 389: 'LANGUAGE',
 393: 'PERCENT',
 385: 'LOC',
 390: 'LAW',
 388: 'WORK_OF_ART'}

In [180]:
id_to_pos

{95: 'PRON',
 87: 'AUX',
 84: 'ADJ',
 97: 'PUNCT',
 90: 'DET',
 92: 'NOUN',
 85: 'ADP',
 89: 'CCONJ',
 86: 'ADV',
 100: 'VERB',
 93: 'NUM',
 94: 'PART',
 96: 'PROPN',
 98: 'SCONJ',
 99: 'SYM',
 101: 'X',
 91: 'INTJ'}

In [182]:
id_to_fine_pos

{13656873538139661788: 'PRP',
 9188597074677201817: 'VBP',
 10554686591937588953: 'JJ',
 2593208677638477497: ',',
 14143520107006108953: "''",
 15267657372422890137: 'DT',
 15308085513773655218: 'NN',
 13927759927860985106: 'VBZ',
 1292078113972184607: 'IN',
 17571114184892886314: 'CC',
 164681854541413346: 'RB',
 12646065887601541794: '.',
 783433942507015291: 'NNS',
 14200088355797579614: 'VB',
 272890857012483650: 'JJR',
 8427216679587749980: 'CD',
 11532473245541075862: ':',
 15794550382381185553: 'NNP',
 17111077179131903759: '-LRB-',
 2465883113906300949: '-RRB-',
 74: 'POS',
 16235386156175103506: 'MD',
 1534113631682161808: 'VBG',
 4062917326063685704: 'PRP$',
 5595707737748328492: 'TO',
 17524233984504158541: 'WRB',
 17109001835818727656: 'VBD',
 3822385049556375858: 'VBN',
 14872845191859177490: 'NFP',
 6860118812490040284: 'RP',
 17202369883303991778: 'WDT',
 15361090031084224697: 'EX',
 99: 'SYM',
 16530679158541427010: 'LS',
 4969857429396651903: '``',
 189557958894700426

In [188]:
ent_to_id = {id_to_ent[i]: i for i in id_to_ent}

In [226]:
pred_ent = 0
recall_ent = 0
prec_ent = 0
for e, l in zip(ent_tags_all_, latent_tags_spacy_):
    if(l in latent_to_ent):
        el = ent_to_id[latent_to_ent[l]]
        prec_ent += 1
        if(el == e and e in id_to_ent and e != 0): 
            pred_ent += 1
    if(e in id_to_ent and e != 0): recall_ent += 1
print('prec', pred_ent / prec_ent)
print('recl', pred_ent / recall_ent)

prec 0.5303521779425394
recl 0.08558609085810431


In [195]:
total_ent

26745

In [227]:
pred_pos = 0
recall_pos = 0
prec_pos = 0
pos_to_id = {id_to_pos[i]: i for i in id_to_pos}
for e, l in zip(pos_tags_all_, latent_tags_spacy_):
    if(l in latent_to_pos):
        el = pos_to_id[latent_to_pos[l]]
        prec_pos += 1
        if(el == e and e in id_to_pos): 
            pred_pos += 1
    if(e in id_to_pos): recall_pos += 1
print('prec', pred_pos / prec_pos)
print('recl', pred_pos / recall_pos)

prec 0.7428899441632312
recl 0.3640937646768876


In [228]:
prec_fine_pos = 0
recl_fine_pos = 0
pred_fine_pos = 0
fine_pos_to_id = {id_to_fine_pos[i]: i for i in id_to_fine_pos}
for e, l in zip(pos_fine_tags_all_, latent_tags_spacy_):
    if(l in latent_to_fine_pos):
        el = fine_pos_to_id[latent_to_fine_pos[l]]
        prec_fine_pos += 1
        if(el == e and e in id_to_fine_pos): 
            pred_fine_pos += 1
    if(e in id_to_fine_pos): recl_fine_pos += 1
print('prec', pred_fine_pos / prec_fine_pos)
print('recl', pred_fine_pos / recl_fine_pos)

prec 0.7198716272964419
recl 0.30091979037335675
