# Random-Tag V-Measure Baseline Performance

In [2]:
import spacy
import codecs
import truecase

import numpy as np 

from tokenizations import get_alignments
from transformers import BertTokenizer
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import v_measure_score

In [3]:
def _strip_line_ending(line):
    """Return ``line`` without its trailing line terminator, if it has one."""
    # ``readlines()`` keeps the terminator on every line except, possibly, the
    # last one.  ``splitlines()`` removes exactly that terminator (including a
    # two-character "\r\n") and leaves an unterminated line untouched, whereas
    # a blind ``line[:-1]`` would chop a real character off such a line.
    parts = line.splitlines()
    return parts[0] if parts else line


def read_data(data_path='../../data/news/'):
    """Read the test split of the 20news data.

    Reads every line of ``20news.txt`` under ``data_path`` and keeps the lines
    whose positions are listed in ``test_idx.npy`` (same directory).

    Args:
        data_path: Directory containing ``20news.txt`` and ``test_idx.npy``.
            A trailing path separator is optional.

    Returns:
        A list of strings, one per index in ``test_idx.npy`` and in that
        order, each with its line terminator removed.
    """
    import os  # local import: only this function needs it

    # use the cased data for NER, otherwise spacy does not work with uncased
    text_path = os.path.join(data_path, '20news.txt')
    with codecs.open(text_path, encoding='utf-8') as fd:
        data = fd.readlines()

    # Despite the historical "train" naming, these are the *test* indices.
    test_idx = np.load(os.path.join(data_path, 'test_idx.npy'))
    return [_strip_line_ending(data[i]) for i in test_idx]

In [4]:
# Load the test-split texts once (default data directory); the tagging cell
# below iterates over this list.
test_data = read_data()

In [8]:
# Only POS tags and entity types are read from the docs, so the dependency
# parser is switched off.  ``disable`` is given as a list of pipe names: a
# bare string is not accepted as a single name by every spaCy version.
nlp = spacy.load("en_core_web_sm", disable=["parser"])

In [97]:
# Per-token annotations, flattened over all documents.  The three token-level
# lists are appended to in lockstep, so index i refers to the same token in
# each of them.
pos_tags = []         # token.pos: integer ID of the POS tag
ent_tags = []         # token.ent_type: integer ID of the entity type
                      # (0 presumably means "not part of an entity" - confirm)
spacy_tokenized = []  # token.text
ent_dict = {}         # entity label string -> list of entity texts seen

# NOTE: truecasing each text first (truecase.get_true_case(s)) is a disabled
# alternative; the raw cased text is fed to spaCy as-is.
# nlp.pipe processes the texts in batches, which spaCy documents as more
# efficient than calling nlp(s) once per text; tqdm needs ``total`` because
# the pipe is a generator with no length.
for doc in tqdm(nlp.pipe(test_data), total=len(test_data)):
    for token in doc:
        spacy_tokenized.append(token.text)
        pos_tags.append(token.pos)
        ent_tags.append(token.ent_type)

    # Group entity surface strings by their label.
    for ent in doc.ents:
        ent_dict.setdefault(ent.label_, []).append(ent.text)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52285/52285 [05:29<00:00, 158.50it/s]


In [76]:
# Frequency of each POS tag over all tagged tokens.  Keys are the integer IDs
# collected from token.pos, not the human-readable tag names.
Counter(pos_tags)

Counter({93: 189,
         97: 1168,
         95: 658,
         100: 810,
         90: 593,
         92: 1379,
         85: 678,
         84: 496,
         89: 184,
         96: 461,
         87: 515,
         86: 299,
         94: 224,
         99: 31,
         98: 197,
         101: 34,
         91: 34})

In [77]:
# Frequency of each entity type over all tagged tokens.  Keys are the integer
# IDs collected from token.ent_type, not the entity label strings.
Counter(ent_tags)

Counter({397: 112,
         0: 7438,
         381: 20,
         383: 101,
         391: 102,
         380: 72,
         385: 5,
         395: 16,
         393: 7,
         384: 32,
         396: 8,
         394: 18,
         386: 6,
         390: 5,
         392: 8})

## Random POS Tag, V-Measure Score

In [98]:
# Number of distinct random labels to draw from in the baseline below
# (used as the exclusive upper bound of np.random.randint).
N = 2000

In [99]:
# Random baseline: give every token a label drawn uniformly from [0, N) and
# score that random labelling against the spaCy POS tag IDs.
# NOTE(review): arguments are passed as (random, gold); sklearn documents
# v_measure_score as symmetric, so the order should not matter - confirm.
random_pos_tags = np.random.randint(low=0, high=N, size=len(pos_tags))
v_measure_score(random_pos_tags, np.asarray(pos_tags))

0.0038096341051679153

In [100]:
# Repeat the random-label baseline with a larger label set
# (N is the exclusive upper bound passed to np.random.randint).
N = 10000

In [101]:
# Same random baseline as before, using the current value of N: one label per
# token drawn uniformly from [0, N), scored against the POS tag IDs.
random_pos_tags = np.random.randint(low=0, high=N, size=len(pos_tags))
v_measure_score(random_pos_tags, np.asarray(pos_tags))

0.016255415736912696

## Random Entity Type, V-Measure Score

In [102]:
# Random baseline for entity types: one label per token drawn uniformly from
# [0, N), scored against the entity-type IDs collected from spaCy.
N = 2000
random_ent_tags = np.random.randint(low=0, high=N, size=len(ent_tags))
v_measure_score(random_ent_tags, np.asarray(ent_tags))

0.0041198939587344255

In [103]:
# Entity-type random baseline again, with a larger label set: labels are
# drawn uniformly from [0, N) and scored against the entity-type IDs.
N = 10000
random_ent_tags = np.random.randint(low=0, high=N, size=len(ent_tags))
v_measure_score(random_ent_tags, np.asarray(ent_tags))

0.011516627813481279