In [1]:
# Directory that holds the three dataset splits; change it here to relocate
# all of the files at once.
DATA_DIR = "data"

# Train / development / test splits (".conll" files) located under DATA_DIR.
TRAIN_PATH = DATA_DIR + "/en_ewt_nn_train.conll"
DEV_PATH = DATA_DIR + "/en_ewt_nn_dev.conll"
TEST_PATH = DATA_DIR + "/en_ewt_nn_test.conll"

In [3]:
import codecs

def read_conll_file(file_name):
    """
    Read a tab-separated CoNLL file sentence by sentence.

    Every non-empty line carries a word in its first column and that word's
    tag in its second column; further columns are ignored. Blank lines
    separate sentences. Lines whose first character is '#' are skipped as
    comments.

    :param file_name: path to read from
    :yields: a ``(words, tags)`` tuple of two equally long lists for each
        sentence, including a last sentence that is not followed by a
        blank line
    :raises IndexError: if a non-comment line has no second tab-separated column
    """
    current_words = []
    current_tags = []

    # The context manager closes the file once the generator is exhausted
    # or closed, instead of leaving the handle to the garbage collector.
    with codecs.open(file_name, encoding='UTF-8') as conll_file:
        for line in conll_file:
            line = line.strip()

            if not line:
                # A blank line ends the current sentence; consecutive blank
                # lines must not produce empty sentences.
                if current_words:
                    yield (current_words, current_tags)
                current_words = []
                current_tags = []
                continue

            # NOTE: this drops every line starting with '#', which also
            # covers token lines whose word begins with '#'.
            if line[0] == '#':
                continue

            tok = line.split('\t')
            current_words.append(tok[0])
            current_tags.append(tok[1])

    # The file may end without a trailing blank line; emit the sentence that
    # is still buffered in that case.
    if current_words:
        yield (current_words, current_tags)

In [9]:
# Read the training split once, then separate the per-sentence word lists
# from their label lists; both lists stay index-aligned.
train_sentences = list(read_conll_file(TRAIN_PATH))
documents = [sentence_words for sentence_words, _ in train_sentences]
doc_labels = [sentence_labels for _, sentence_labels in train_sentences]

In [17]:
# Show the first ten sentences next to their label sequences.
for doc, labels in zip(documents[:10], doc_labels[:10]):
    # One call prints the header, the words and the second header, each on
    # its own line.
    print("Document:", doc, "\nMatching labels:", sep="\n")
    print(labels, '\n\n')

Document:
['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.']

Matching labels:
['B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-LOCderiv', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'B-LOCderiv', 'O', 'O'] 


Document:
['[', 'This', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']']

Matching labels:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 


Document:
['DPA', ':', 'Iraqi', 'authorities', 'announced', 'that', 'they', 'had', 'busted', 'up', '3', 'terrorist', 'cells', 'operating', 'in', 'Baghdad', '.']

Matching labels:
['B-ORG', 'O', 'B-LOCderiv', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'] 




In [18]:
from typing import List, Dict, Any

def create_vocabulary(documents: List[List[str]], pad_token: str) -> Dict[str, int]:
    """
    Map every distinct item in ``documents`` to an integer index.

    :param documents: list of sequences (e.g. sentences) of string items
    :param pad_token: item that is always assigned index 0
    :returns: dict from item to index; indices after 0 are handed out in
        order of first appearance
    """
    item_to_index = {pad_token: 0}
    for sequence in documents:
        for item in sequence:
            # Only unseen items receive a new index; the next free index
            # equals the current size of the mapping.
            if item not in item_to_index:
                item_to_index[item] = len(item_to_index)

    return item_to_index

def reverse_dict(collection: Dict[Any, Any]) -> Dict[Any, Any]:
    """
    Build the inverse mapping of ``collection``.

    :param collection: dict whose values are hashable
    :returns: new dict mapping each value back to its key; if several keys
        share a value, the key that comes last in iteration order wins
    """
    return {value: key for key, value in collection.items()}

In [19]:
# Padding symbol; create_vocabulary assigns it index 0 in both vocabularies.
PAD = '<PAD>'

# Forward mappings (item -> index) for words and for labels.
word2idx = create_vocabulary(documents, PAD)
label2idx = create_vocabulary(doc_labels, PAD)

# Inverse mappings (index -> item), used to decode indices back to strings.
idx2word = reverse_dict(word2idx)
idx2label = reverse_dict(label2idx)

# Each forward map and its inverse should report the same size.
print("word2idx len:", len(word2idx))
print("idx2word len:", len(idx2word))
print("label2idx len:", len(label2idx))
print("idx2label len:", len(idx2label))

word2idx len: 19670
idx2word len: 19670
label2idx len: 24
idx2label len: 24
