# State-of-the-art NER using the Flair Sequence Tagger

In [None]:
import os
from itertools import groupby
import itertools
import random
import numpy as np
import math

### Data Preparation
1. Join train, dev, and test into a single dataset for k-fold cross-validation
2. Create splits with given sizes

For each partition of the data with `k` samples, we split the dataset into three sets with an `80/10/10` distribution for the train, dev, and test sets, respectively. As a result, the sizes of all three sets vary with `k`; however, this makes more intuitive sense than partitioning only the train set and keeping the dev and test sets at the size they would have at `k_n`.

In [None]:
# Directory the raw split files (train/dev/test .txt) are read from.
data_path = f"{os.getcwd()}/resources/tasks/conll_03/full data/"

In [None]:
# Per-split bookkeeping, keyed by split name:
#   'len'     - number of documents in the original file (this count includes
#               the docstart entries, which Flair removes)
#   'samples' - starts empty; filled later with the indices drawn for the split
splits = {
    name: {'len': n_docs, 'samples': []}
    for name, n_docs in (('train', 14987), ('dev', 3466), ('test', 3684))
}

In [None]:
# Read every split file and cut it into documents; all_data ends up holding
# one list of documents per split, in the iteration order of `splits`.
all_data = []
for split_name in splits:
    print(f'Processing: {split_name}')

    with open(data_path + split_name + '.txt', 'r') as fr:
        lines = list(fr)

    # A line of length 1 (just the newline) carries no tokens and acts as a
    # separator; each run of consecutive non-separator lines is one document.
    docs = []
    for is_separator, chunk in groupby(lines, key=lambda line: len(line) == 1):
        if is_separator:
            continue
        docs.append(list(chunk))

    print(f'Dataset size (docs): {len(docs)}')

    all_data.append(docs)

In [None]:
# Fractions applied to the original train/dev document counts when sampling
# the new splits (the count is multiplied by the fraction and rounded up).
train_split = 0.01
dev_split = 1  # train_split  (NOTE(review): presumably the alternative setting; confirm)

In [None]:
# NOTE: the dev/test sets are currently oversized relative to the small train
# samples used at the beginning of the training splits (e.g. ~3k documents each
# for dev/test versus train_split * train_len documents for train).

# Pool the documents of every split file into one flat list.
pooled_docs = list(itertools.chain.from_iterable(all_data))
dataset_size = len(pooled_docs)

# Documents are lists of differing lengths. Calling np.array() on such a ragged
# list is an error in current NumPy releases (and would yield a 2-D array if
# all lengths happened to match), so fill a 1-D object array element by
# element: one document per slot.
all_data = np.empty(dataset_size, dtype=object)
for doc_idx, doc in enumerate(pooled_docs):
    all_data[doc_idx] = doc

# Indices still available for sampling; shrinks as each split takes its share
# so that no document lands in two splits.
all_indices = np.arange(dataset_size)

# Fraction of each split's original document count to draw:
#   train - train_split of its original size
#   dev   - dev_split of its original size (the original dev set is ~24% of
#           the size of the original training split)
#   test  - the entire original test-set size (~3k documents)
split_fractions = {'train': train_split, 'dev': dev_split, 'test': 1}

for split_name in splits:
    n_samples = math.ceil(splits[split_name]['len'] * split_fractions[split_name])
    # random.sample raises ValueError if fewer than n_samples indices remain.
    # The explicit integer dtype keeps an empty draw usable as an index array.
    sampled_indices = np.array(
        random.sample(all_indices.tolist(), k=n_samples), dtype=np.intp
    )
    splits[split_name]['samples'] = sampled_indices
    # Remove sampled indices from the pool
    all_indices = np.setdiff1d(all_indices, sampled_indices)

In [None]:
# Report how many document indices were drawn for each split.
print('Size of datasets: Train {} Dev {} Test {}'.format(
    *(len(splits[name]['samples']) for name in ('train', 'dev', 'test'))
))

In [None]:
# Show the first sampled document of the train and dev splits as a sanity check.
print(
    'Train Sample\n{}\nDev Sample\n{}'.format(
        *(all_data[splits[name]['samples'][0]] for name in ('train', 'dev'))
    )
)

In [None]:
# Write each sampled split into the task directory so the Corpus reader can
# find it there.
for split_name in splits:
    print(split_name)
    chosen_docs = all_data[splits[split_name]['samples']]
    # Each document is the concatenation of its lines; documents are joined
    # with "\n" (a blank separator line, assuming every stored line still ends
    # with its own newline).
    split_data_str = "\n".join(map("".join, chosen_docs))
    target_path = os.getcwd() + '/resources/tasks/conll_03/' + split_name + '.txt'
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(target_path, 'w') as fw:
        fw.write(split_data_str)

In [None]:
import flair
import torch

# Run on the first GPU when CUDA is usable; otherwise fall back to the CPU so
# the notebook still runs on machines without a GPU.
flair.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

In [None]:
# 1. get the corpus
# Builds the Flair CoNLL-03 corpus rooted at the relative path 'resources/tasks'.
# NOTE(review): presumably this picks up the resampled train/dev/test files
# written under resources/tasks/conll_03/ -- confirm the working directory is
# the same one used when those files were written.
corpus: Corpus = CONLL_03(base_path='resources/tasks')

In [None]:
# 2. what tag do we want to predict?
# The same name is passed to the tag-dictionary builder and to the tagger.
tag_type = 'ner'

In [None]:
# 3. make the tag dictionary from the corpus
# NOTE(review): presumably the set of distinct `tag_type` labels found in the
# corpus; it is handed to the SequenceTagger as its tag_dictionary.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [None]:
# initialize embeddings
# Three embeddings, in this order: GloVe word embeddings, then the pooled
# contextual string embeddings (min pooling) for the forward and the backward
# news language models.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    *(
        PooledFlairEmbeddings(model_name, pooling='min')
        for model_name in ('news-forward', 'news-backward')
    ),
]

In [None]:
# Wrap the list of individual embeddings in a single StackedEmbeddings object,
# which is what the tagger receives.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
# initialize sequence tagger
from flair.models import SequenceTagger

# Tagger over the stacked embeddings, predicting `tag_type` labels drawn from
# `tag_dictionary`, with a hidden size of 256.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
)

In [None]:
# initialize trainer
from flair.trainers import ModelTrainer
from datetime import datetime

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# Passed to trainer.train() as train_with_dev and encoded in the output path.
train_w_dev = False

# For reference, the value noted as being returned by trainer.train():
#     {
#         "test_score": final_score,
#         "dev_score_history": dev_score_history,
#         "train_loss_history": train_loss_history,
#         "dev_loss_history": dev_loss_history,
#     }

# The suffix is selected in its own expression because a conditional
# expression binds more loosely than `+`: writing
# `prefix + 'twd0' if flag else 'twd1'` discards the prefix whenever the flag
# is false, leaving only 'twd1' as the save path.
# NOTE(review): 'twd0' for train_w_dev=True looks inverted -- confirm the
# intended naming before relying on the directory names.
run_suffix = 'twd0' if train_w_dev else 'twd1'
save_path = f'resources/taggers/example-ner-{train_split*100:0.0f} ' + run_suffix
trainer.train(save_path,
              train_with_dev=train_w_dev,
              max_epochs=50)   # 150