# State of the art NER using Flair Sequence Tagger
See SoTA Experimental setups: https://github.com/flairNLP/flair/blob/master/resources/docs/EXPERIMENTS.md

In [1]:
import os
from itertools import groupby
import itertools
import random
import numpy as np
import math
import json

In [2]:
import flair, torch
flair.device = torch.device('cuda:0')

In [3]:
from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List
from flair.models import SequenceTagger

from flair.trainers import ModelTrainer
from datetime import datetime

### Data Preparation
1. Join train, dev, test into single dataset for k-fold cross-validation
2. Create splits with given sizes

For each partition of data with `k` samples, we split the dataset into three sets with the following distribution: `80/10/10` for the train, dev, and test set, respectively. Therefore, for each `k` there will be varying sizes of all three sets of data, however this makes more intuitive sense than only partitioning the test set and keeping the dev and test sets the same size as if they were at `k_n`.

`DOCTSTART` has been removed from these documents as it's not required for model learning

In [4]:
def run_seqtag_model(train_split: float, dev_split: float = 1.0, train_w_dev: bool = False, max_epochs: int = 50, runs: int = 5):
    
    for run in range(1, runs+1,1):
        
        # run save path is used to save run data and results
        run_save_path = os.getcwd() + f'/runs/{str(int(train_split*100))}/{run}/'
        
        
        # Prepare data and write to disk
        data_path = os.getcwd() + '/resources/tasks/conll_03/full data/'

        # Name: Number of documents
        # Sample counts DO NOT include DOCSTARTs 
        splits = {'train': {'len': 14041, 'samples': []}, 'dev': {'len': 3250, 'samples': []}, 'test': {'len': 3453, 'samples': []}}
        
        all_data = []
        for split_name in splits:
            print(f'Processing: {split_name}')

            with open(data_path + split_name + '.txt', 'r') as fr:
                lines = fr.readlines()
            
            # Remove doc separator
            # TODO: make more abstract, DOCSTART in CONLL2003 specific
            lines = [line for line in lines if '-DOCSTART-' not in line]
            
            # Groups documents based on position of new line separators (lines with no tokens)
            docs = [list(group) for k, group in groupby(lines, lambda x: len(x) == 1) if not k]
            
            print(f'Dataset size (docs): {len(docs)}')

            all_data.append(docs)

        # Currently using oversized dev/test sets relative to small sample sizes at the beginning of training splits (e.g. 3k for test/dev and %*train_len)
        all_data = np.array(list(itertools.chain(*all_data)))
        dataset_size = len(all_data)
        all_indices = set(np.arange(dataset_size))

        for split_name in splits:
            if split_name == 'test':
                # Use entire 3k of test samples
                sampled_indices = np.array(random.sample(list(all_indices), k=splits[split_name]['len']))

            if split_name == 'dev':
                # Split dev set with the same proportion of it's original dataset ~24% of the training split
                sampled_indices = np.array(random.sample(list(all_indices), k=math.ceil(splits[split_name]['len']*dev_split)))

            if split_name == 'train':
                sampled_indices = np.array(random.sample(list(all_indices), k=math.ceil(splits[split_name]['len']*train_split)))
            splits[split_name]['samples'] = sampled_indices
            # Remove sampled indices from dataset
            all_indices = np.setdiff1d(list(all_indices), sampled_indices)

        print(f'Size of datasets: Train {len(splits["train"]["samples"])} Dev {len(splits["dev"]["samples"])} Test {len(splits["test"]["samples"])}')
        print(f'Train Sample\n{all_data[splits["train"]["samples"][0]]}\nDev Sample\n{all_data[splits["dev"]["samples"][0]]}')
        
        try:
            if os.path.exists(os.path.join(run_save_path, 'conll_03')):
                pass
            else:
                os.mkdir(os.path.join(run_save_path, 'conll_03'))
        except:
            print('Failure to make conll_03 folder')
        
        # Save splits back into directory for Corpus reader to find
        for split_name in splits:
            print(split_name)
            split_data_str = "\n".join(["".join(doc) for doc in all_data[splits[split_name]['samples']]])
            with open(os.getcwd() + '/resources/tasks/conll_03/' + split_name + '.txt', 'w') as fw:
                fw.write(split_data_str)
                fw.close()
            
            # Save a copy to run folder
            with open(run_save_path + 'conll_03/' + split_name + '.txt', 'w') as fw:
                fw.write(split_data_str)
                fw.close()

        # Flair Model Initialisation and Training
        # 1. get the corpus
        corpus: Corpus = CONLL_03(base_path=run_save_path)
        # 2. what tag do we want to predict?
        tag_type = 'ner'
        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        
        print(tag_dictionary)

        # initialize embeddings
        embedding_types: List[TokenEmbeddings] = [

            # GloVe embeddings
            WordEmbeddings('glove'),

            # contextual string embeddings, forward
            PooledFlairEmbeddings('news-forward', pooling='min'),

            # contextual string embeddings, backward
            PooledFlairEmbeddings('news-backward', pooling='min'),
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

        # initialize sequence tagger
        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type)

        # initialize trainer
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        save_path = f'resources/taggers/example-ner-{train_split*100:0.0f} ' + 'twd0' if train_w_dev else 'twd1'
        results = trainer.train(save_path,
                                train_with_dev=train_w_dev,
                                max_epochs=max_epochs)   # 150
        
        # Save results to run file
        with open(run_save_path + 'results.json', 'w') as fwj:
            json.dump(results, fwj)
    
    return results

In [None]:
splits_to_run = [0.1]
[run_seqtag_model(train_split=split, max_epochs=50, runs=5) for split in splits_to_run]

Processing: train
Dataset size (docs): 14041
Processing: dev
Dataset size (docs): 3250
Processing: test
Dataset size (docs): 3453
Size of datasets: Train 1405 Dev 3250 Test 3453
Train Sample
['Total JJ B-NP O\n', '( ( O O\n', 'for IN B-PP O\n', 'one CD B-NP O\n', 'wicket NN I-NP O\n', ') ) O O\n', '48 CD B-NP O\n']
Dev Sample
['Attendence NN B-NP O\n', ': : O O\n', '200 CD B-NP O\n']
train
dev
test
2020-12-08 05:52:40,720 Reading data from /home/tyler/Desktop/Repos/s-vaal/dev/Flair/runs/10/1/conll_03
2020-12-08 05:52:40,721 Train: /home/tyler/Desktop/Repos/s-vaal/dev/Flair/runs/10/1/conll_03/train.txt
2020-12-08 05:52:40,722 Dev: /home/tyler/Desktop/Repos/s-vaal/dev/Flair/runs/10/1/conll_03/dev.txt
2020-12-08 05:52:40,722 Test: /home/tyler/Desktop/Repos/s-vaal/dev/Flair/runs/10/1/conll_03/test.txt


  all_data = np.array(list(itertools.chain(*all_data)))


Dictionary with 20 tags: <unk>, O, S-LOC, S-ORG, B-PER, E-PER, B-ORG, E-ORG, S-MISC, B-MISC, E-MISC, I-ORG, B-LOC, E-LOC, S-PER, I-MISC, I-LOC, I-PER, <START>, <STOP>
2020-12-08 05:52:50,348 ----------------------------------------------------------------------------------------------------
2020-12-08 05:52:50,348 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (list_embedding_2): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(1

  all_data = np.array(list(itertools.chain(*all_data)))


Dictionary with 20 tags: <unk>, O, S-ORG, S-LOC, B-PER, E-PER, B-MISC, E-MISC, B-ORG, E-ORG, I-MISC, S-MISC, I-ORG, B-LOC, E-LOC, S-PER, I-LOC, I-PER, <START>, <STOP>
2020-12-08 06:19:49,981 ----------------------------------------------------------------------------------------------------
2020-12-08 06:19:49,998 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (list_embedding_2): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(1