In [1]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

from flair.embeddings import WordEmbeddings, StackedEmbeddings

from flair.trainers import ModelTrainer

from flair.models import SequenceTagger

In [2]:
# Column layout of the data files: column index -> annotation layer name.
columns = {0: 'text', 1: 'bio'}

# Directory holding the train, dev and test files.
# NOTE(review): the saved log output of this cell reports '../corpus_bio',
# which differs from this path -- re-run the cell to confirm which is current.
data_folder = '../corpora/corpus_bio/'

# Load the three splits as a single column-formatted corpus.
corpus: Corpus = ColumnCorpus(
    data_folder=data_folder,
    column_format=columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='dev.txt',
)

2020-11-19 16:51:04,370 Reading data from ../corpus_bio
2020-11-19 16:51:04,371 Train: ../corpus_bio/train.txt
2020-11-19 16:51:04,372 Dev: ../corpus_bio/dev.txt
2020-11-19 16:51:04,373 Test: ../corpus_bio/test.txt


In [3]:
# Spot-check one training sentence and one test sentence, each rendered
# with its 'bio' tags inline (one sentence per output line).
print(
    corpus.train[300].to_tagged_string('bio'),
    corpus.test[700].to_tagged_string('bio'),
    sep='\n',
)

For example , although breaststroke <B> does not encourage uneven <B> development <I> in terms of the demands of the activity , side <B> dominance <I> causes asymmetries <B> in the leg <B> kicks <I> of most breaststroke <B> swimmers <I> .
As shown in table 1 , the difference in swimming <B> speed <I> is so great that we can say , that there is an actual difference in swimming <B> speed <I> at different fixed points during the race .


In [4]:
# Annotation layer the tagger is trained to predict.
tag_type = 'bio'

# Build the tag dictionary for that layer from the corpus and print it.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# Embeddings handed to the tagger: a stack that currently contains only
# the 'glove' word embeddings.
embedding_types = [WordEmbeddings('glove')]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# Sequence tagger over the stacked embeddings, with the CRF option enabled.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# Trainer pairing the tagger with the corpus.
trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus)

# Run training, writing under the given base path. Kept as the last
# expression of the cell so the value returned by train() is displayed.
trainer.train(
    base_path='../resources/taggers/bio-term-tagger',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=40,
)


2020-11-19 17:37:33,778 epoch 28 - iter 33/334 - loss 2.47325195 - samples/sec: 119.30 - lr: 0.050000
2020-11-19 17:37:41,516 epoch 28 - iter 66/334 - loss 2.32147908 - samples/sec: 136.53 - lr: 0.050000
2020-11-19 17:37:49,705 epoch 28 - iter 99/334 - loss 2.27808187 - samples/sec: 128.99 - lr: 0.050000
2020-11-19 17:37:58,911 epoch 28 - iter 132/334 - loss 2.24194672 - samples/sec: 114.75 - lr: 0.050000
2020-11-19 17:38:07,113 epoch 28 - iter 165/334 - loss 2.24534475 - samples/sec: 128.82 - lr: 0.050000
2020-11-19 17:38:14,754 epoch 28 - iter 198/334 - loss 2.22720077 - samples/sec: 138.26 - lr: 0.050000
2020-11-19 17:38:24,049 epoch 28 - iter 231/334 - loss 2.22305516 - samples/sec: 113.64 - lr: 0.050000
2020-11-19 17:38:35,258 epoch 28 - iter 264/334 - loss 2.20503764 - samples/sec: 94.24 - lr: 0.050000
2020-11-19 17:38:45,952 epoch 28 - iter 297/334 - loss 2.20034387 - samples/sec: 98.77 - lr: 0.050000
2020-11-19 17:38:55,551 epoch 28 - iter 330/334 - loss 2.20274915 - samples/s

{'test_score': 0.9385,
 'dev_score_history': [0.9008,
  0.9085,
  0.9075,
  0.902,
  0.911,
  0.9151,
  0.9152,
  0.9137,
  0.9181,
  0.9225,
  0.9127,
  0.9209,
  0.9244,
  0.9264,
  0.906,
  0.9294,
  0.93,
  0.925,
  0.9314,
  0.9304,
  0.9278,
  0.9348,
  0.9314,
  0.9275,
  0.9342,
  0.9275,
  0.9324,
  0.9382,
  0.9369,
  0.9387,
  0.9364,
  0.9361,
  0.9397,
  0.9359,
  0.9367,
  0.9386,
  0.938,
  0.9405,
  0.9412,
  0.9403],
 'train_loss_history': [4.625888617809661,
  3.5387630983740985,
  3.3446826060375052,
  3.23317875440963,
  3.132369006822209,
  3.0197275045389187,
  2.9675051152349234,
  2.911870054142204,
  2.8589870612064523,
  2.791200451508254,
  2.754865490747783,
  2.745086188801748,
  2.697302406954908,
  2.6351195518485087,
  2.627483014931936,
  2.5645679976501152,
  2.534088355695416,
  2.501499856660466,
  2.501496576263519,
  2.4707000865907727,
  2.438611029150957,
  2.4017514341962554,
  2.3867191825798173,
  2.3732663874854585,
  2.365145576571276,
  2.3