In [1]:
!pip install flair



# Best Configurations per Dataset

## I. CoNLL-03 Named Entity Recognition (English)

### a. Data

In [0]:
import os

os.makedirs("/content/conll_03")

In [0]:
## download from
## https://github.com/patverga/torch-ner-nlp-from-scratch/tree/master/data/conll2003

from google.colab import files

uploaded = files.upload()

In [0]:
import shutil

shutil.move("eng.testa", "conll_03")
shutil.move("eng.testb", "conll_03")
shutil.move("eng.train", "conll_03")

In [4]:
from flair.datasets import CONLL_03
from flair.data import Corpus

corpus: Corpus = CONLL_03(base_path = "/content")

2019-12-22 16:00:31,871 Reading data from /content/conll_03
2019-12-22 16:00:31,872 Train: /content/conll_03/eng.train
2019-12-22 16:00:31,873 Dev: /content/conll_03/eng.testa
2019-12-22 16:00:31,874 Test: /content/conll_03/eng.testb


### b. Best Known Configuration

In [0]:
from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.embeddings import TokenEmbeddings, WordEmbeddings
from flair.embeddings import StackedEmbeddings, PooledFlairEmbeddings
from typing import List

In [6]:
# 1. get the corpus

corpus: Corpus = CONLL_03(base_path = "/content").downsample(0.1)

2019-12-22 16:01:02,048 Reading data from /content/conll_03
2019-12-22 16:01:02,050 Train: /content/conll_03/eng.train
2019-12-22 16:01:02,050 Dev: /content/conll_03/eng.testa
2019-12-22 16:01:02,051 Test: /content/conll_03/eng.testb


In [0]:
# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type = tag_type)

In [8]:
# 4. initialize embeddings

embedding_types: List[TokenEmbeddings] = [
              # GloVe embeddings                            
              WordEmbeddings('glove'),

              # contextual string embeddings, forward
              PooledFlairEmbeddings('news-forward', pooling = 'min'),

              # contextual string embeddings, backward
              PooledFlairEmbeddings('news-backward', pooling = 'min'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings = embedding_types)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# 5. initialize sequence tagger

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size = 256,
                                        embeddings = embeddings,
                                        tag_dictionary = tag_dictionary,
                                        tag_type = tag_type)

In [0]:
# 6. initialize trainer

from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('content/conll_03/example-ner',
              train_with_dev = True,
              max_epochs = 150)

## II. CoNLL-03 Named Entity Recognition (German)

### a. Data

In [0]:
import os

os.makedirs("/content/conll_03_german")

In [18]:
## download from
## https://github.com/MaviccPRP/ger_ner_evals/tree/master/corpora/conll2003

from google.colab import files
uploaded = files.upload()

Saving deu.testa to deu.testa
Saving deu.testb to deu.testb
Saving deu.train to deu.train


In [20]:
import shutil

shutil.move("deu.testa", "conll_03_german")
shutil.move("deu.testb", "conll_03_german")
shutil.move("deu.train", "conll_03_german")

'conll_03_german/deu.train'

### b. Best Known Configuration

In [0]:
from flair.data import Corpus
from flair.datasets import CONLL_03_GERMAN
from flair.embeddings import TokenEmbeddings, WordEmbeddings
from flair.embeddings import StackedEmbeddings, PooledFlairEmbeddings
from typing import List

In [0]:
# 1. get the corpus

corpus: Corpus = CONLL_03_GERMAN(base_path = '/content')

In [0]:
# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type = tag_type)

In [0]:
# 4. initialize embeddings

embedding_types: List[TokenEmbeddings] = [
          WordEmbeddings('de'),
          PooledFlairEmbeddings('german-forward'),
          PooledFlairEmbeddings('german-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings = embedding_types)

In [0]:
# 5. initialize sequence tagger

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size = 256,
                                        embeddings = embeddings,
                                        tag_dictionary = tag_dictionary,
                                        tag_type = tag_type)

In [0]:
# 6. initialize trainer

from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('content/conll_03_german/example-ner',
              train_with_dev = True,
              max_epochs = 150)

## III. CoNLL-03 Named Entity Recognition (Dutch)

### a. Data

Data is included in Flair and will get automatically downloaded when you run the script

### b. Best Known Configuration

In [0]:
from flair.data import Corpus
from flair.datasets import CONLL_03_DUTCH
from flair.embeddings import TokenEmbeddings, WordEmbeddings
from flair.embeddings import StackedEmbeddings, PooledFlairEmbeddings
from typing import List

In [28]:
# 1. get the corpus

corpus: Corpus = CONLL_03_DUTCH()

2019-12-22 17:10:26,936 https://www.clips.uantwerpen.be/conll2002/ner/data/ned.testa not found in cache, downloading to /tmp/tmpyoofjmp6


100%|██████████| 450512/450512 [00:00<00:00, 4927734.87B/s]

2019-12-22 17:10:27,120 copying /tmp/tmpyoofjmp6 to cache at /root/.flair/datasets/conll_03_dutch/ned.testa
2019-12-22 17:10:27,122 removing temp file /tmp/tmpyoofjmp6
2019-12-22 17:10:27,215 https://www.clips.uantwerpen.be/conll2002/ner/data/ned.testb not found in cache, downloading to /tmp/tmp021xys8a



100%|██████████| 813815/813815 [00:00<00:00, 6698038.72B/s]

2019-12-22 17:10:27,431 copying /tmp/tmp021xys8a to cache at /root/.flair/datasets/conll_03_dutch/ned.testb
2019-12-22 17:10:27,437 removing temp file /tmp/tmp021xys8a





2019-12-22 17:10:27,530 https://www.clips.uantwerpen.be/conll2002/ner/data/ned.train not found in cache, downloading to /tmp/tmphe68e9r8


100%|██████████| 2375449/2375449 [00:00<00:00, 9154828.27B/s]

2019-12-22 17:10:27,885 copying /tmp/tmphe68e9r8 to cache at /root/.flair/datasets/conll_03_dutch/ned.train
2019-12-22 17:10:27,889 removing temp file /tmp/tmphe68e9r8
2019-12-22 17:10:27,893 Reading data from /root/.flair/datasets/conll_03_dutch
2019-12-22 17:10:27,894 Train: /root/.flair/datasets/conll_03_dutch/ned.train
2019-12-22 17:10:27,895 Dev: /root/.flair/datasets/conll_03_dutch/ned.testa
2019-12-22 17:10:27,896 Test: /root/.flair/datasets/conll_03_dutch/ned.testb





In [0]:
# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type = tag_type)

In [31]:
# 4. initialize embeddings

embedding_types: List[TokenEmbeddings] = [
              WordEmbeddings('nl'),
              PooledFlairEmbeddings('dutch-forward', pooling = 'mean'),
              PooledFlairEmbeddings('dutch-backward', pooling = 'mean'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings = embedding_types)

2019-12-22 17:16:45,406 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-stefan-it/lm-nl-opus-large-forward-v0.1.pt not found in cache, downloading to /tmp/tmpa3eh9ags


100%|██████████| 136162055/136162055 [00:02<00:00, 62961449.76B/s]

2019-12-22 17:16:47,746 copying /tmp/tmpa3eh9ags to cache at /root/.flair/embeddings/lm-nl-opus-large-forward-v0.1.pt





2019-12-22 17:16:47,936 removing temp file /tmp/tmpa3eh9ags
2019-12-22 17:16:48,558 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-stefan-it/lm-nl-opus-large-backward-v0.1.pt not found in cache, downloading to /tmp/tmpk0q1d6qn


100%|██████████| 136162055/136162055 [00:02<00:00, 59715492.02B/s]

2019-12-22 17:16:51,016 copying /tmp/tmpk0q1d6qn to cache at /root/.flair/embeddings/lm-nl-opus-large-backward-v0.1.pt





2019-12-22 17:16:51,263 removing temp file /tmp/tmpk0q1d6qn


In [0]:
# 5. initialize sequence tagger

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size = 256,
                                        embeddings = embeddings,
                                        tag_dictionary = tag_dictionary,
                                        tag_type = tag_type)

In [33]:
# 6. initialize trainer

from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
              train_with_dev = True,
              max_epochs = 150)

2019-12-22 17:20:11,535 ----------------------------------------------------------------------------------------------------
2019-12-22 17:20:11,537 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('nl')
    (list_embedding_1): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.1, inplace=False)
          (encoder): Embedding(7632, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=7632, bias=True)
        )
      )
    )
    (list_embedding_2): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.1, inplace=False)
          (encoder): Embedding(7632, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=7632, bias=True)
        )
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout

RuntimeError: ignored

## IV. WNUT-17 Emerging Entity Detection (English)

### a. Data

Data is included in Flair and will get automatically downloaded when you run the script.

### b. Best Known Configuration

In [0]:
from flair.data import Corpus
from flair.datasets import WNUT_17
from flair.embeddings import TokenEmbeddings, WordEmbeddings
from flair.embeddings import StackedEmbeddings, FlairEmbeddings
from typing import List

In [35]:
# 1. get the corpus

corpus: Corpus = WNUT_17()

2019-12-22 17:23:20,155 https://noisy-text.github.io/2017/files/wnut17train.conll not found in cache, downloading to /tmp/tmpeliykho8


100%|██████████| 493781/493781 [00:00<00:00, 23155426.12B/s]

2019-12-22 17:23:20,214 copying /tmp/tmpeliykho8 to cache at /root/.flair/datasets/wnut_17/wnut17train.conll
2019-12-22 17:23:20,216 removing temp file /tmp/tmpeliykho8





2019-12-22 17:23:20,508 https://noisy-text.github.io/2017/files/emerging.dev.conll not found in cache, downloading to /tmp/tmpi8bjtts8


100%|██████████| 114752/114752 [00:00<00:00, 10843861.05B/s]

2019-12-22 17:23:20,550 copying /tmp/tmpi8bjtts8 to cache at /root/.flair/datasets/wnut_17/emerging.dev.conll
2019-12-22 17:23:20,553 removing temp file /tmp/tmpi8bjtts8





2019-12-22 17:23:20,849 https://noisy-text.github.io/2017/files/emerging.test.annotated not found in cache, downloading to /tmp/tmp_ytp13a7


100%|██████████| 192425/192425 [00:00<00:00, 9813586.09B/s]

2019-12-22 17:23:20,899 copying /tmp/tmp_ytp13a7 to cache at /root/.flair/datasets/wnut_17/emerging.test.annotated
2019-12-22 17:23:20,900 removing temp file /tmp/tmp_ytp13a7
2019-12-22 17:23:20,902 Reading data from /root/.flair/datasets/wnut_17
2019-12-22 17:23:20,903 Train: /root/.flair/datasets/wnut_17/wnut17train.conll
2019-12-22 17:23:20,904 Dev: /root/.flair/datasets/wnut_17/emerging.dev.conll
2019-12-22 17:23:20,906 Test: /root/.flair/datasets/wnut_17/emerging.test.annotated





In [0]:
# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type = tag_type)

In [37]:
# 4. initialize embeddings

embedding_types: List[TokenEmbeddings] = [
             WordEmbeddings('crawl'),
             WordEmbeddings('twitter'),
             FlairEmbeddings('news-forward'),
             FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings = embedding_types)

2019-12-22 17:25:45,246 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/en-fasttext-crawl-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpjpvybuj7


100%|██████████| 1200000128/1200000128 [00:15<00:00, 78882270.97B/s]

2019-12-22 17:26:00,646 copying /tmp/tmpjpvybuj7 to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M.vectors.npy





2019-12-22 17:26:16,114 removing temp file /tmp/tmpjpvybuj7
2019-12-22 17:26:19,926 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/en-fasttext-crawl-300d-1M not found in cache, downloading to /tmp/tmperc51pct


100%|██████████| 39323680/39323680 [00:00<00:00, 71242883.54B/s]

2019-12-22 17:26:20,633 copying /tmp/tmperc51pct to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M
2019-12-22 17:26:20,671 removing temp file /tmp/tmperc51pct





2019-12-22 17:26:27,182 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/twitter.gensim.vectors.npy not found in cache, downloading to /tmp/tmp0295wbnj


100%|██████████| 477405728/477405728 [00:06<00:00, 78672191.36B/s]

2019-12-22 17:26:33,440 copying /tmp/tmp0295wbnj to cache at /root/.flair/embeddings/twitter.gensim.vectors.npy





2019-12-22 17:26:42,817 removing temp file /tmp/tmp0295wbnj
2019-12-22 17:26:43,370 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/twitter.gensim not found in cache, downloading to /tmp/tmp450p_mpp


100%|██████████| 68268001/68268001 [00:01<00:00, 38180310.30B/s]

2019-12-22 17:26:45,330 copying /tmp/tmp450p_mpp to cache at /root/.flair/embeddings/twitter.gensim





2019-12-22 17:26:46,375 removing temp file /tmp/tmp450p_mpp


In [0]:
# 5. initialize sequence tagger

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size = 256,
                                        embeddings = embeddings,
                                        tag_dictionary = tag_dictionary,
                                        tag_type = tag_type)

In [0]:
# 6. initialize trainer

from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
              train_with_dev = True,
              max_epochs = 150)

2019-12-22 17:28:44,942 ----------------------------------------------------------------------------------------------------
2019-12-22 17:28:44,944 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('crawl')
    (list_embedding_1): WordEmbeddings('twitter')
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_3): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4496, out_features=4496, bias=

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
