# Tutorial 9: Training your own Flair Embeddings

## A. Preparing a Text Corpus

In [0]:
# corpus/
# corpus/train/
# corpus/train/train_split_1
# corpus/train/train_split_2
# corpus/train/...
# corpus/train/train_split_X
# corpus/test.txt
# corpus/valid.txt

## B. Training the Language Model

In [3]:
!pip install flair



In [4]:
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer
from flair.trainers.language_model_trainer import TextCorpus

In [5]:
# character dictionary: start from flair's default one ('chars')
dictionary: Dictionary = Dictionary.load('chars')

# direction of the language model: True = forward, False = backward
is_forward_lm = True

2019-12-20 18:07:52,687 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/common_characters not found in cache, downloading to /tmp/tmp57zc9qk0


100%|██████████| 2887/2887 [00:00<00:00, 1369326.66B/s]

2019-12-20 18:07:53,347 copying /tmp/tmp57zc9qk0 to cache at /root/.flair/datasets/common_characters
2019-12-20 18:07:53,347 removing temp file /tmp/tmp57zc9qk0





In [0]:
# get your corpus, process forward and at the character level
# corpus = TextCorpus('/path/to/your/corpus',
#                     dictionary,
#                     is_forward_lm,
#                     character_level = True)

# instantiate your language model and set its hidden size and number of layers
# language_model = LanguageModel(dictionary,
#                                is_forward_lm,
#                                hidden_size = 128,
#                                nlayers = 1)

In [0]:
# train your language model
# trainer = LanguageModelTrainer(language_model, corpus)

# trainer.train('resources/taggers/language_model',
#               sequence_length = 10,
#               mini_batch_size = 10,
#               max_epochs = 10)

## C. Using the LM as Embeddings

In [0]:
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

# example sentence to embed
sentence = Sentence('I love Berlin')

# init embeddings from your trained LM; the directory must be the one that was
# passed to trainer.train(...) in section B ('resources/taggers/language_model')
# char_lm_embeddings = FlairEmbeddings('resources/taggers/language_model/best-lm.pt')

# embed sentence
# char_lm_embeddings.embed(sentence)

## D. Non-Latin Alphabets

In [0]:
# imports needed by this cell
import collections
from flair.data import Dictionary

# start from an empty character dictionary
char_dictionary: Dictionary = Dictionary()

# per-character frequency counter and number of corpus lines read so far
counter = collections.Counter()
processed = 0

In [0]:
import glob

# NOTE: `counter`, `processed` and `char_dictionary` are created in the previous
# cell and updated in place here; re-run that cell first if you want to start
# counting from scratch, otherwise a second run of this cell adds to the counts.
files = glob.glob('/path/to/your/corpus/files/*.*')

print(files)
for path in files:
  print(path)

  with open(path, 'r', encoding='utf-8') as f:
    tokens = 0  # number of characters read from the current file
    for line in f:

      processed += 1
      tokens += len(line)

      # Add chars to the counter (updating a Counter with a str counts each character)
      counter.update(line)

      # comment this line in to speed things up (if corpus is too large)
      # if tokens > 50000000: break

  # break

# total number of characters seen over all files
total_count = sum(counter.values())

print(total_count)
print(processed)

# walk the characters from most to least frequent, tracking the cumulative count;
# the accumulator is deliberately not called `sum` so the builtin stays usable
cumulative_count = 0
for idx, (letter, count) in enumerate(counter.most_common(), start=1):
  cumulative_count += count
  percentile = cumulative_count / total_count

  # comment this line in to use only top X percentile of chars
  # otherwise filter later
  # NOTE(review): `percentile` is a cumulative share that only grows, so a `<`
  # test can only fire on the first rows; presumably the intent is to stop once
  # the cumulative share exceeds a cutoff. Confirm before enabling.
  # if percentile < 0.00001: break

  char_dictionary.add_item(letter)
  print('%d\t%s\t%7d\t%7d\t%f' % (idx, letter, count, cumulative_count, percentile))

print(char_dictionary.item2idx)


In [0]:
import pickle

# write both lookup tables of the character dictionary to disk with pickle
with open('/path/to/your_char_mappings', 'wb') as mappings_file:
  mappings = dict(
      idx2item=char_dictionary.idx2item,
      item2idx=char_dictionary.item2idx,
  )
  pickle.dump(mappings, mappings_file)

In [0]:
import pickle

# reload the character dictionary from the same path the mappings were written to above
dictionary: Dictionary = Dictionary.load_from_file('/path/to/your_char_mappings')

## E. Fine-Tuning an Existing LM

In [0]:
from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer
from flair.trainers.language_model_trainer import TextCorpus

In [12]:
# take the language model out of pre-trained FlairEmbeddings ('news-forward' here)
language_model = FlairEmbeddings('news-forward').lm

# reuse the character dictionary of that existing model
dictionary: Dictionary = language_model.dictionary

# fine-tune in the same direction (forward or backward) as the existing model
is_forward_lm = language_model.is_forward_lm

2019-12-20 18:27:52,114 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmpxi1j9hxu


100%|██████████| 73034624/73034624 [00:05<00:00, 13068216.18B/s]

2019-12-20 18:27:58,365 copying /tmp/tmpxi1j9hxu to cache at /root/.flair/embeddings/news-forward-0.4.1.pt





2019-12-20 18:27:58,436 removing temp file /tmp/tmpxi1j9hxu


In [0]:
# build the corpus at the character level, read in the direction of the LM
corpus = TextCorpus(
    'path/to/your/corpus',
    dictionary,
    is_forward_lm,
    character_level=True,
)

In [0]:
# fine-tune the existing language model on your corpus with the LM trainer
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train(
    'resources/taggers/language_model',
    sequence_length=100,
    mini_batch_size=100,
    learning_rate=20,
    patience=10,
    checkpoint=True,
)