# Experimentos utilizando a rede BiLSTM-CRF com o framework FlairNLP

Os experimentos a seguir avaliam a influência do uso de embeddings no Reconhecimento de Entidades Nomeadas para o português, utilizando os corpora Multi_WikiNER, LeNER_br e PL-Corpus e os embeddings Pt-Wiki-Fasttext, Flair Embeddings e BERTimbau.

Baseado nos tutoriais do flairNLP
https://github.com/flairNLP/flair

In [None]:
## Instalando o Flair e o Transformers
%%capture
!pip install flair transformers

# Corpus Multi_WikiNER

## Vetor Estático Pt-Wiki-Fasttext


### Imports

In [None]:
## Imports

## Corpus
from flair.datasets import NER_MULTI_WIKINER

## Embeddings
from flair.embeddings import WordEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Corpus
# 1. Get the corpus. Without `languages`, NER_MULTI_WIKINER falls back to
#    Flair's default language (English per the Flair API), so the Portuguese
#    split is requested explicitly to match the Portuguese embeddings below.
corpus = NER_MULTI_WIKINER(languages='pt')
print(corpus)

## Task
# 2. Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# 3. Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## Embeddings
# 4. Initialize the static word embeddings identified by 'pt'
#    (the Pt-Wiki-Fasttext vectors named in this section's heading).
embeddings = WordEmbeddings('pt')

### Treino

In [None]:
## Model
# 5. Build the sequence tagger on top of the embeddings and the label
#    dictionary; `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Training
# 6. Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# 7. Train for up to 100 epochs.
# NOTE(review): several experiments in this notebook pass this same folder to
# `trainer.train`, so a later run would presumably overwrite this one's
# results - confirm, or use a per-experiment folder.
trainer.train(
    'resources/taggers/sota-ner-flair',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=100,
)

## Vetor de Contexto Flair Embeddings


### Imports

In [None]:
## Imports

## Corpus
from flair.datasets import NER_MULTI_WIKINER

## Importando os Embeddings, Flair-pt
from flair.embeddings import FlairEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Corpus
# 1. Get the corpus. Without `languages`, NER_MULTI_WIKINER falls back to
#    Flair's default language (English per the Flair API), so the Portuguese
#    split is requested explicitly to match the 'pt-*' Flair embeddings below.
corpus = NER_MULTI_WIKINER(languages='pt')
print(corpus)

## Task
# 2. Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# 3. Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## 4. Stacked Embeddings
from flair.embeddings import StackedEmbeddings

# init the forward and backward Flair embeddings ('pt-forward' / 'pt-backward')
flair_embedding_forward = FlairEmbeddings('pt-forward')
flair_embedding_backward = FlairEmbeddings('pt-backward')

# create a StackedEmbeddings object that combines the forward and backward
# Flair embeddings (no static word vectors are part of this stack)
embeddings = StackedEmbeddings([
                                        flair_embedding_forward,
                                        flair_embedding_backward,
                                       ])

### Treino

In [None]:
## Model
# 5. Build the sequence tagger on top of the stacked Flair embeddings and the
#    label dictionary; `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Mount Google Drive. Training is long and may have to be split into several
## sessions, so the checkpoint is kept on Drive to be loaded again later.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): another experiment in this notebook sets `path` to this same
# Drive folder, so running both would presumably overwrite the earlier
# checkpoint - confirm, or use a per-experiment folder.
path = '/content/drive/MyDrive/Flair_NLP/sota-ner-flair'

In [None]:
## Training
# 6. Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# 7. Train the first 10 epochs into `path`; `checkpoint=True` is passed so
#    that the run can be continued from the checkpoint file later.
trainer.train(
    path,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=10,
    checkpoint=True,
)

In [None]:
## Resume training
trainer = ModelTrainer(tagger, corpus)

# 8. Continue training at a later point: load the checkpoint written under `path`, then resume
trained_model = SequenceTagger.load(path + '/checkpoint.pt')

# resume from the loaded checkpoint (not the best model), this time until the
# new max_epochs, writing to a separate '-resume' folder
trainer.resume(trained_model,
               base_path=path + '-resume',
               max_epochs=65,
               checkpoint=True,
               )

## Vetores Estático e de Contexto concatenados (Pt-Wiki-Fasttext e Flair Embeddings)

### Imports

In [None]:
## Imports

## Corpus
from flair.datasets import NER_MULTI_WIKINER

## Embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Corpus
# 1. Get the corpus. Without `languages`, NER_MULTI_WIKINER falls back to
#    Flair's default language (English per the Flair API), so the Portuguese
#    split is requested explicitly to match the Portuguese embeddings below.
corpus = NER_MULTI_WIKINER(languages='pt')
print(corpus)

## Task
# 2. Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# 3. Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## 4. Stacked Embeddings
# Embedding stack: the static 'pt' word vectors plus the forward and backward
# Flair embeddings.
embedding_types = [
    WordEmbeddings('pt'),
    FlairEmbeddings('pt-forward'),
    FlairEmbeddings('pt-backward')
]

# Combine the three embeddings into a single StackedEmbeddings object.
embeddings = StackedEmbeddings(embeddings=embedding_types)

### Treino

In [None]:
## Model
# 5. Build the sequence tagger on top of the stacked embeddings and the label
#    dictionary; `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Mount Google Drive. Training is long and may have to be split into several
## sessions, so the checkpoint is kept on Drive to be loaded again later.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): another experiment in this notebook sets `path` to this same
# Drive folder, so running both would presumably overwrite the earlier
# checkpoint - confirm, or use a per-experiment folder.
path = '/content/drive/MyDrive/Flair_NLP/sota-ner-flair'

In [None]:
## Training
# 6. Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# 7. Train the first 10 epochs into `path`; `checkpoint=True` is passed so
#    that the run can be continued from the checkpoint file later.
trainer.train(
    path,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=10,
    checkpoint=True,
)

In [None]:
## Resume training
trainer = ModelTrainer(tagger, corpus)

# 8. Continue training at a later point: load the checkpoint written under `path`, then resume
trained_model = SequenceTagger.load(path + '/checkpoint.pt')

# resume from the loaded checkpoint (not the best model), this time until the
# new max_epochs, writing to a separate '-resume' folder
trainer.resume(trained_model,
               base_path=path + '-resume',
               max_epochs=50,
               checkpoint=True,
               )

## Vetor de Contexto BERTimbau


### Imports

In [None]:
## Imports

## Corpus
from flair.datasets import NER_MULTI_WIKINER

## Embeddings: BERTimbau (loaded through TransformerWordEmbeddings)
from flair.embeddings import TransformerWordEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Corpus
# 1. Get the corpus. Without `languages`, NER_MULTI_WIKINER falls back to
#    Flair's default language (English per the Flair API), so the Portuguese
#    split is requested explicitly to match the Portuguese BERT model below.
#    Chain `.downsample(0.8)` onto the call to work on a subset instead.
corpus = NER_MULTI_WIKINER(languages='pt')
print(corpus)

## Task
# 2. Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# 3. Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## BERT only: embeddings from the 'neuralmind/bert-base-portuguese-cased' model
# NOTE(review): whether the transformer weights are frozen or fine-tuned here
# depends on the TransformerWordEmbeddings defaults of the installed Flair
# version - confirm this matches the intended setup.
embeddings = TransformerWordEmbeddings('neuralmind/bert-base-portuguese-cased')

### Treino

In [None]:
## Model
# 5. Build the sequence tagger on top of the BERT embeddings and the label
#    dictionary; `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Training
# 6. Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# 7. Train for up to 30 epochs.
# NOTE(review): several experiments in this notebook pass this same folder to
# `trainer.train`, so a later run would presumably overwrite this one's
# results - confirm, or use a per-experiment folder.
trainer.train(
    'resources/taggers/sota-ner-flair',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=30,
)

# Corpus LeNER_br

## Vetor Estático Pt-Wiki-Fasttext


### Imports

In [None]:
## Imports
## datasets
from flair.data import Corpus
from flair.datasets import ColumnCorpus

## Embeddings
from flair.embeddings import WordEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Mount Google Drive (the corpus files used below are read from it)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
## Load the corpus from column-formatted files
# Column map: column 0 is registered as 'text', column 1 as 'ner'.
columns = {0: 'text', 1: 'ner'}

# Folder holding the train, test and dev files.
data_folder = '/content/drive/MyDrive/Flair_NLP/Corpus/Lener_br/Orig'

# Build the corpus from the folder, the column map and the three split files.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='dev.txt',
)

## Task
# Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## Embeddings
# Initialize the static word embeddings identified by 'pt'
# (the Pt-Wiki-Fasttext vectors named in this section's heading).
embeddings = WordEmbeddings('pt')

### Treino

In [None]:
## Model
# Build the sequence tagger on top of the embeddings and the label dictionary;
# `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Training
# Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# Train for up to 100 epochs.
# NOTE(review): several experiments in this notebook pass this same folder to
# `trainer.train`, so a later run would presumably overwrite this one's
# results - confirm, or use a per-experiment folder.
trainer.train(
    'resources/taggers/sota-ner-flair',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=100,
)

## Vetor de Contexto Flair Embeddings


### Imports

In [None]:
## Imports
## datasets
from flair.data import Corpus
from flair.datasets import ColumnCorpus

## Embeddings
from flair.embeddings import FlairEmbeddings, StackedEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Mount Google Drive (the corpus files used below are read from it)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
## Load the corpus from column-formatted files
# Column map: column 0 is registered as 'text', column 1 as 'ner'.
columns = {0: 'text', 1: 'ner'}

# Folder holding the train, test and dev files.
data_folder = '/content/drive/MyDrive/Flair_NLP/Corpus/Lener_br/Orig'

# Build the corpus from the folder, the column map and the three split files.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='dev.txt',
)

## Task
# Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## Stacking the embeddings
# init the forward and backward Flair embeddings ('pt-forward' / 'pt-backward')
flair_embedding_forward = FlairEmbeddings('pt-forward')
flair_embedding_backward = FlairEmbeddings('pt-backward')

# create a StackedEmbeddings object that combines the forward and backward
# Flair embeddings (no static word vectors are part of this stack)
embeddings = StackedEmbeddings([
                                        flair_embedding_forward,
                                        flair_embedding_backward,
                                       ])

### Treino

In [None]:
## Model
# Build the sequence tagger on top of the stacked Flair embeddings and the
# label dictionary; `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Training
# Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# Train for up to 100 epochs.
# NOTE(review): several experiments in this notebook pass this same folder to
# `trainer.train`, so a later run would presumably overwrite this one's
# results - confirm, or use a per-experiment folder.
trainer.train(
    'resources/taggers/sota-ner-flair',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=100,
)

## Vetores Estático e de Contexto concatenados (Pt-Wiki-Fasttext e Flair Embeddings)


### Imports

In [None]:
## Imports
## datasets
from flair.data import Corpus
from flair.datasets import ColumnCorpus

## Embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Mount Google Drive (the corpus files used below are read from it)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## Load the corpus from column-formatted files
# Column map: column 0 is registered as 'text', column 1 as 'ner'.
columns = {0: 'text', 1: 'ner'}

# Folder holding the train, test and dev files.
data_folder = '/content/drive/MyDrive/Flair_NLP/Corpus/Lener_br/Orig'

# Build the corpus from the folder, the column map and the three split files.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='dev.txt',
)

## Task
# Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## Stacked Embeddings
# Embedding stack: the static 'pt' word vectors plus the forward and backward
# Flair embeddings.
embedding_types = [
    WordEmbeddings('pt'),
    FlairEmbeddings('pt-forward'),
    FlairEmbeddings('pt-backward')
]

# Combine the three embeddings into a single StackedEmbeddings object.
embeddings = StackedEmbeddings(embeddings=embedding_types)

### Treino

In [None]:
## Model
# Build the sequence tagger on top of the stacked embeddings and the label
# dictionary; `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Training
# Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# Train for up to 100 epochs.
# NOTE(review): several experiments in this notebook pass this same folder to
# `trainer.train`, so a later run would presumably overwrite this one's
# results - confirm, or use a per-experiment folder.
trainer.train(
    'resources/taggers/sota-ner-flair',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=100,
)

# Corpus PL-Corpus

## Vetor Estático Pt-Wiki-Fasttext


### Imports

In [None]:
## Imports
## datasets
from flair.data import Corpus
from flair.datasets import ColumnCorpus

## Embeddings
from flair.embeddings import WordEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Mount Google Drive (the corpus files used below are read from it)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
## Load the corpus from column-formatted files
# Column map: column 0 is registered as 'text', column 1 as 'ner'.
columns = {0: 'text', 1: 'ner'}

# Folder holding the train, test and validation files.
data_folder = '/content/drive/MyDrive/Flair_NLP/Corpus/pl_corpus_categoria'

# Build the corpus from the folder, the column map and the three split files
# (the dev split is stored as 'valid.txt' for this corpus).
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='valid.txt',
)

## Task
# Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## Embeddings
# Initialize the static word embeddings identified by 'pt'
# (the Pt-Wiki-Fasttext vectors named in this section's heading).
embeddings = WordEmbeddings('pt')

### Treino

In [None]:
## Model
# Build the sequence tagger on top of the embeddings and the label dictionary;
# `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Training
# Base path handed to `trainer.train`.
# NOTE(review): several experiments in this notebook use this same folder, so
# a later run would presumably overwrite this one's results - confirm, or use
# a per-experiment folder.
path = 'resources/taggers/sota-ner-flair'

# Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# Train for up to 100 epochs.
trainer.train(
    path,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=100,
)

## Vetor de Contexto Flair Embeddings


### Imports

In [None]:
## Imports
## datasets
from flair.data import Corpus
from flair.datasets import ColumnCorpus

## Embeddings
from flair.embeddings import FlairEmbeddings, StackedEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Mount Google Drive (the corpus files used below are read from it)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## Load the corpus from column-formatted files
# Column map: column 0 is registered as 'text', column 1 as 'ner'.
columns = {0: 'text', 1: 'ner'}

# Folder holding the train, test and validation files.
data_folder = '/content/drive/MyDrive/Flair_NLP/Corpus/pl_corpus_categoria'

# Build the corpus from the folder, the column map and the three split files
# (the dev split is stored as 'valid.txt' for this corpus).
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='valid.txt',
)

## Task
# Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## Stacked Embeddings
# Embedding stack: the forward and backward Flair embeddings only.
embedding_types = [
    FlairEmbeddings('pt-forward'),
    FlairEmbeddings('pt-backward')
]

# Combine both directions into a single StackedEmbeddings object.
embeddings = StackedEmbeddings(embeddings=embedding_types)

### Treino

In [None]:
## Model
# Build the sequence tagger on top of the stacked Flair embeddings and the
# label dictionary; `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Training
# Base path handed to `trainer.train`.
# NOTE(review): several experiments in this notebook use this same folder, so
# a later run would presumably overwrite this one's results - confirm, or use
# a per-experiment folder.
path = 'resources/taggers/sota-ner-flair'

# Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# Train for up to 100 epochs.
trainer.train(
    path,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=100,
)

## Vetores Estático e de Contexto concatenados (Pt-Wiki-Fasttext e Flair Embeddings)

### Imports

In [None]:
## Imports
## datasets
from flair.data import Corpus
from flair.datasets import ColumnCorpus

## Embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

## Modelo/Treino
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

### Corpus

In [None]:
## Mount Google Drive (the corpus files used below are read from it)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
## Load the corpus from column-formatted files
# Column map: column 0 is registered as 'text', column 1 as 'ner'.
columns = {0: 'text', 1: 'ner'}

# Folder holding the train, test and validation files.
data_folder = '/content/drive/MyDrive/Flair_NLP/Corpus/pl_corpus_categoria'

# Build the corpus from the folder, the column map and the three split files
# (the dev split is stored as 'valid.txt' for this corpus).
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='valid.txt',
)

## Task
# Label type the tagger will predict.
label_type = 'ner'

In [None]:
## Label dictionary
# Build the label dictionary for `label_type` from the corpus and print it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

### Embeddings

In [None]:
## Stacked Embeddings
# Embedding stack: the static 'pt' word vectors plus the forward and backward
# Flair embeddings.
embedding_types = [
    WordEmbeddings('pt'),
    FlairEmbeddings('pt-forward'),
    FlairEmbeddings('pt-backward')
]

# Combine the three embeddings into a single StackedEmbeddings object.
embeddings = StackedEmbeddings(embeddings=embedding_types)

### Treino

In [None]:
## Model
# Build the sequence tagger on top of the stacked embeddings and the label
# dictionary; `use_crf=True` turns on the CRF layer.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    use_crf=True,
)

In [None]:
## Training
# Base path handed to `trainer.train`.
# NOTE(review): several experiments in this notebook use this same folder, so
# a later run would presumably overwrite this one's results - confirm, or use
# a per-experiment folder.
path = 'resources/taggers/sota-ner-flair'

# Wrap the tagger and the corpus in a trainer.
trainer = ModelTrainer(tagger, corpus)

# Train for up to 100 epochs.
trainer.train(
    path,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=100,
)