In [None]:
import flair, random, torch

from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings, TransformerWordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from torch.optim.lr_scheduler import OneCycleLR


# Use the GPU when CUDA is usable and fall back to the CPU otherwise, so the
# device is never set to "cuda" on a machine that cannot honour it.
cuda_available = torch.cuda.is_available()
flair.device = torch.device("cuda" if cuda_available else "cpu")
# Last expression of the cell: shows whether CUDA was detected.
cuda_available

## Prepare dataset

In [None]:
# Column layout of the data files: column 0 holds the token text, column 1 the NER tag.
columns = {0: "text", 1: "ner"}

# Directory in which the train, dev and test files reside.
data_folder = "/resources/ner/flair/resos-20221116"

# 1. Build the corpus from the column format, the data folder and the three split files.
split_files = {
    "train_file": "train.txt",
    "test_file": "test.txt",
    "dev_file": "dev.txt",
}
corpus = ColumnCorpus(data_folder, columns, **split_files)

In [None]:
# Peek at the first ten sentences of the training split.
for position in range(10):
    train_sentence = corpus.train[position]
    print(train_sentence)

In [None]:
# Peek at the first ten sentences of the dev split.
for position in range(10):
    dev_sentence = corpus.dev[position]
    print(dev_sentence)

In [None]:
# Peek at the first ten sentences of the test split.
for position in range(10):
    test_sentence = corpus.test[position]
    print(test_sentence)

In [None]:
# 2. Name of the label type to predict; reused below when building the label
#    dictionary, the taggers, the hyperparameter selector and the evaluation call.
label_type = "ner"

In [None]:
# 3. make the vocab dictionary from the corpus and print it for inspection
vocab_dictionary = corpus.make_vocab_dictionary()
print(vocab_dictionary)

In [None]:
# 4. make the label dictionary for `label_type` from the corpus
label_dictionary = corpus.make_label_dictionary(label_type=label_type)

In [None]:
# Print the statistics that the corpus reports about itself.
print(corpus.obtain_statistics())

## Beto

In [None]:
# 5. BETO transformer word embeddings, configured as fine-tuneable and WITH
#    document context.
beto_options = dict(
    model="dccuchile/bert-base-spanish-wwm-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=True,
    allow_long_sentences=True,
)
embeddings = TransformerWordEmbeddings(**beto_options)

In [None]:
# 6. Sequence tagger over the BETO embeddings, with the RNN, CRF and embedding
#    re-projection options all switched on and a hidden size of 256.
tagger_options = {
    "embeddings": embeddings,
    "tag_dictionary": label_dictionary,
    "tag_type": label_type,
    "hidden_size": 256,
    "use_rnn": True,
    "use_crf": True,
    "reproject_embeddings": True,
}
tagger = SequenceTagger(**tagger_options)

In [None]:
# 7. Trainer that pairs the tagger with the corpus.
trainer = ModelTrainer(tagger, corpus)

In [None]:
# 8. Fine-tune the tagger; the first argument is the output folder of this run.
finetune_options = dict(
    learning_rate=5.0e-5,
    mini_batch_size=2,
    mini_batch_chunk_size=1,
    max_epochs=10,
    scheduler=OneCycleLR,
    embeddings_storage_mode="none",
    weight_decay=0.,
    use_final_model_for_eval=False,
)
trainer.fine_tune("/resources/ner/flair/direct-finetune/", **finetune_options)

In [None]:
from flair.visual.training_curves import Plotter

# Plot the training curves from the loss.tsv file located in the
# direct-finetune output folder.
plotter = Plotter()
plotter.plot_training_curves("/resources/ner/flair/direct-finetune/loss.tsv")

In [None]:
# 9. Continue training at a later point: load the previously saved model, then resume.
path = "/resources/ner/flair/direct-finetune/"

# load model
# NOTE(review): assumes a `model.pt` file exists in the output folder — confirm
# the file name produced by the training run.
trained_model = SequenceTagger.load(path + "model.pt")

# define trainer (rebinds `trainer` to one that wraps the reloaded model)
trainer = ModelTrainer(
    trained_model,
    corpus,
)

# resume training the loaded model with max_epochs set to 30 and a smaller
# learning rate; results go to a separate "resume" folder under `path`
trainer.resume(
    trained_model,
    base_path=path + "resume",
    max_epochs=30,
    learning_rate=1.0e-6,
)

## Inference

In [None]:
# Folder of the direct fine-tuning run; later cells reuse `path` as well.
path = "/resources/ner/flair/direct-finetune/"

# Reload the saved tagger from that folder for inference.
model_file = path + "model.pt"
tagger = SequenceTagger.load(model_file)

In [None]:
# Tag every test sentence and print the sentence followed by its predicted spans.
#
# Predictions are stored under a separate "predicted" label type. Calling
# `predict` without `label_name` writes to the tagger's own label type ("ner"),
# which replaces the gold annotations held by the sentences in `corpus.test`;
# any later evaluation on `corpus.test` would then score predictions against
# predictions instead of against the gold labels.
predicted_label_type = "predicted"

for sentence in corpus.test:
    tagger.predict(sentence, label_name=predicted_label_type)
    print(sentence)
    print()
    # print the predicted entities
    for entity in sentence.get_spans(predicted_label_type):
        print(entity)
    print("="*5)

## Evaluation

In [None]:
# Score the tagger on the test split for `label_type`; the third argument is
# the file the evaluation output is written to.
evaluation_file = path + "evaluation.txt"
evaluation = tagger.evaluate(corpus.test, label_type, evaluation_file)

In [None]:
# Print the detailed results stored on the evaluation result.
print(evaluation.detailed_results)

In [None]:
# Display the classification report stored on the evaluation result.
evaluation.classification_report

In [None]:
# Display the main score stored on the evaluation result.
evaluation.main_score

In [None]:
# Display the loss stored on the evaluation result.
evaluation.loss

In [None]:
import csv
import re
import pandas as pd
pd.set_option("display.max_rows", 100)


# Load the evaluation file as one space-separated "token label pred" row per line.
#
# * quoting=csv.QUOTE_NONE: a token that is itself a quote character is read
#   literally instead of opening a quoted field that swallows the following lines.
# * keep_default_na=False: tokens such as "NA" or "null" stay strings rather than
#   becoming NaN, which would break the string operations applied in later cells.
df = pd.read_csv(
    path + "evaluation.txt",
    sep=" ",
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
df.columns = ["token", "label", "pred"]
df.head()

In [None]:
# Relative frequency of each gold label in the evaluation file.
df["label"].value_counts(normalize=True)

In [None]:
# Row-wise exact match between the gold label and the predicted label,
# followed by the share of matching / non-matching rows.
df["match"] = df["label"].eq(df["pred"])
df["match"].value_counts(normalize=True)

In [None]:
# Match rate restricted to rows whose gold label is not "O".
is_entity_row = df["label"].ne("O")
df.loc[is_entity_row, "match"].value_counts(normalize=True)

In [None]:
# Match rate restricted to rows whose gold label is "O".
is_outside_row = df["label"].eq("O")
df.loc[is_outside_row, "match"].value_counts(normalize=True)

In [None]:
# Rows whose gold label is "O" but whose prediction differs from it.
gold_is_outside = df["label"].eq("O")
df.loc[gold_is_outside & ~df["match"]]

In [None]:
# Relative frequency of gold labels after removing the "B-" / "I-" markers.
gold_entity_types = df["label"].map(lambda tag: re.sub(r"B-|I-", "", tag))
gold_entity_types.value_counts(normalize=True)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Classification report over the raw label / pred columns.
token_level_report = classification_report(df["label"], df["pred"])
print(token_level_report)

In [None]:
# Classification report after removing the "B-" / "I-" markers from both columns.
gold_entity_types = df["label"].map(lambda tag: re.sub(r"B-|I-", "", tag))
pred_entity_types = df["pred"].map(lambda tag: re.sub(r"B-|I-", "", tag))
print(classification_report(gold_entity_types, pred_entity_types))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Remove the "B-" / "I-" markers from the gold and predicted labels.
gold_entity_types = df["label"].map(lambda x: re.sub(r"B-|I-", "", x))
pred_entity_types = df["pred"].map(lambda x: re.sub(r"B-|I-", "", x))

# Fix the class order explicitly so the matrix rows/columns and the tick labels
# are guaranteed to refer to the same classes.
class_names = sorted(set(gold_entity_types) | set(pred_entity_types))

cm = confusion_matrix(
    gold_entity_types,
    pred_entity_types,
    labels=class_names,
    normalize="true",
)

# Label ticks and axes so the figure can be read on its own; without tick
# labels the heatmap only shows integer positions.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    cm,
    annot=True,
    fmt=".2f",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(
    xlabel="Predicted label",
    ylabel="Gold label",
    title='Confusion matrix (normalize="true")',
);

In [None]:
# 5. BETO transformer word embeddings, configured as fine-tuneable and WITH
#    document context.
beto_options = dict(
    model="dccuchile/bert-base-spanish-wwm-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=True,
    allow_long_sentences=True,
)
beto_embeddings = TransformerWordEmbeddings(**beto_options)

# Forward and backward Flair embeddings ('es-forward' / 'es-backward').
flair_embeddings_forward = FlairEmbeddings('es-forward')
flair_embeddings_backward = FlairEmbeddings('es-backward')

# Combine BETO with both Flair embeddings into one stacked embedding.
embedding_stack = [beto_embeddings, flair_embeddings_forward, flair_embeddings_backward]
stacked_embeddings = StackedEmbeddings(embedding_stack)

In [None]:
# 6. Sequence tagger over the stacked embeddings, with the RNN, CRF and
#    embedding re-projection options all switched on and a hidden size of 256.
stacked_tagger_options = {
    "embeddings": stacked_embeddings,
    "tag_dictionary": label_dictionary,
    "tag_type": label_type,
    "hidden_size": 256,
    "use_rnn": True,
    "use_crf": True,
    "reproject_embeddings": True,
}
tagger = SequenceTagger(**stacked_tagger_options)

In [None]:
# 7. Trainer that pairs the current tagger with the corpus.
trainer = ModelTrainer(tagger, corpus)

In [None]:
# 8. Fine-tune the tagger; the first argument is the output folder of this run.
stacked_finetune_options = dict(
    learning_rate=5.0e-5,
    mini_batch_size=2,
    mini_batch_chunk_size=1,
    max_epochs=10,
    scheduler=OneCycleLR,
    embeddings_storage_mode="none",
    weight_decay=0.,
    use_final_model_for_eval=False,
)
trainer.fine_tune("/resources/ner/flair/custom/model-stacked", **stacked_finetune_options)

In [None]:
from flair.visual.training_curves import Plotter

# Plot the training curves from the loss.tsv file located in the
# stacked-model output folder.
plotter = Plotter()
plotter.plot_training_curves("/resources/ner/flair/custom/model-stacked/loss.tsv")
# Weight plotting is left disabled (note that its path points at a different
# folder, custom/model):
# plotter.plot_weights("/resources/ner/flair/custom/model/weights.txt")

In [None]:
# Show the current value of `path`.
# NOTE(review): `path` was last assigned to the direct-finetune folder in the
# Inference section, not to the stacked-model output folder used just above.
path

In [None]:
# load model
# NOTE(review): `path` still refers to the direct-finetune folder here, so this
# loads that run's `model.pt` rather than a file from the stacked-model output
# folder — confirm which model is intended.
best_model =  SequenceTagger.load(path + "model.pt")

In [None]:
# Tag every test sentence with `best_model` and print it.
#
# Predictions are stored under a separate "predicted" label type. Calling
# `predict` without `label_name` writes to the tagger's own label type ("ner"),
# which replaces the gold annotations held by the sentences in `corpus.test`
# and would invalidate any later evaluation on that split.
for sentence in corpus.test:
    best_model.predict(sentence, label_name="predicted")
    print(sentence)
    print()

In [None]:
# create example sentence
sentence = Sentence('El juez de la Cámara Federal porteña envió una carta a sus colegas en las que les reprochó el silencio ante las críticas de Alberto Fernández por el fallo que liberó a Revolución Federal.')

# Predict tags with `best_model`, the tagger loaded in the preceding cells.
# `model` is not assigned until a later cell, so referencing it here fails with
# NameError on a fresh top-to-bottom run.
best_model.predict(sentence)

print(sentence.to_tagged_string())

In [None]:
# List every 'ner' span found on the current sentence, one per line.
ner_spans = sentence.get_spans('ner')
for span in ner_spans:
    print(span)

In [None]:
from hyperopt import hp
from flair.hyperparameter.param_selection import (
    OptimizationValue,
    Parameter,
    SearchSpace,
    SequenceTaggerParamSelector,
)

In [None]:
# Search space for the hyperparameter selection.
search_space = SearchSpace()

# Single embedding candidate: BETO, fine-tuneable, with document context.
beto_candidate = TransformerWordEmbeddings(
    model='dccuchile/bert-base-spanish-wwm-cased',
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=True,
)
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[beto_candidate])

# Remaining parameters, registered in the same order with their candidate values.
remaining_choices = [
    (Parameter.USE_CRF, [True, False]),
    (Parameter.USE_RNN, [True, False]),
    (Parameter.MINI_BATCH_SIZE, [2]),
    (Parameter.LEARNING_RATE, [5.0e-5]),
    (Parameter.WEIGHT_DECAY, [0]),
]
for parameter, candidate_values in remaining_choices:
    search_space.add(parameter, hp.choice, options=candidate_values)

In [None]:
# Create the parameter selector for the sequence tagger. It receives the corpus,
# the label type and the hyperopt output folder, is limited to max_epochs=5 and
# optimises on OptimizationValue.DEV_SCORE. `training_runs` is left at its default.
param_selector = SequenceTaggerParamSelector(
    corpus,
    label_type,
    '/resources/ner/flair/hyperopt',
    # training_runs=3,
    max_epochs=5,
    optimization_value=OptimizationValue.DEV_SCORE,
)

In [None]:
# Recursively delete the hyperopt output folder.
# NOTE(review): this is the same folder `param_selector` was configured with in
# the preceding cell — confirm the clean-up is not meant to run before the
# selector is created.
!rm -r /resources/ner/flair/hyperopt

In [None]:
# Start the optimization over `search_space`, limited to max_evals=5.
param_selector.optimize(search_space, max_evals=5)

In [None]:
# Load a trained tagger from a fixed path and bind it to `model`, which the
# remaining cells use for prediction.
# NOTE(review): the folder name ("beto-0.35-True") is not produced by any cell
# visible in this notebook — confirm where this model comes from.
model = SequenceTagger.load('/resources/ner/flair/beto-0.35-True/model.pt')

In [None]:
# Example sentence to tag with `model`.
example_text = 'El juez de la Cámara Federal porteña envió una carta a sus colegas en las que les reprochó el silencio ante las críticas de Alberto Fernández por el fallo que liberó a Revolución Federal.'
sentence = Sentence(example_text)

# Run the tagger on the sentence, then print the sentence as a tagged string.
model.predict(sentence)
tagged_text = sentence.to_tagged_string()
print(tagged_text)

In [None]:
# List every 'ner' span found on the current sentence, one per line.
ner_spans = sentence.get_spans('ner')
for span in ner_spans:
    print(span)

In [None]:
# Build a Sentence from a multi-line excerpt, passed as a single string
# (line breaks included) for the prediction cells that follow.
# NOTE(review): the text appears to contain personal data (names and an ID
# number) — confirm it may be kept in a shared notebook or anonymise it.
sentence = Sentence("""
Acusado: Elvis Junior Tasayco Bardales, DNI n° 44.986.514.
Defensa Oficial: Marina Recabarra, -Defensoría Oficial Nro. 20-.
Fiscal: Adrián Dávila -Fiscalía Penal, Contravencional y de Faltas Nro. 36-.
DESARROLLO
Juez: Da inicio a la audiencia y explica que su objetivo es escuchar al acusado en virtud del acuerdo de juicio abreviado al que arribó junto a su Defensora Oficial y al Fiscal. Asimismo, le explica las características del trámite, sus consecuencias e implicancias y las condiciones a las que deberá someterse en caso de que homologue, es decir apruebe, dicho acuerdo y dictar sentencia.
""")

In [None]:
# Run the tagger on the sentence, then print the sentence as a tagged string.
model.predict(sentence)

tagged_text = sentence.to_tagged_string()
print(tagged_text)

In [None]:
# List every 'ner' span found on the current sentence, one per line.
ner_spans = sentence.get_spans('ner')
for span in ner_spans:
    print(span)