In [None]:
import flair, random, torch

from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import (
    FlairEmbeddings,
    TransformerWordEmbeddings,
    StackedEmbeddings,
)
from flair.models import SequenceTagger
from torch.optim.lr_scheduler import OneCycleLR
from flair.tokenization import SpaceTokenizer
from flair.trainers import ModelTrainer
from flair.visual.training_curves import Plotter


# Pick the device Flair runs on: the GPU when PyTorch can see one, otherwise
# the CPU. Hard-coding "cuda" only fails later, at the first tensor move, on
# machines without a GPU.
flair.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Cell output: True when the GPU was selected.
torch.cuda.is_available()

## Prepare dataset

In [None]:
# define columns: column 0 of each data file is mapped to "text" and
# column 1 to "ner"
columns = {0: "text", 1: "ner"}

# this is the folder in which train, test and dev files reside
# NOTE(review): absolute, machine-specific path - consider making it configurable
data_folder = "/resources/data/restricted/anonymization/"

# 1. init a corpus using column format, data folder and the names of the train, dev and test files
corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file="train.txt",
    test_file="test.txt",
    dev_file="dev.txt",
)

In [None]:
# Preview the first (up to) 10 training sentences. Bounding the range by the
# split size avoids an IndexError when the split holds fewer than 10 sentences.
for i in range(min(10, len(corpus.train))):
    print(corpus.train[i])

In [None]:
# Preview the first (up to) 10 dev sentences. Bounding the range by the
# split size avoids an IndexError when the split holds fewer than 10 sentences.
for i in range(min(10, len(corpus.dev))):
    print(corpus.dev[i])

In [None]:
# Preview the first (up to) 10 test sentences. Bounding the range by the
# split size avoids an IndexError when the split holds fewer than 10 sentences.
for i in range(min(10, len(corpus.test))):
    print(corpus.test[i])

In [None]:
# 2. what label do we want to predict?
# "ner" matches the name given to column 1 in `columns`; it is reused below
# for the label dictionary, the tagger and the evaluation call.
label_type = "ner"

In [None]:
# 3. make the vocab dictionary from the corpus
# (only printed for inspection; no other cell in this notebook uses it)
vocab_dictionary = corpus.make_vocab_dictionary()
print(vocab_dictionary)

In [None]:
# 4. make the label dictionary from the corpus for the chosen label type
# (add_unk=True presumably adds an entry for unknown labels - confirm against the Flair docs)
label_dictionary = corpus.make_label_dictionary(label_type=label_type, add_unk=True)

In [None]:
# Print the corpus statistics for a quick look at the three splits
print(corpus.obtain_statistics())

In [None]:
import pandas as pd
from ast import literal_eval  # import kept from the original cell; no longer used here

# Ask Flair for the statistics as a plain dict (pretty_print=False) instead of
# parsing its pretty-printed JSON string with literal_eval, which is not a
# JSON parser and breaks on JSON-only tokens such as null/true/NaN.
# The result is keyed by "TRAIN" / "TEST" / "DEV", as the cells below expect.
stats = corpus.obtain_statistics(pretty_print=False)

In [None]:
# Bar chart of the per-label counts in the training split, most frequent first
pd.Series(stats["TRAIN"]["number_of_documents_per_class"]).sort_values(
    ascending=False
).plot.bar(title="Train set - number of documents per label")

In [None]:
# Number of distinct labels present in the training split
len(stats["TRAIN"]["number_of_documents_per_class"])

In [None]:
# Bar chart of the per-label counts in the dev split, most frequent first
pd.Series(stats["DEV"]["number_of_documents_per_class"]).sort_values(
    ascending=False
).plot.bar(title="Dev set - number of documents per label")

In [None]:
# Number of distinct labels present in the dev split
len(stats["DEV"]["number_of_documents_per_class"])

In [None]:
# Labels that occur in exactly one of the train / dev splits
set(stats["TRAIN"]["number_of_documents_per_class"]) ^ set(
    stats["DEV"]["number_of_documents_per_class"]
)

In [None]:
# Bar chart of the per-label counts in the test split, most frequent first
pd.Series(stats["TEST"]["number_of_documents_per_class"]).sort_values(
    ascending=False
).plot.bar(title="Test set - number of documents per label")

In [None]:
# Number of distinct labels present in the test split
len(stats["TEST"]["number_of_documents_per_class"])

In [None]:
# Labels that occur in exactly one of the train / test splits
set(stats["TRAIN"]["number_of_documents_per_class"]) ^ set(
    stats["TEST"]["number_of_documents_per_class"]
)

## BETO (Spanish BERT)

### No fine-tuning

In [None]:
# 5. initialize NON fine-tuneable transformer embeddings WITH document context
# (fine_tune=False and use_context=True are the two flags this title refers to).
# Only the last transformer layer is requested (layers="-1") and each word is
# represented by its first subtoken (subtoken_pooling="first").
embeddings = TransformerWordEmbeddings(
    model="dccuchile/bert-base-spanish-wwm-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=False,
    use_context=True,
    allow_long_sentences=True,
)

In [None]:
# 6. initialize sequence tagger
# The tagger sits on top of the frozen embeddings defined above, with the RNN
# and CRF options switched on (use_rnn=True, use_crf=True) and embedding
# reprojection enabled. It predicts `label_type` tags from `label_dictionary`.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dictionary,
    tag_type=label_type,
    use_crf=True,
    use_rnn=True,
    reproject_embeddings=True,
)

In [None]:
# 7. build the trainer that couples the tagger with the corpus it is trained on
trainer = ModelTrainer(model=tagger, corpus=corpus)

In [None]:
# 8. run training for 50 epochs
# `path` is the output directory: later cells read `loss.tsv` and
# `best-model.pt` from it.
# NOTE(review): absolute, machine-specific path - consider making it configurable
path = "/resources/ner/flair/anonymizer"

trainer.train(
    path,
    learning_rate=0.1,
    mini_batch_size=8,
    max_epochs=50,
    # NOTE(review): the OneCycleLR scheduler is disabled here; either remove
    # this line or document why it was switched off.
    # scheduler=OneCycleLR,
    embeddings_storage_mode="none",
    weight_decay=0.0,
    use_final_model_for_eval=False,
)

In [None]:
# Plot the loss curves logged during training.
# `plot_values` takes base metric names: Plotter itself upper-cases each one
# and looks up the TRAIN_/DEV_/TEST_ prefixed columns of loss.tsv. Passing
# "loss" therefore draws TRAIN_LOSS and DEV_LOSS on one chart, whereas passing
# "TRAIN_LOSS" would search for a non-existent "TRAIN_TRAIN_LOSS" column and
# leave the plot empty.
plotter = Plotter()
plotter.plot_training_curves(f"{path}/loss.tsv", ["loss"])

## Evaluation

In [None]:
# Model directory, repeated here so the evaluation section can be run without
# re-running the training cells (same value as in the training section).
path = "/resources/ner/flair/anonymizer"

In [None]:
# load the `best-model.pt` checkpoint from the model directory;
# this rebinds `tagger`, replacing the in-memory model from the training section
tagger = SequenceTagger.load(f"{path}/best-model.pt")

In [None]:
# rewrite `label_dictionary` attribute to handle unknown items
# This replaces the dictionary stored in the loaded model with the one built
# from the corpus earlier in this notebook (which was created with add_unk=True).
# NOTE(review): the model's outputs were trained against its saved dictionary;
# confirm that swapping it only affects evaluation bookkeeping and not the
# decoding of predictions, and that it is still needed on the installed Flair version.
tagger.label_dictionary = label_dictionary

In [None]:
# Score the tagger on the test split; the third argument is the file that the
# later analysis cells read back as `evaluation.txt`.
evaluation = tagger.evaluate(corpus.test, label_type, f"{path}/evaluation.txt")

In [None]:
# Headline numbers of the evaluation result: main score and loss
print(evaluation.main_score, evaluation.loss)

In [None]:
# Detailed results carried by the evaluation result object
print(evaluation.detailed_results)

In [None]:
import csv
import re
import pandas as pd

pd.set_option("display.max_rows", 100)

path = "/resources/ner/flair/anonymizer"

# Read the per-token evaluation file: one "token gold_tag predicted_tag" triple
# per line, separated by single spaces (blank lines between sentences are
# skipped by pandas by default).
# - sep=" " replaces the original "\s", which is an invalid escape sequence in
#   a normal string literal and forced pandas onto the slow regex parser.
# - quoting=csv.QUOTE_NONE keeps tokens that are quote characters as data
#   instead of letting them open a quoted field.
# - keep_default_na=False keeps tokens such as "NA" or "null" as text rather
#   than turning them into NaN.
df = pd.read_csv(
    f"{path}/evaluation.txt",
    sep=" ",
    header=None,
    names=["token", "label", "pred"],
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
df.head()

In [None]:
# Column dtypes and non-null counts of the evaluation frame
df.info()

In [None]:
# Relative frequency of each gold tag (fractions sum to 1)
df["label"].value_counts(normalize=True)

In [None]:
# Exact match: a token counts as correct only when the predicted tag equals
# the gold tag, prefix included. The output is the share of (in)correct tokens.
df["match"] = df["label"].eq(df["pred"])
df["match"].value_counts(normalize=True)

In [None]:
# Exact-match rate restricted to tokens whose gold tag is not "O"
df.loc[df["label"].ne("O"), "match"].value_counts(normalize=True)

In [None]:
# Exact-match rate restricted to tokens whose gold tag is "O"
df.loc[df["label"].eq("O"), "match"].value_counts(normalize=True)

In [None]:
# Tokens with gold tag "O" for which the prediction differs from the gold tag
df.loc[df["label"].eq("O") & df["match"].ne(1)]

In [None]:
def normalize_class(x):
    """Return tag `x` without its leading "B-" or "I-" prefix.

    The pattern is anchored to the start of the string, so only the prefix is
    removed. An unanchored "B-|I-" would also delete those two characters
    inside a class name (e.g. "B-WEB-SITE" would become "WESITE").
    Tags without such a prefix, like "O", are returned unchanged.
    """
    return re.sub(r"^(?:B|I)-", "", x)

# Add copies of the gold and predicted tag columns passed through `normalize_class`
for target_col, source_col in (("normalized_label", "label"), ("normalized_pred", "pred")):
    df[target_col] = df[source_col].map(normalize_class)

In [None]:
# Show the first rows again, now including the columns added since the first preview
df.head()

In [None]:
# Normalized exact match: compare the normalized gold and predicted tags.
# The output is the share of (in)correct tokens.
df["normalized_match"] = df["normalized_label"].eq(df["normalized_pred"])
df["normalized_match"].value_counts(normalize=True)

In [None]:
# Normalized match rate restricted to tokens whose normalized gold tag is not "O"
df.loc[df["normalized_label"].ne("O"), "normalized_match"].value_counts(normalize=True)

In [None]:
# Normalized match rate restricted to tokens whose normalized gold tag is "O"
df.loc[df["normalized_label"].eq("O"), "normalized_match"].value_counts(normalize=True)

In [None]:
# Relative frequency of each normalized predicted class (fractions sum to 1)
df["normalized_pred"].value_counts(normalize=True)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Token-level classification report on the raw tags from the evaluation file
print(classification_report(y_true=df["label"], y_pred=df["pred"]))

In [None]:
# Token-level classification report on the normalized tags
print(classification_report(y_true=df["normalized_label"], y_pred=df["normalized_pred"]))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Row-normalized confusion matrix of the normalized classes.
fig, ax = plt.subplots(figsize=(20, 20))

# Index the matrix by every class seen in either column. Taking the gold
# labels alone would make confusion_matrix silently drop tokens predicted as
# a class that never occurs in the gold data. Sorting gives a stable order.
labels = sorted(set(df["normalized_label"]) | set(df["normalized_pred"]))

# normalize="true": each row (true class) is scaled to sum to 1, so a cell is
# the fraction of that true class predicted as the column class.
cm = confusion_matrix(
    df["normalized_label"],
    df["normalized_pred"],
    labels=labels,
    normalize="true",
)

sns.heatmap(
    cm,
    vmin=0.0,
    vmax=1.0,
    cmap="Blues",
    annot=True,
    fmt=".2f",
    cbar=False,
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)

# Rows are true classes and columns are predicted classes; label the axes so
# the figure can be read on its own.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix", fontdict={"fontsize": 20})
plt.show()

## Inference

In [None]:
# Run the tagger over the test split and print each sentence with the entity
# spans it predicts.
# Predictions are stored under a separate label name: by default `predict`
# writes to the tagger's own label type ("ner"), which would replace the gold
# annotations of `corpus.test` and corrupt any evaluation re-run afterwards.
# NOTE(review): this prints the whole test split of a restricted dataset;
# clear the cell output before sharing the notebook.
predicted_label_type = "predicted"

for sentence in corpus.test:
    tagger.predict(sentence, label_name=predicted_label_type)
    print(sentence)
    print()
    # print the predicted entity spans
    for entity in sentence.get_spans(predicted_label_type):
        print(entity)
    print("=" * 5)