<a name="setup"></a>
# Initial Set-Up

Select the correct working directory to save the models depending on the platform we are working on:

In [None]:
import os

# Mount Google Drive when running on Colab. Only a missing `google.colab`
# package means "not on Colab"; a failed mount is reported instead of being
# silently swallowed, but stays non-fatal (the checks below fall back).
try:
  from google.colab import drive
except ImportError:
  pass
else:
  try:
    drive.mount('/content/drive')
  except Exception as error:
    print(f'Could not mount Google Drive: {error}')

# Pick the directory where the models are saved, depending on the platform.
if os.path.isdir('/content/drive'): # Google Drive
  save_dir = '/content/drive/MyDrive/models'
  platform = 'colab'
elif os.path.isdir('/kaggle/working'): # Kaggle (absolute path, independent of the cwd)
  save_dir = '/kaggle/working'
  platform = 'kaggle'
else: # Others
  save_dir = './'
  platform = 'other'

In [None]:
!pip install flair==0.10

# PoS Tagger

In [None]:
from flair.datasets import UD_BASQUE
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

## Opening the Corpus

We can check the format of the universal dependencies PoS tags on [UniversalDependencies/UD_Basque-BDT](https://github.com/UniversalDependencies/UD_Basque-BDT) GitHub repository.

Flair already comes with a helper function to load the dataset:

In [None]:
corpus = UD_BASQUE()
print(corpus)

In [None]:
# 2. what label do we want to predict?
label_type = 'upos'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

## Corpus Analysis

We are going to inspect the corpus before using it.

We can get some global statistics first:

In [None]:
print(corpus.obtain_statistics())

Get the label distribution for UPOS tags on the different splits:

In [None]:
from collections import defaultdict

# Based on Corpus.get_label_distribution() method (fixed):
def get_label_distribution(sentences, label_type):
    """Count how many times each label value occurs in ``sentences``.

    Args:
        sentences: iterable of objects exposing ``get_labels(label_type)``.
        label_type: name of the annotation layer to count, e.g. ``'upos'``.

    Returns:
        A ``defaultdict`` mapping each label value to its number of
        occurrences; values that were never seen read as 0.
    """
    class_to_count = defaultdict(int)
    for sentence in sentences:
        for label in sentence.get_labels(label_type):
            class_to_count[label.value] += 1
    return class_to_count

distro_train = get_label_distribution(corpus.train, 'upos')
distro_dev = get_label_distribution(corpus.dev, 'upos')
distro_test = get_label_distribution(corpus.test, 'upos')
print(distro_train)

And plot them together:

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Merge the labels of all the splits. Sorting gives a stable order on the
# x axis (a plain set has no defined order).
labels = sorted(set(distro_train) | set(distro_dev) | set(distro_test))

x = np.arange(len(labels))  # the label locations
width = 0.25  # the width of the bars

# Look every count up by label, so the three series are aligned with the tick
# labels and have the same length even when a split lacks some label (it then
# counts as 0). `.get` avoids inserting missing keys into the dictionaries.
y1 = np.array([distro_train.get(label, 0) for label in labels])
y2 = np.array([distro_dev.get(label, 0) for label in labels])
y3 = np.array([distro_test.get(label, 0) for label in labels])

# Percentage:
y1 = y1 / y1.sum() * 100
y2 = y2 / y2.sum() * 100
y3 = y3 / y3.sum() * 100

fig, ax = plt.subplots(figsize=(2*5, 3)) # reduce width to 5 for the SVG
rects1 = ax.bar(x - width, y1, width, label='train')
rects2 = ax.bar(x,         y2, width, label='dev')
rects3 = ax.bar(x + width, y3, width, label='test')

ax.set_ylabel('Percentage')
ax.set_xlabel('Label')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Rotate the tick labels before tight_layout() so the layout accounts for them.
plt.xticks(rotation=90)
fig.tight_layout()
# plt.savefig('/kaggle/working/upos_label_distro.svg')
plt.show()

## Loading Embeddings

This code loads the embedding we will use to train all the models below.

In [None]:
# 4. initialize embeddings
embedding_types = [
    WordEmbeddings('eu'),
    FlairEmbeddings('eu-forward'),
    FlairEmbeddings('eu-backward'),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

## Training

We train the initial universal PoS model here, using a sequence tagger style model:

In [None]:
# 5. initialize sequence tagger
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True)

In [None]:
# 6. initialize trainer
trainer = ModelTrainer(tagger, corpus)

We start the training process now.

This will take around 45 minutes to train in Colab.

In [None]:
%%time
# 7. start training
trainer.train(f'{save_dir}/basque_upos',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=200)

## Plot Training Curves

Here we will use a modified version of the [`plot_training_curves`](https://github.com/flairNLP/flair/blob/v0.10/flair/visual/training_curves.py#L170-L210) method from the `Plotter` class in flair. We just added some tweaks to draw the plot with a different size and to output it in SVG format.

Internally, we use the [`Plotter._extract_evaluation_data()`](https://github.com/flairNLP/flair/blob/v0.10/flair/visual/training_curves.py#L30-L79) function that reads the training results on each epoch from the **loss.tsv** file.

In [None]:
from pathlib import Path
from typing import Dict, List, Union
import numpy as np
import matplotlib.pyplot as plt
from flair.visual.training_curves import Plotter, Path

def plot_training_curves_svg(
    file_name, plot_values=('loss', 'F1'), size=(5, 7), xlim=(1, None)
):
    """Generates a plot in SVG format and smaller, useful for the paper.

    One subplot is drawn per entry of ``plot_values``, each with the
    training, validation and test curves that are available. The figure is
    saved as ``training.svg`` next to ``file_name`` and then closed (it is
    not shown).

    Args:
        file_name: path to the ``loss.tsv`` file of a training run.
        plot_values: names of the scores to plot, one subplot each.
        size: figure size passed to ``plt.figure(figsize=...)``.
        xlim: limits of the x axis (epochs), passed to ``plt.xlim``.
    """
    file_name = Path(file_name)
    # Split key in the extracted data -> prefix used in the legend.
    legend_prefixes = {'train': 'training', 'dev': 'validation', 'test': 'test'}

    fig = plt.figure(figsize=size)

    for plot_no, plot_value in enumerate(plot_values):

        training_curves = Plotter._extract_evaluation_data(file_name, plot_value)

        plt.subplot(len(plot_values), 1, plot_no + 1)
        for split, prefix in legend_prefixes.items():
            split_scores = training_curves[split]['score']
            if split_scores:
                # Epochs are numbered from 1 on the x axis.
                epoch_numbers = np.arange(1, len(split_scores) + 1)
                plt.plot(epoch_numbers, split_scores, label=f'{prefix} {plot_value}')
        # Position legend inside the plot on places that usually do not overlap
        if plot_value == 'loss': # this plot usually goes decreasing
            plt.legend(bbox_to_anchor=(1.04, 1.0), loc='upper right', borderaxespad=3)
        else:
            plt.legend(bbox_to_anchor=(1.04, 0), loc='lower right', borderaxespad=3)
        plt.ylabel(plot_value)
        plt.xlabel('epochs')
        plt.xlim(xlim)

    # save plots
    path = file_name.parent / 'training.svg'
    fig.savefig(path)
    print(f'Loss and F1 plots are saved in {path}')
    # plt.show(block=False) # no need to show, just save it
    plt.close(fig)

We create the plots using the function above. We first show the plot here in the notebook. Then, we save it in SVG format with a more appropriate size to be added into an article.

In [None]:
import matplotlib.pyplot as plt
from flair.visual.training_curves import Plotter

plotter = Plotter()
plotter.plot_training_curves(f'{save_dir}/basque_upos/loss.tsv')
# Plot a more friendly size for the paper:
plot_training_curves_svg(f'{save_dir}/basque_upos/loss.tsv')

Looking at the training curve, after 25-30 epochs, there is not much improvement. Later, around 50 epochs, the learning curve becomes very stable and apparently there is no more progress.

## Testing

In this section, we just test a sentence in our recently trained universal PoS tagger model.

In [None]:
from flair.data import Sentence

In [None]:
# load the model you trained
model = SequenceTagger.load(f'{save_dir}/basque_upos/final-model.pt')

# create example sentence
sentence = Sentence('Familian, aldiz, ez da inolako hazkunderik antzematen euskararen erabileran, elebidun gazteenen gurasoak erdaldunak direlako oraindik.')

# predict tags and print
model.predict(sentence)

print(sentence.to_tagged_string())

## Memory Cleaning

Remove the model from the memory to free the GPU:

In [None]:
import gc
import torch

# Memory cleaning of the models in the tests above: drop the trainer, the
# tagger that was trained and the `model` reloaded in the Testing section.
# pop() with a default is a no-op for names that were never defined (e.g.
# when a section was skipped).
for name in ('trainer', 'tagger', 'model'):
  globals().pop(name, None)
# Collect the now-unreferenced objects first, then release the cached GPU
# memory held by PyTorch.
gc.collect()
torch.cuda.empty_cache()

# Multi-label Classification (2-label)

The goal here is to train models to predict multiple labels together, like upos+case+number.

Let us see how we can merge different tags in the same corpus in flair — for example, PoS and number tags. To know how to do this, we can check the code of the [`DataPoint`](https://github.com/flairNLP/flair/blob/v0.10/flair/data.py#L265-L329) class. This is a parent class of `Token` and implements some methods like [`get_labels()`](https://github.com/flairNLP/flair/blob/v0.10/flair/data.py#L318-L322) and [`add_label()`](https://github.com/flairNLP/flair/blob/v0.10/flair/data.py#L289-L296) that will be helpful to do our task.

In [None]:
labels = ['upos', 'number']
merged_type = '+'.join(labels)  # name of the new label type: 'upos+number'

for partition in [corpus.train, corpus.dev, corpus.test]:
  for sentence in partition:
    for token in sentence:
      annotations = [token.get_labels(part) for part in labels]
      # get_labels() returns an empty list when the token has no annotation
      # of that type; such tokens get no merged label.
      if not all(annotations):
        continue
      # Merge the label *values* (as done for 'upos' and 'lemma' elsewhere in
      # this notebook).
      # NOTE(review): formatting the Label object itself presumably also embeds
      # its score in the text (e.g. 'NOUN (1.0)') -- confirm with flair's Label.
      merged_value = '+'.join(found[0].value for found in annotations)
      token.add_label(merged_type, merged_value)

We can check if it worked:

In [None]:
corpus.make_label_dictionary(label_type='upos+number')

## Generate Lemma Rules

Using lemma tags directly for training will not work. It would generate a dictionary that is too big, due to the enormous number of different label values for the lemma, and this would exhaust all the GPU memory. Apparently, there is still no official solution in Flair to train with the `lemma` label ([flair#1319](https://github.com/flairNLP/flair/issues/1319), [flair#2197](https://github.com/flairNLP/flair/issues/2197)).

To solve this, we can generate a column with the minimum script edits to change from the origin token to the lemma. For that, we can use the script from [CoNLL-UD-2018/UDPipe-Future](https://github.com/CoNLL-UD-2018/UDPipe-Future).

In [None]:
!wget https://raw.githubusercontent.com/CoNLL-UD-2018/UDPipe-Future/master/ud_dataset.py -O ud_dataset.py 

Let us see how the ud_dataset.py script works:

In [None]:
import ud_dataset

word = 'abantail'
lemma = 'abantaila'
allow_copy = False
ud_dataset._gen_lemma_rule(word, lemma, allow_copy)

We can now create a lemma_rule label type on our corpus using this function.

In [None]:
from tqdm.notebook import tqdm
import ud_dataset

allow_copy = False

# Add a 'lemma_rule' label to every token of every split: the edit rule that
# ud_dataset generates to turn the token text into its lemma.
for partition in tqdm([corpus.train, corpus.dev, corpus.test], leave=False):
  for sentence in tqdm(partition, leave=False):
    for token in sentence:
      lemmas = token.get_labels('lemma')
      # get_labels() returns an empty list when there is no annotation; skip
      # such tokens (like the label merging loops do) instead of failing.
      if not lemmas:
        continue
      word = token.text
      lemma = lemmas[0].value
      lemma_rule = ud_dataset._gen_lemma_rule(word, lemma, allow_copy)
      token.add_label('lemma_rule', lemma_rule)

## Select the Labels

An interesting approach would be to get the list of labels of the corpus and choose the most interesting ones, considering the number of times the label is present in the corpus. Some labels can be present only in a few examples and are not worth it.

To get the number of labels in a corpus using flair, we can use a code based on the [`make_label_dictionary()`](https://github.com/flairNLP/flair/blob/v0.10/flair/data.py#L1382-L1446) method from the `Corpus` class.

In [None]:
from collections import Counter
import typing
from torch.utils.data.dataset import ConcatDataset
from flair.datasets import DataLoader

# Count how many items of the training split carry each annotation layer.
# NOTE(review): corpus.train is handed to ConcatDataset directly instead of
# being wrapped in a list; presumably each sentence is then treated as a
# dataset of its tokens, so `sentence` below would actually be a token --
# confirm whether the counts are per token or per sentence.
data = ConcatDataset(corpus.train)
all_label_types = Counter()
# batch_size=1: x[0] unwraps the single item of each batch.
for sentence in map(lambda x: x[0], DataLoader(data, batch_size=1, num_workers=0)):
  all_label_types.update(sentence.annotation_layers.keys())

# List of (label type, count) tuples, most frequent first.
labels = all_label_types.most_common() # sort by frequency
print(labels)

From all the labels, we will select some interesting labels based on their occurrence:

In [None]:
min_count = 10000  # a label type must occur more often than this to be kept

# "lemma" would generate too many possible values, and "pos" is filled all
# with "_", so both are always ignored.
ignored_types = ('lemma', 'pos')

nice_labels = [
    type_name
    for type_name, occurrences in labels
    if type_name not in ignored_types
    and occurrences > min_count
    # Skip sub-tags (written as [sth]) and the multi-labels generated below
    # (joined with +).
    and not any(marker in type_name for marker in '+[')
]

print("\n".join(nice_labels))

## Generate Multi-label Corpus

Now, to generate the multi-label, we can group the PoS label with the other interesting labels found. The following code will generate a variable called `label_pairs` with some interesting tags to merge and try:

In [None]:
import itertools

pos_labels = ['upos']
# At first we added "pos" here, later we realized that column is just empty.
# We leave the possibility to add it in case in the future somebody adds them.

# Keep the (frequency) order of nice_labels instead of going through a set:
# set order changes between runs, which would shuffle the training order, the
# result tables and the plot legends.
nopos_labels = [label for label in nice_labels if label not in pos_labels]

# Every PoS label paired with every non-PoS label, e.g. ('upos', 'number').
label_pairs = list(itertools.product(pos_labels, nopos_labels))

print(label_pairs)

Generating all these labels in the corpus, based on the code above:

In [None]:
from tqdm.notebook import tqdm

for label_pair in tqdm(label_pairs, leave=False, position=0):
  merged_type = '+'.join(label_pair)  # e.g. 'upos+number'
  for partition in tqdm([corpus.train, corpus.dev, corpus.test], leave=False, position=1):
    for sentence in tqdm(partition, leave=False, position=2):
      for token in sentence:
        annotations = [token.get_labels(part) for part in label_pair]
        # get_labels() returns an empty list when the token has no annotation
        # of that type; such tokens get no merged label.
        if not all(annotations):
          continue
        # Merge the label *values* (as done for 'upos' and 'lemma' elsewhere
        # in this notebook).
        # NOTE(review): formatting the Label object itself presumably also
        # embeds its score in the text (e.g. 'NOUN (1.0)') -- confirm with
        # flair's Label.
        merged_value = '+'.join(found[0].value for found in annotations)
        token.add_label(merged_type, merged_value)

## PoS Multi-label Training

We will create two helper functions to do the training and plotting.

In [None]:
import matplotlib.pyplot as plt
from flair.visual.training_curves import Plotter
import pandas as pd

def train_sequence_tagger(save_dir, corpus, embeddings, label_type, epochs=200):
  """Build a CRF sequence tagger for ``label_type`` and train it on ``corpus``.

  The label dictionary is created from the corpus, and the training output is
  written to ``{save_dir}/basque_{label_type}``. Nothing is returned.
  """
  tagger = SequenceTagger(
      hidden_size=256,
      embeddings=embeddings,
      tag_dictionary=corpus.make_label_dictionary(label_type=label_type),
      tag_type=label_type,
      use_crf=True,
  )
  ModelTrainer(tagger, corpus).train(
      f'{save_dir}/basque_{label_type}',
      learning_rate=0.1,
      mini_batch_size=32,
      max_epochs=epochs,
  )

def plot_sequence_tagger(save_dir, label_type):
  """Plot the training curves of the ``label_type`` model in two sizes.

  Both plots are drawn from ``{save_dir}/basque_{label_type}/loss.tsv``:
  first with flair's own Plotter, for inspection, and then with
  plot_training_curves_svg(), which saves a smaller SVG for the paper.
  """
  loss_file = f'{save_dir}/basque_{label_type}/loss.tsv'
  # Big plot, for inspection here:
  Plotter().plot_training_curves(loss_file)
  # Smaller, more paper-friendly plot:
  plot_training_curves_svg(loss_file)

def trained_latest_epoch(label_type, base_dir=None):
    """Returns the last epoch the model has been trained on.

    Args:
        label_type: label type of the model, as used in its directory name
            (``basque_{label_type}``).
        base_dir: directory that contains the model directories. Defaults to
            the global ``save_dir``.

    Returns:
        The ``EPOCH`` value of the last row of the model's ``loss.tsv``, or
        -1 when that file is missing, empty or has no rows yet.
    """
    root = save_dir if base_dir is None else base_dir
    file_name = f'{root}/basque_{label_type}/loss.tsv'
    if not os.path.exists(file_name) or os.stat(file_name).st_size == 0:
        return -1
    df = pd.read_csv(file_name, delimiter="\t")
    # A file with only the header row has no epoch to report.
    if df.empty:
        return -1
    return df.iloc[-1]['EPOCH']

We showed above that, for the PoS tagging label, we did not get much improvement after 25-30 epochs. So let us train against these combined labels for just 25 epochs.

Now we will train multiple models with different multi-label pairs and plot them. This will also generate the loss and plots inside the model directory for later inspection.

Note: *Approximate running time in Colab: 3 hours*

In [None]:
%%time
from tqdm.notebook import tqdm

epochs = 25

separator = '====================================================================='

# Train every 2-label model that has not reached `epochs` epochs yet, and
# plot the training curves of all of them:
for label_pair in tqdm(label_pairs):
  label_type = '+'.join(label_pair)
  print(separator, f'Label type: {label_type}', separator, sep='\n')
  already_trained = trained_latest_epoch(label_type) >= epochs
  if already_trained:
    print('Skipping, already trained.')
  else:
    train_sequence_tagger(save_dir, corpus, embeddings, label_type, epochs=epochs)
  plot_sequence_tagger(save_dir, label_type)

## Plot Multi-label Training Curves

We can plot the pair label results and compare them.

Based on the plotting function above, we can create a new plotting function including different label results together on the same plot.

In [None]:
import math
import os
from pathlib import Path
from typing import Dict, List, Union
import numpy as np
import matplotlib.pyplot as plt
from flair.visual.training_curves import Plotter, Path

def plot_label_pairs_curve(
    label_pairs, name, plot_values=('loss', 'F1'), format='png',
    xlim=(0, None), ls=None
):
  """Plot the validation curves of several models together on the same plot.

  For every entry of ``label_pairs`` the ``loss.tsv`` file found in
  ``{save_dir}/basque_{label name}`` is read; models without that file are
  skipped. One subplot is drawn per entry of ``plot_values``, and the figure
  is saved as ``{save_dir}/{name}.{format}``.

  Args:
    label_pairs: label names, either as strings (``'upos+number'``) or as
      sequences of label names that are joined with ``+``.
    name: file name (without extension) of the saved figure.
    plot_values: names of the scores to plot, one subplot each.
    format: ``'svg'`` for a small, paper-sized figure; anything else is
      treated as PNG (bigger, saved at 300 dpi and also shown).
    xlim: limits of the x axis (epochs), passed to ``plt.xlim``.
    ls: line style for all the curves. When None, single labels are drawn
      dashed and the rest solid.
  """
  # Paper-friendly size for the SVG, bigger for on-screen inspection (PNG):
  size = (5, 7) if format == 'svg' else (15, 10)

  fig = plt.figure(figsize=size)

  for label_pair in label_pairs: # [pos, number], [pos, dependency], ...
    # Create the label name merging the feature names, and find out whether
    # this is a single label (like pos only):
    if isinstance(label_pair, str):
      label_name = label_pair
      is_single = not ('+' in label_pair or '_' in label_pair)
    else: # list or tuple of label names
      label_name = '+'.join(label_pair)
      is_single = len(label_pair) == 1
    # Line style setup: single labels will be dashed unless `ls` says otherwise
    if ls is not None:
      linestyle = ls
    else:
      linestyle = '--' if is_single else '-'
    # Open the file with the training results if it exists
    file_name = Path(f'{save_dir}/basque_{label_name}/loss.tsv')
    if not os.path.exists(file_name):
      continue
    # We create a different plot for each score type:
    for plot_no, plot_value in enumerate(plot_values): # loss, F1, ...
      training_curves = Plotter._extract_evaluation_data(file_name, plot_value)
      plt.subplot(len(plot_values), 1, plot_no + 1) # position the plot
      dev_scores = training_curves['dev']['score']
      # Epochs are numbered from 1 on the x axis.
      plt.plot(np.arange(1, len(dev_scores) + 1), dev_scores, label=label_name, ls=linestyle)
      # Only the non-loss plots get a legend, placed below the plot:
      if plot_value != 'loss':
        ncol = 2 if format == 'svg' else len(label_pairs)
        plt.legend(bbox_to_anchor=(0.5, -0.05), loc='upper center', borderaxespad=3, ncol=ncol)
      plt.ylabel(plot_value)
      plt.xlabel('epochs')
      plt.xlim(xlim)
  # Save the plots. The path is built from save_dir directly, so it does not
  # depend on the loop above having run at least once.
  path = Path(save_dir) / f'{name}.{format}'
  print(f'Loss and F1 plots are saved in {path}')
  if format == 'svg':
    plt.savefig(path)
  else:
    plt.savefig(path, dpi=300)
    plt.show(block=False)
  plt.close(fig)

In [None]:
label_types = [pos_labels] + label_pairs

plot_label_pairs_curve(label_types, 'label_pairs', format='png', xlim=[1, epochs])
plot_label_pairs_curve(label_types, 'label_pairs', format='svg', xlim=[1, epochs])

## Non-PoS Labels Training

We can also train models with the non-pos labels separately (number, dependency, ...), just for comparison later with the PoS multi-label results

Note: *Approximate running time in Colab: 2h 30min*

In [None]:
%%time

separator = '====================================================================='

# Train every single-label (non-PoS) model that has not reached `epochs`
# epochs yet, and plot the training curves of all of them:
for label_type in tqdm(nopos_labels):
  print(separator, f'Label type: {label_type}', separator, sep='\n')
  already_trained = trained_latest_epoch(label_type) >= epochs
  if already_trained:
    print('Skipping, already trained.')
  else:
    train_sequence_tagger(save_dir, corpus, embeddings, label_type, epochs=epochs)
  plot_sequence_tagger(save_dir, label_type)

## Plot Non-PoS Training Curves

Create a plot that includes all the single label models just trained, using the function created above:

In [None]:
label_types = pos_labels + nopos_labels

plot_label_pairs_curve(
    label_types, 'label_types', format='png', xlim=[1, epochs], ls='-'
)
plot_label_pairs_curve(
    label_types, 'label_types', format='svg', xlim=[1, epochs], ls='-'
)

## Multi-label Results

Here we will extract the different scores of the final models, and also the models' best scores during the whole training. The scores are read directly from the `loss.tsv` files generated during the training.

In [None]:
from collections import defaultdict
import pandas as pd

values = ['Loss', 'Precision', 'Recall', 'F1', 'Accuracy']
indexes = nopos_labels + pos_labels + ['+'.join(lp) for lp in label_pairs]

rows_final = []
rows_best = []
for label_name in indexes:
  # Read the training log of each model once (not once per score).
  file_name = f'{save_dir}/basque_{label_name}/loss.tsv'
  df = pd.read_csv(file_name, delimiter="\t")
  row_final = {}
  row_best = {}
  for value in values:
    column = df[f'DEV_{value.upper()}']
    row_final[value] = column.iloc[-1] # final result
    # The best loss is the lowest one; for the other scores, the highest.
    row_best[value] = column.min() if value == 'Loss' else column.max()
  rows_final.append(row_final)
  rows_best.append(row_best)

display(pd.DataFrame(rows_final, index=pd.Series(indexes, name='Final Result')))
display(pd.DataFrame(rows_best, index=pd.Series(indexes, name='Best Result')))

# 3-label Classification

To merge three features together, we will get combinations of length 2 with the UPOS label to generate 3-label tags.

In [None]:
import itertools

# Every pair of non-PoS labels, prefixed with each PoS label:
# [pos, label_a, label_b].
label_trios = [
    [pos_label, *pair]
    for pair in itertools.combinations(nopos_labels, 2)
    for pos_label in pos_labels
]

print(label_trios)

## Generate the 3-label Corpus

We generate the labels in the corpus with a similar code as above, just adding the third label type:

In [None]:
from tqdm.notebook import tqdm

for label_trio in tqdm(label_trios, leave=False, position=0):
  merged_type = '+'.join(label_trio)  # e.g. 'upos+case+number'
  for partition in tqdm([corpus.train, corpus.dev, corpus.test], leave=False, position=1):
    for sentence in tqdm(partition, leave=False, position=2):
      for token in sentence:
        annotations = [token.get_labels(part) for part in label_trio]
        # get_labels() returns an empty list when the token has no annotation
        # of that type; such tokens get no merged label.
        if not all(annotations):
          continue
        # Merge the three label *values* (as done for 'upos' and 'lemma'
        # elsewhere in this notebook).
        # NOTE(review): formatting the Label object itself presumably also
        # embeds its score in the text (e.g. 'NOUN (1.0)') -- confirm with
        # flair's Label.
        merged_value = '+'.join(found[0].value for found in annotations)
        token.add_label(merged_type, merged_value)

## PoS 3-label Training

Here we have disabled `lemma_rule` in the 3-label training runs on Colab because we do not have enough GPU memory for them: when it is merged with the other two labels, the required memory is higher than what Google Colab provides.

*Approximate running time in Colab: 2 hours*

In [None]:
%%time
from tqdm.notebook import tqdm

epochs = 25

separator = '====================================================================='

# Train every 3-label model that has not reached `epochs` epochs yet, and
# plot the training curves of all of them:
for label_trio in tqdm(label_trios):
  label_type = '+'.join(label_trio)
  needs_too_much_memory = platform == 'colab' and 'lemma_rule' in label_type
  if needs_too_much_memory:
    print('Skipping, not enough GPU Memory in Colab.')
    continue # Avoid CUDA out of memory error
  print(separator, f'Label type: {label_type}', separator, sep='\n')
  already_trained = trained_latest_epoch(label_type) >= epochs
  if already_trained:
    print('Skipping, already trained.')
  else:
    train_sequence_tagger(save_dir, corpus, embeddings, label_type, epochs=epochs)
  plot_sequence_tagger(save_dir, label_type)

## Plot PoS 3-label Training Curves

We can plot the training curves using the same functions as above:

In [None]:
# label_types = [pos_labels] + label_trios
label_types = label_trios

plot_label_pairs_curve(label_types, 'label_trios', format='png', xlim=[1, epochs])
plot_label_pairs_curve(label_types, 'label_trios', format='svg', xlim=[1, epochs])

## 3-label Results

Here we will extract the different scores of the final models and best models again, but now including the recently trained three feature models.

In [None]:
import os
from collections import defaultdict
import pandas as pd

values = ['Loss', 'Precision', 'Recall', 'F1', 'Accuracy']
indexes = nopos_labels + pos_labels + ['+'.join(lp) for lp in label_pairs + label_trios]

rows_final = []
rows_best = []
final_indexes = []
for label_name in indexes:
  file_name = f'{save_dir}/basque_{label_name}/loss.tsv'
  # Models without training results are left out of the tables.
  if not os.path.exists(file_name):
    continue
  final_indexes.append(label_name)
  # Read the training log of each model once (not once per score).
  df = pd.read_csv(file_name, delimiter="\t")
  row_final = {}
  row_best = {}
  for value in values:
    column = df[f'DEV_{value.upper()}']
    row_final[value] = column.iloc[-1] # final result
    # The best loss is the lowest one; for the other scores, the highest.
    row_best[value] = column.min() if value == 'Loss' else column.max()
  rows_final.append(row_final)
  rows_best.append(row_best)
# From here on, `indexes` only lists the models that have results.
indexes = final_indexes

final_df = pd.DataFrame(rows_final, index=pd.Series(indexes, name='Final Result'))
display(final_df)
best_df = pd.DataFrame(rows_best, index=pd.Series(indexes, name='Best Result'))
display(best_df)

It may be interesting to get the number of annotations per label too. This can be easily calculated by looping through all the sentence annotations:

In [None]:
from collections import Counter
import typing
from torch.utils.data.dataset import ConcatDataset
from flair.datasets import DataLoader

# Count how many items of the training split carry each annotation layer.
# NOTE(review): corpus.train is handed to ConcatDataset directly instead of
# being wrapped in a list; presumably each sentence is then treated as a
# dataset of its tokens, so `sentence` below would actually be a token --
# confirm whether the counts are per token or per sentence.
data = ConcatDataset(corpus.train)
all_label_types = Counter()
# batch_size=1: x[0] unwraps the single item of each batch.
for sentence in map(lambda x: x[0], DataLoader(data, batch_size=1, num_workers=0)):
  all_label_types.update(sentence.annotation_layers.keys())

# Only print the label types that appear in `indexes`.
for label_type, count in dict(all_label_types).items():
  if label_type in indexes:
    print(f'{label_type}: {count}')

# Final Results

With the results above, we can calculate how adding each of the features affected the models, both in 2-label and 3-label type models. We are interested in getting the average change, and the standard deviation to check for stability.

In [None]:
import os
from collections import defaultdict
import numpy as np
import pandas as pd

# Loss is left out: only the scores where higher is better are compared.
scores = ['Precision', 'Recall', 'F1', 'Accuracy']

# Get the differences in each score when adding each label_type for 2-label
# models: score of every 2-label model containing the label, minus the score
# of the single-label model.
label2_diff = defaultdict(dict)
for score in scores:
  for label in nopos_labels:
    label1_result = final_df[score][label]
    label2_results = [
        final_df[score][label2]
        for label2 in indexes
        # Compare whole label names: a substring test could also match a
        # label whose name merely contains this one.
        if label2.count('+') == 1 and label in label2.split('+')
    ]
    label2_diff[score][label] = np.array(label2_results) - label1_result
print('2-label differences:')
for score, score_values in label2_diff.items():
  for label, values in score_values.items():
    print(f'  {label} {score}: {values}')

# Get the differences in each score when adding each label_type for 3-label
# models: score of the 3-label model, minus the score of the 2-label model it
# extends with exactly one more label.
two_label_names = [i for i in indexes if i.count('+') == 1]
three_label_names = [i for i in indexes if i.count('+') == 2]
label3_diff = defaultdict(lambda: defaultdict(list))
for score in scores:
  for label2 in two_label_names:
    label2_list = label2.split('+')
    for label3 in three_label_names:
      label3_list = label3.split('+')
      label_diff = list(set(label3_list) - set(label2_list))
      if len(label_diff) != 1:
        continue
      label1 = label_diff[0]
      label3_diff[score][label1].append(final_df[score][label3] - final_df[score][label2])
print('3-label differences:')
for score, score_values in label3_diff.items():
  for label, values in score_values.items():
    print(f'  {label} {score}: {values}')

# Generate the table from label2_diff and label3_diff (mean±std, in
# percentage points)
results_diff = defaultdict(dict)
for score in scores:
  for label in nopos_labels:
    for kind, diffs in (('2-label', label2_diff), ('3-label', label3_diff)):
      values = np.array(diffs[score][label]) * 100
      if values.size == 0:
        # No model of this kind was trained with this label (e.g. the
        # lemma_rule trios skipped on Colab): there is nothing to average.
        results_diff[label][f'{score}:{kind}'] = 'n/a'
      else:
        results_diff[label][f'{score}:{kind}'] = f'{values.mean().round(4)}±{values.std().round(4)}'

display(pd.DataFrame(results_diff).T)

# Testing Sentences

We will test a sentence in all the models in this last section.

For more information about the CoNLL-U format: https://universaldependencies.org/format.html

In order to use this code, we need to run the [Initial Set-Up](#setup) section first and maybe adjust the `save_dir` variable to point to the correct path with the models.

In [None]:
import glob
import re
from tqdm.notebook import tqdm
from flair.models import SequenceTagger
from flair.data import Sentence

example = 'Familian, aldiz, ez da inolako hazkunderik antzematen euskararen erabileran, elebidun gazteenen gurasoak erdaldunak direlako oraindik.'

# Sorted so the models are always reported in the same order.
dirs = sorted(glob.glob(f'{save_dir}/basque_*'))

for path in tqdm(dirs):
    # Model name: whatever follows "basque_" in the path.
    name = re.sub(r'^.*basque_(.*)$', r'\1', path)
    model_file = f'{path}/final-model.pt'
    # Skip entries without a final model (e.g. an interrupted training)
    # instead of aborting the whole loop.
    if not os.path.isfile(model_file):
        print(f'Model: {name}')
        print('Skipping, no final-model.pt found.')
        print("\n")
        continue
    # load the model you trained
    model = SequenceTagger.load(model_file)
    # create example sentence
    sentence = Sentence(example)
    # predict tags and print
    model.predict(sentence)
    # Print tagged sentence:
    print(f'Model: {name}')
    print(sentence.to_tagged_string())
    print("\n")