In [1]:
%load_ext autoreload
%autoreload 2

In [7]:
import os

os.chdir("..")

from embeddings.data.data_loader import HuggingFaceDataLoader
from embeddings.defaults import DATASET_PATH, RESULTS_PATH
from embeddings.embedding.auto_flair import AutoFlairWordEmbedding
from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator
from embeddings.model.flair_model import FlairModel
from embeddings.pipeline.standard_pipeline import StandardPipeline
from embeddings.task.flair_task.sequence_labeling import SequenceLabeling
from embeddings.transformation.flair_transformation.column_corpus_transformation import (
    ColumnCorpusTransformation,
)
from embeddings.data.dataset import Dataset

from embeddings.transformation.flair_transformation.downsample_corpus_transformation import (
    DownsampleFlairCorpusTransformation,
)
from embeddings.transformation.flair_transformation.split_sample_corpus_transformation import (
    SampleSplitsFlairCorpusTransformation,
)
from embeddings.utils.utils import build_output_path

### Run downsampled flair pipeline

In [10]:
embedding_name_or_path = "allegro/herbert-base-cased"
dataset_name = "clarin-pl/kpwr-ner"

output_path = build_output_path(RESULTS_PATH, embedding_name_or_path, dataset_name)

dataset = Dataset(dataset_name)
data_loader = HuggingFaceDataLoader()
transformation = (
    ColumnCorpusTransformation("tokens", "ner")
    .then(SampleSplitsFlairCorpusTransformation(dev_fraction=0.1, seed=441))
    .then(DownsampleFlairCorpusTransformation(downsample_train=0.005, downsample_dev=0.01, downsample_test=0.01))
)
task = SequenceLabeling(
    output_path,
    hidden_size=256,
    task_train_kwargs={"max_epochs": 1, "mini_batch_size": 64},
)
embedding = AutoFlairWordEmbedding.from_hub(embedding_name_or_path)
model = FlairModel(embedding, task)
evaluator = SequenceLabelingEvaluator()

pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator)



In [11]:
_ = pipeline.run()

Using custom data configuration default
Reusing dataset kpwrner (/home/djaniak/.cache/huggingface/datasets/clarin-pl___kpwrner/default/0.0.0/001e3d471298007e8412e3a6ccc06bec000dec1bce0cf8e0ba7e5b7e105b1342)


  0%|          | 0/2 [00:00<?, ?it/s]

2022-04-08 00:41:43,175 - embeddings.transformation.flair_transformation.corpus_transformation - INFO - Info of ['train', 'test']:
{'builder_name': 'kpwrner',
 'citation': '',
 'config_name': 'default',
 'dataset_size': 13212646,
 'description': 'KPWR-NER tagging dataset.',
 'download_checksums': {'https://huggingface.co/datasets/clarin-pl/kpwr-ner/resolve/main/data/kpwr-ner-n82-test.iob': {'checksum': '7b86fd227605b7e5f807eedbcd87573271d8adb86cfddf56c763b1751e71a924',
                                                                                                                       'num_bytes': 2247780},
                        'https://huggingface.co/datasets/clarin-pl/kpwr-ner/resolve/main/data/kpwr-ner-n82-train-tune.iob': {'checksum': '7ab673f299b3a9e875c2c46ef1051807d98f923f0356d0be78556c832481efea',
                                                                                                                             'num_bytes': 6719818}},
 'download_size': 8967598,
 'f

2022-04-08 00:41:49,439 ----------------------------------------------------------------------------------------------------
2022-04-08 00:41:49,442 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(50000, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): D

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Load model from checkpoint

In [12]:
!ls $output_path

dev.tsv  final-model.pt  loss.tsv  test.tsv  training.log  weights.txt


In [13]:
output_path

PosixPath('/home/djaniak/Projects/embeddings/resources/results/allegro__herbert-base-cased/clarin-pl__kpwr-ner/20220408_004127')

In [26]:
from flair.models import SequenceTagger

task_from_ckpt = SequenceLabeling.from_checkpoint(checkpoint_path=(output_path / "final-model.pt"), output_path=output_path)
# trained_model = SequenceTagger.load(output_path / "final-model.pt")

2022-04-08 00:55:58,980 loading file /home/djaniak/Projects/embeddings/resources/results/allegro__herbert-base-cased/clarin-pl__kpwr-ner/20220408_004127/final-model.pt


TypeError: __init__() missing 1 required positional argument: 'hidden_size'

### Predict for test data

In [None]:
loaded_data = data_loader.load(dataset)
transformed_data = transformation.transform(loaded_data)
test_data = transformed_data.test

In [None]:
task.remove_labels_from_data(test_data, "predicted")

loss = trained_model.predict(
    sentences=test_data, mini_batch_size=64, label_name="predicted", return_loss=True,
)

y_pred = task.get_y(test_data, y_type="predicted", y_dictionary=task.y_dictionary)
y_true = task.get_y(test_data, task.y_type, task.y_dictionary)

task.remove_labels_from_data(test_data, "predicted")

In [None]:
_ = evaluator.evaluate({"y_pred": y_pred, "y_true": y_true})