In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

# Move the working directory one level up before the project imports below.
# NOTE(review): the path is relative, so re-running this cell climbs one more
# level each time; presumably the notebook lives one directory below the
# repository root — confirm.
os.chdir("..")


from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Dict, Tuple

import datasets
import flair
import numpy as np
import pytest
import torch
from embeddings.data.data_loader import HuggingFaceDataLoader
from embeddings.data.dataset import HuggingFaceDataset
from embeddings.defaults import RESULTS_PATH
from embeddings.embedding.auto_flair import AutoFlairWordEmbedding
from embeddings.embedding.flair_embedding import FlairEmbedding
from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator
from embeddings.model.flair_model import FlairModel
from embeddings.pipeline.standard_pipeline import StandardPipeline
from embeddings.task.flair_task.sequence_labeling import SequenceLabeling
from embeddings.transformation.flair_transformation.column_corpus_transformation import (
    ColumnCorpusTransformation,
)
from embeddings.transformation.flair_transformation.downsample_corpus_transformation import (
    DownsampleFlairCorpusTransformation,
)
from embeddings.transformation.flair_transformation.split_sample_corpus_transformation import (
    SampleSplitsFlairCorpusTransformation,
)
from flair.data import Corpus
from numpy import typing as nptyping

### Run downsampled Flair pipeline

In [None]:
# Temporary directory whose path is handed to the task (presumably as its
# output location — a later cell loads `final-model.pt` from it). The handle is
# kept at notebook level so the directory stays alive for the following cells.
result_path = TemporaryDirectory()

dataset = HuggingFaceDataset("clarin-pl/kpwr-ner")
data_loader = HuggingFaceDataLoader()

# Individual transformation steps, chained with `.then` in the order listed.
to_column_corpus = ColumnCorpusTransformation("tokens", "ner")
resample_splits = SampleSplitsFlairCorpusTransformation(dev_fraction=0.1, seed=441)
downsample = DownsampleFlairCorpusTransformation(percentage=0.005)
transformation = to_column_corpus.then(resample_splits).then(downsample)

# Training settings forwarded to the task: a single epoch, mini-batches of 256.
train_kwargs = {"max_epochs": 1, "mini_batch_size": 256}
task = SequenceLabeling(
    result_path.name, hidden_size=256, task_train_kwargs=train_kwargs
)

embedding = AutoFlairWordEmbedding.from_hub("allegro/herbert-base-cased")
model = FlairModel(embedding, task)
evaluator = SequenceLabelingEvaluator()

pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator)

In [None]:
# Execute the pipeline configured above. The return value is bound to `_`, so
# the cell itself displays nothing.
# NOTE(review): presumably this trains the model and writes a checkpoint into
# `result_path` (a later cell loads `final-model.pt` from there) — confirm.
_ = pipeline.run()

### Load model from checkpoint

In [None]:
!ls $result_path.name

In [None]:
from flair.models import SequenceTagger

# Build the checkpoint path inside the results directory, then load the tagger
# from that file.
checkpoint_path = f"{result_path.name}/final-model.pt"
trained_model = SequenceTagger.load(checkpoint_path)

### Predict on the test data

In [None]:
# Load the dataset again and push it through the same transformation chain that
# was given to the pipeline, then keep only the `test` split.
# NOTE(review): this re-applies the split-sampling and downsampling steps;
# whether the resulting test split matches the one seen during `pipeline.run()`
# depends on those transformations being deterministic — confirm.
loaded_data = data_loader.load(dataset)
transformed_data = transformation.transform(loaded_data)
test_data = transformed_data.test

In [None]:
# Label name under which the tagger stores its predictions; reused for the
# cleanup calls and when reading the predictions back.
predicted_label = "predicted"

# Drop any labels stored under that name before predicting.
task.remove_labels_from_data(test_data, predicted_label)

# Tag the test data; `return_loss=True` makes `predict` hand back a loss value,
# which is kept in `loss`.
loss = trained_model.predict(
    sentences=test_data,
    mini_batch_size=64,
    label_name=predicted_label,
    return_loss=True,
)

# Extract predicted and reference labels using the task's label dictionary.
y_pred = task.get_y(
    test_data, y_type=predicted_label, y_dictionary=task.y_dictionary
)
y_true = task.get_y(
    test_data, y_type=task.y_type, y_dictionary=task.y_dictionary
)

# Remove the predicted labels again so `test_data` is left as it was.
task.remove_labels_from_data(test_data, predicted_label)

In [None]:
# Pass predictions and reference labels to the evaluator. The result is bound
# to `_`, so the cell itself displays nothing.
evaluation_input = {"y_pred": y_pred, "y_true": y_true}
_ = evaluator.evaluate(evaluation_input)