<a href="https://colab.research.google.com/github/Baekhyunjung/study_nlp/blob/main/continual%20learning/NERDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install NERDA-Con

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting NERDA-Con
  Downloading NERDA_Con-0.0-py3-none-any.whl (21 kB)
Collecting progressbar
  Downloading progressbar-2.5.tar.gz (10 kB)
Collecting pyconll
  Downloading pyconll-3.1.0-py3-none-any.whl (26 kB)
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 6.6 MB/s 
[?25hCollecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.5 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 57.5 MB/s 
Building wheels for collected packages: progressbar, sklearn
  Building wheel for progressbar (setup.py) ... 

In [None]:
%pip install NERDA

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting NERDA
  Downloading NERDA-1.0.0-py3-none-any.whl (23 kB)
Installing collected packages: NERDA
Successfully installed NERDA-1.0.0


In [None]:

import csv
import os
import pyconll
from io import BytesIO
from itertools import compress
from pathlib import Path
from typing import Union, List, Dict
from urllib.request import urlopen
from zipfile import ZipFile
import ssl

def download_unzip(url_zip: str,
                   dir_extract: str,
                   verify_ssl: bool = False) -> str:
    """Download and unzip a ZIP archive to folder.

    Reads a ZIP file from a URL into memory and extracts all of its
    members to a given folder. The ZIP file itself is never written
    to disk.

    Args:
        url_zip (str): URL to ZIP file.
        dir_extract (str): Directory where files are extracted.
        verify_ssl (bool, optional): If True, TLS certificates and
            host names are verified with the default SSL context.
            Defaults to False, which keeps the historical behaviour
            of skipping verification.

    Returns:
        str: a message telling that the archive was successfully
        extracted. The files in the ZIP archive are extracted to the
        desired directory as a side-effect.
    """
    ctx = ssl.create_default_context()
    if not verify_ssl:
        # NOTE(security): without verification the downloaded archive
        # cannot be trusted to come from the intended host. Kept as the
        # default only for backward compatibility; pass
        # verify_ssl=True whenever the host has a valid certificate.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    print(f'Reading {url_zip}')
    with urlopen(url_zip, context=ctx) as zipresp:
        # the whole archive is buffered in memory before extraction.
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(dir_extract)

    return f'archive extracted to {dir_extract}'

def download_dane_data(dir: str = None) -> str:
    """Fetch the DaNE data set and unpack it locally.

    DaNE is a Danish data set annotated for Named Entity Recognition,
    developed and hosted by
    [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Args:
        dir (str, optional): Target directory for the extracted files.
            When omitted, a hidden folder '.dane' in the home
            directory is used.

    Returns:
        str: the status message produced by `download_unzip`. The
        data files are written to the target directory as a
        side-effect.

    Examples:
        >>> download_dane_data()
        >>> download_dane_data(dir = 'DaNE')

    """
    # fall back to ~/.dane when the caller did not pick a directory.
    target_dir = dir if dir is not None else os.path.join(str(Path.home()), '.dane')

    return download_unzip(
        url_zip='http://danlp-downloads.alexandra.dk/datasets/ddt.zip',
        dir_extract=target_dir,
    )

def get_dane_data(split: str = 'train',
                  limit: int = None,
                  dir: str = None) -> dict:
    """Read one split of the DaNE data set from disk.

    The data set is kindly hosted by
    [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane)
    and must already have been downloaded, e.g. with
    download_dane_data().

    Args:
        split (str, optional): Which split to load: 'train', 'dev'
            or 'test'. Defaults to 'train'.
        limit (int, optional): Maximum number of sentences to return.
            Defaults to None, meaning the whole split is returned.
        dir (str, optional): Directory holding the 'ddt.<split>.conllu'
            files. When None, the '.dane' folder in the home
            directory is used.

    Returns:
        dict: 'sentences' (one list of word forms per sentence) and
        'tags' (one list of named-entity tags per sentence).

    Raises:
        AssertionError: if `split` is not a valid split name, or the
            directory / split file does not exist.

    Examples:
        Get test split
        >>> get_dane_data('test')

        Get first 5 observations from training split
        >>> get_dane_data('train', limit = 5)

    """
    assert isinstance(split, str)
    splits = ['train', 'dev', 'test']
    assert split in splits, f'Choose between the following splits: {splits}'

    # fall back to ~/.dane when the caller did not pick a directory.
    data_dir = os.path.join(str(Path.home()), '.dane') if dir is None else dir
    assert os.path.isdir(data_dir), f'Directory {data_dir} does not exist. Try downloading DaNE data with download_dane_data()'

    file_path = os.path.join(data_dir, f'ddt.{split}.conllu')
    assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading DaNE data with download_dane_data()'

    corpus = pyconll.load_from_file(file_path)

    sentences, entities = [], []
    for sent in corpus:
        tokens = sent._tokens
        sentences.append([tok.form for tok in tokens])
        # misc['name'] is popped, i.e. the tag is removed from the
        # loaded token while being read.
        entities.append([tok.misc['name'].pop() for tok in tokens])

    # slicing with None as the upper bound keeps every element.
    return {'sentences': sentences[:limit], 'tags': entities[:limit]}



def download_conll_data(dir: str = None) -> str:
    """Fetch the CoNLL-2003 English data set and unpack it locally.

    [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) is an
    English data set annotated for Named Entity Recognition.

    Args:
        dir (str, optional): Target directory for the extracted files.
            When omitted, a hidden folder '.conll' in the home
            directory is used.

    Returns:
        str: the status message produced by `download_unzip`. The
        data files are written to the target directory as a
        side-effect.

    Examples:
        >>> download_conll_data()
        >>> download_conll_data(dir = 'conll')

    """
    # fall back to ~/.conll when the caller did not pick a directory.
    target_dir = dir if dir is not None else os.path.join(str(Path.home()), '.conll')

    return download_unzip(
        url_zip='https://data.deepai.org/conll2003.zip',
        dir_extract=target_dir,
    )

def get_conll_data(split: str = 'train',
                   limit: int = None,
                   dir: str = None) -> dict:
    """Load CoNLL-2003 (English) data split.

    Loads a single data split from the
    [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/)
    (English) data set. The split file is expected to hold one token
    per line (space-separated columns, word first, entity tag last)
    with blank lines between sentences.

    Args:
        split (str, optional): Choose which split to load. Choose
            from 'train', 'valid' and 'test'. Defaults to 'train'.
        limit (int, optional): Limit the number of observations to be
            returned from a given split. Defaults to None, which implies
            that the entire data split is returned.
        dir (str, optional): Directory where data is cached. If set to
            None, the function will try to look for files in '.conll' folder in home directory.

    Returns:
        dict: Dictionary with word-tokenized 'sentences' and named
        entity 'tags' in IOB format.

    Raises:
        AssertionError: if `split` is not a valid split name, or the
            directory / split file does not exist.

    Examples:
        Get test split
        >>> get_conll_data('test')

        Get first 5 observations from training split
        >>> get_conll_data('train', limit = 5)

    """
    assert isinstance(split, str)
    splits = ['train', 'valid', 'test']
    assert split in splits, f'Choose between the following splits: {splits}'

    # set to default directory if nothing else has been provided by user.
    if dir is None:
        dir = os.path.join(str(Path.home()), '.conll')
    assert os.path.isdir(dir), f'Directory {dir} does not exist. Try downloading CoNLL-2003 data with download_conll_data()'

    file_path = os.path.join(dir, f'{split}.txt')
    assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading CoNLL-2003 data with download_conll_data()'

    sentences = []
    entities = []
    sentence = []
    tags = []

    with open(file_path, 'r') as file:
        # QUOTE_NONE: a '"' in the data is an ordinary token, not a CSV
        # quote. With default quoting the line '" " O O' was parsed as
        # a single-space word, which was then discarded together with
        # its tag.
        reader = csv.reader(file, delimiter = ' ', quoting = csv.QUOTE_NONE)
        for row in reader:
            if row:
                # document separators carry no token.
                if row[0] != '-DOCSTART-':
                    sentence.append(row[0])
                    tags.append(row[-1])
            elif sentence:
                # a blank line closes the current sentence.
                sentences.append(sentence)
                entities.append(tags)
                sentence = []
                tags = []

    # keep the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        entities.append(tags)

    if limit is not None:
        sentences = sentences[:limit]
        entities = entities[:limit]

    return {'sentences': sentences, 'tags': entities}



In [None]:
"""
This section covers the interface for `NERDA` models, that is
implemented as its own Python class [NERDA.models.NERDA][].

The interface enables you to easily

- specify your own [NERDA.models.NERDA][] model
- train it
- evaluate it
- use it to predict entities in new texts.
"""
from NERDA_Con.datasets import get_conll_data
from NERDA_Con.networks import NERDANetwork
from NERDA_Con.predictions import predict, predict_text
from NERDA_Con.performance import compute_f1_scores, flatten
from NERDA_Con.training import train_model, train_model_new_task
import pandas as pd
import numpy as np
import torch
import os
import sys
import sklearn.preprocessing
from sklearn.metrics import accuracy_score
from transformers import AutoModel, AutoTokenizer, AutoConfig
from typing import List

class NERDA:
    """Interface for specifying, training, evaluating and using a NERDA model.

    Bundles a pretrained transformer, its tokenizer and config, a tag
    encoder and a token-classification network, and exposes training
    (`train`, `train_next_task`), persistence, quantization / half
    precision, prediction and evaluation.

    Attributes set by the constructor:
        device, transformer, tag_scheme, tag_outside, max_len,
        dataset_training, dataset_validation, hyperparameters,
        tag_encoder, transformer_model, transformer_tokenizer,
        transformer_config, network, validation_batch_size,
        num_workers, train_losses, valid_loss, quantized, halved,
        fisher_dict, opt_param_dict, task_id, shared_model.
    """

    def __init__(self,
                 transformer: str = 'bert-base-multilingual-uncased',
                 device: str = None,
                 tag_scheme: List[str] = [
                            'B-PER',
                            'I-PER',
                            'B-ORG',
                            'I-ORG',
                            'B-LOC',
                            'I-LOC',
                            'B-MISC',
                            'I-MISC'
                            ],
                 tag_outside: str = 'O',
                 dataset_training: dict = None,
                 dataset_validation: dict = None,
                 max_len: int = 128,
                 network: torch.nn.Module = NERDANetwork,
                 dropout: float = 0.1,
                 hyperparameters: dict = {'epochs' : 4,
                                          'warmup_steps' : 500,
                                          'train_batch_size': 13,
                                          'learning_rate': 0.0001},
                 tokenizer_parameters: dict = {'do_lower_case' : True},
                 validation_batch_size: int = 8,
                 num_workers: int = 1) -> None:
        """Initialize NERDA model.

        Args:
            transformer (str, optional): name of (or path to) the
                pretrained transformer passed to
                `AutoModel/AutoTokenizer/AutoConfig.from_pretrained`.
            device (str, optional): computational device. When None,
                'cuda' is used if available, otherwise 'cpu'.
            tag_scheme (List[str], optional): all entity tags except
                the outside tag.
            tag_outside (str, optional): the outside tag.
            dataset_training (dict, optional): training data.
            dataset_validation (dict, optional): validation data.
            max_len (int, optional): forwarded to training and
                prediction as `max_len`.
            network (torch.nn.Module, optional): network class; it is
                instantiated as
                `network(transformer_model, device, n_tags, dropout=dropout)`.
            dropout (float, optional): dropout passed to the network.
            hyperparameters (dict, optional): keyword arguments
                forwarded to the training functions.
            tokenizer_parameters (dict, optional): keyword arguments
                forwarded to `AutoTokenizer.from_pretrained`.
            validation_batch_size (int, optional): batch size used for
                validation.
            num_workers (int, optional): forwarded to the training
                functions.
        """
        # set device automatically if not provided by user.
        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
            print("Device automatically set to:", self.device)
        else:
            self.device = device
            print("Device set to:", self.device)

        # copy the mutable arguments: the defaults are single objects
        # created once at definition time, so storing them directly
        # would share (and leak modifications between) all instances.
        self.tag_scheme = list(tag_scheme)
        self.hyperparameters = dict(hyperparameters)

        self.tag_outside = tag_outside
        self.transformer = transformer
        self.dataset_training = dataset_training
        self.dataset_validation = dataset_validation
        self.max_len = max_len

        # fit encoder to _all_ possible tags.
        tag_complete = [tag_outside] + self.tag_scheme
        self.tag_encoder = sklearn.preprocessing.LabelEncoder()
        self.tag_encoder.fit(tag_complete)

        self.transformer_model = AutoModel.from_pretrained(transformer)
        self.transformer_tokenizer = AutoTokenizer.from_pretrained(transformer, **tokenizer_parameters)
        self.transformer_config = AutoConfig.from_pretrained(transformer)

        # honour the user-supplied network class (previously the
        # argument was accepted but NERDANetwork was always used).
        self.network = network(self.transformer_model, self.device, len(tag_complete), dropout = dropout)
        self.network.to(self.device)

        self.validation_batch_size = validation_batch_size
        self.num_workers = num_workers
        self.train_losses = []
        self.valid_loss = np.nan
        self.quantized = False
        self.halved = False

        # state handed to train_model_new_task for training on
        # successive tasks.
        # NOTE(review): presumably Fisher information and reference
        # parameters for a regularisation-based continual-learning
        # scheme - confirm against NERDA_Con.training.
        self.fisher_dict = {}
        self.opt_param_dict = {}
        self.task_id = 0
        self.shared_model = None

    def train(self) -> str:
        """Train Network

        Trains the network from the NERDA model specification on
        `dataset_training` / `dataset_validation`.

        Returns:
            str: a message saying that the model was trained successfully.
            The network in the 'network' attribute is trained as a
            side-effect. Training losses and validation loss are saved
            in 'train_losses' and 'valid_loss'
            attributes respectively as side-effects.
        """
        network, train_losses, valid_loss = train_model(network = self.network,
                                                        tag_encoder = self.tag_encoder,
                                                        tag_outside = self.tag_outside,
                                                        transformer_tokenizer = self.transformer_tokenizer,
                                                        transformer_config = self.transformer_config,
                                                        dataset_training = self.dataset_training,
                                                        dataset_validation = self.dataset_validation,
                                                        validation_batch_size = self.validation_batch_size,
                                                        max_len = self.max_len,
                                                        device = self.device,
                                                        num_workers = self.num_workers,
                                                        **self.hyperparameters)

        # attach results as attributes.
        self.network = network
        self.train_losses = train_losses
        self.valid_loss = valid_loss

        return "Model trained successfully"

    def train_next_task(self, new_dataset_training, new_dataset_validation) -> str:
        """Train Network on a New Task

        Continues training the current network on a new pair of data
        sets via `train_model_new_task`, handing over the accumulated
        'task_id', 'fisher_dict', 'opt_param_dict' and 'shared_model'
        attributes.

        Args:
            new_dataset_training: training data for the new task.
            new_dataset_validation: validation data for the new task.

        Returns:
            str: a message saying that the model was trained successfully.
            The 'network', 'train_losses' and 'valid_loss' attributes
            are replaced and 'task_id' is incremented by one as
            side-effects.
        """
        network, train_losses, valid_loss = train_model_new_task(network = self.network,
                                                        tag_encoder = self.tag_encoder,
                                                        tag_outside = self.tag_outside,
                                                        transformer_tokenizer = self.transformer_tokenizer,
                                                        transformer_config = self.transformer_config,
                                                        dataset_training = new_dataset_training,
                                                        dataset_validation = new_dataset_validation,
                                                        validation_batch_size = self.validation_batch_size,
                                                        max_len = self.max_len,
                                                        device = self.device,
                                                        num_workers = self.num_workers,
                                                        task_id = self.task_id,
                                                        fisher_dict = self.fisher_dict,
                                                        opt_param_dict = self.opt_param_dict,
                                                        shared_model = self.shared_model,
                                                        **self.hyperparameters)

        # attach results as attributes.
        self.network = network
        self.train_losses = train_losses
        self.valid_loss = valid_loss
        self.task_id += 1

        return "Model trained successfully"

    def load_network_from_file(self, model_path = "model.bin") -> str:
        """Load Pretrained NERDA Network from file

        Loads weights for a pretrained NERDA Network from file.

        Args:
            model_path (str, optional): Path for model file.
                Defaults to "model.bin".

        Returns:
            str: message telling that weights for network were
            loaded successfully.

        Raises:
            AssertionError: if `model_path` does not exist.
        """
        # TODO: change assert to Raise.
        assert os.path.exists(model_path), "File does not exist. You can download network with download_network()"
        # map_location places the stored tensors on the model's device.
        self.network.load_state_dict(torch.load(model_path, map_location = torch.device(self.device)))
        self.network.device = self.device
        return f'Weights for network loaded from {model_path}'

    def save_network(self, model_path:str = "model.bin") -> None:
        """Save Weights of NERDA Network

        Saves weights for a fine-tuned NERDA Network to file.

        Args:
            model_path (str, optional): Path for model file.
                Defaults to "model.bin".

        Returns:
            Nothing. Saves model to file as a side-effect.
        """
        torch.save(self.network.state_dict(), model_path)
        print(f"Network written to file {model_path}")

    def quantize(self):
        """Apply dynamic quantization to increase performance.

        Quantization and half precision inference are mutually exclusive.

        Read more: https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html

        Returns:
            Nothing. Applies dynamic quantization to Network as a side-effect.

        Raises:
            AssertionError: if the model is already quantized or halved.
        """
        assert not (self.quantized), "Dynamic quantization already applied"
        assert not (self.halved), "Can't run both quantization and half precision"

        # only the Linear layers are quantized, to 8-bit integers.
        self.network = torch.quantization.quantize_dynamic(
            self.network, {torch.nn.Linear}, dtype=torch.qint8
        )
        self.quantized = True

    def half(self):
        """Convert weights from Float32 to Float16 to increase performance

        Quantization and half precision inference are mutually exclusive.

        Read more: https://pytorch.org/docs/master/generated/torch.nn.Module.html?highlight=half#torch.nn.Module.half

        Returns:
            Nothing. Model is "halved" as a side-effect.

        Raises:
            AssertionError: if the model is already halved or quantized.
        """
        assert not (self.halved), "Half precision already applied"
        assert not (self.quantized), "Can't run both quantization and half precision"

        self.network.half()
        self.halved = True

    def predict(self, sentences: List[List[str]],
                return_confidence: bool = False,
                **kwargs) -> List[List[str]]:
        """Predict Named Entities in Word-Tokenized Sentences

        Predicts word-tokenized sentences with trained model.

        Args:
            sentences (List[List[str]]): word-tokenized sentences.
            kwargs: arbitrary keyword arguments. For instance
                'batch_size' and 'num_workers'.
            return_confidence (bool, optional): if True, return
                confidence scores for all predicted tokens. Defaults
                to False.

        Returns:
            List[List[str]]: Predicted tags for sentences - one
            predicted tag/entity per word token.
        """
        return predict(network = self.network,
                       sentences = sentences,
                       transformer_tokenizer = self.transformer_tokenizer,
                       transformer_config = self.transformer_config,
                       max_len = self.max_len,
                       device = self.device,
                       tag_encoder = self.tag_encoder,
                       tag_outside = self.tag_outside,
                       return_confidence = return_confidence,
                       **kwargs)

    def predict_text(self, text: str,
                     return_confidence:bool = False, **kwargs) -> list:
        """Predict Named Entities in a Text

        Args:
            text (str): text to predict entities in.
            kwargs: arbitrary keyword arguments. For instance
                'batch_size' and 'num_workers'.
            return_confidence (bool, optional): if True, return
                confidence scores for all predicted tokens. Defaults
                to False.

        Returns:
            tuple: word-tokenized sentences and predicted
            tags/entities.
        """
        return predict_text(network = self.network,
                            text = text,
                            transformer_tokenizer = self.transformer_tokenizer,
                            transformer_config = self.transformer_config,
                            max_len = self.max_len,
                            device = self.device,
                            tag_encoder = self.tag_encoder,
                            tag_outside = self.tag_outside,
                            return_confidence=return_confidence,
                            **kwargs)

    def evaluate_performance(self, dataset: dict,
                             return_accuracy: bool=False,
                             **kwargs) -> pd.DataFrame:
        """Evaluate Performance

        Evaluates the performance of the model on an arbitrary
        data set.

        Args:
            dataset (dict): Data set that must consist of
                'sentences' and NER 'tags'. You can look at examples
                of how the dataset should look by invoking functions
                get_dane_data() or get_conll_data().
            kwargs: arbitrary keyword arguments for predict. For
                instance 'batch_size' and 'num_workers'.
            return_accuracy (bool): Return accuracy
                as well? Defaults to False.

        Returns:
            DataFrame with one row per tag in 'tag_scheme' holding
            F1-score, precision and recall, followed by an
            'AVG_MICRO' and an 'AVG_MACRO' row that hold the averaged
            F1-score only (precision and recall are NaN). Returns a
            dictionary with this table under 'f1' AND the accuracy
            under 'accuracy', if return_accuracy is set to True.
        """
        tags_true = dataset.get('tags')
        tags_predicted = self.predict(dataset.get('sentences'),
                                      **kwargs)

        # compute scores by entity type; the result is indexed as
        # (precision, recall, f1, ...).
        f1 = compute_f1_scores(y_pred = tags_predicted,
                               y_true = tags_true,
                               labels = self.tag_scheme,
                               average = None)

        # create DataFrame with performance scores per tag.
        df = pd.DataFrame(list(zip(self.tag_scheme, f1[2], f1[0], f1[1])),
                          columns = ['Level', 'F1-Score', 'Precision', 'Recall'])

        # compute MICRO-averaged F1-score.
        f1_micro = compute_f1_scores(y_pred = tags_predicted,
                                     y_true = tags_true,
                                     labels = self.tag_scheme,
                                     average = 'micro')
        micro_row = pd.DataFrame({'Level' : ['AVG_MICRO'],
                                  'F1-Score': [f1_micro[2]],
                                  'Precision': [np.nan],
                                  'Recall': [np.nan]})

        # compute MACRO-averaged F1-score.
        f1_macro = compute_f1_scores(y_pred = tags_predicted,
                                     y_true = tags_true,
                                     labels = self.tag_scheme,
                                     average = 'macro')
        macro_row = pd.DataFrame({'Level' : ['AVG_MACRO'],
                                  'F1-Score': [f1_macro[2]],
                                  'Precision': [np.nan],
                                  'Recall': [np.nan]})

        # pd.concat replaces DataFrame.append, which no longer exists
        # in pandas 2.x; row indices are kept as before.
        df = pd.concat([df, micro_row, macro_row])

        # compute and return accuracy if desired
        if return_accuracy:
            accuracy = accuracy_score(y_pred = flatten(tags_predicted),
                                      y_true = flatten(tags_true))
            return {'f1':df, 'accuracy': accuracy}

        return df


In [None]:
"""This section covers `torch` networks for `NERDA`"""
import torch
import torch.nn as nn
from transformers import AutoConfig
from NERDA_Con.utils import match_kwargs

class NERDANetwork(nn.Module):
    """Generic token-classification network for NERDA models.

    A transformer followed by dropout and a linear layer that maps
    every hidden state to one score per tag. The architecture is
    analogous to the models in
    [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf).

    A user-defined network can be used instead, provided that it
    takes the same arguments.
    """

    def __init__(self, transformer: nn.Module, device: str, n_tags: int, dropout: float = 0.1) -> None:
        """Build the network around a pretrained transformer.

        Args:
            transformer (nn.Module): huggingface `torch` transformer.
            device (str): Computational device.
            n_tags (int): Number of unique entity tags (incl. outside tag)
            dropout (float, optional): Dropout probability. Defaults to 0.1.
        """
        super().__init__()

        # the hidden size is read from the config that belongs to the
        # transformer's name.
        hidden_size = AutoConfig.from_pretrained(transformer.name_or_path).hidden_size

        self.transformer = transformer
        self.dropout = nn.Dropout(dropout)
        self.tags = nn.Linear(hidden_size, n_tags)
        self.device = device

    # NOTE: 'target_tags' and 'offsets' are not used by the model, but
    # they are expected in the signature down-stream. So _DON'T_
    # remove them! :)
    def forward(self,
                input_ids: torch.Tensor,
                masks: torch.Tensor,
                token_type_ids: torch.Tensor,
                target_tags: torch.Tensor,
                offsets: torch.Tensor) -> torch.Tensor:
        """Run one forward pass.

        Args:
            input_ids (torch.Tensor): Input IDs.
            masks (torch.Tensor): Attention Masks.
            token_type_ids (torch.Tensor): Token Type IDs.
            target_tags (torch.Tensor): Target tags. Not used here,
                but expected downstream, so they can not be left out.
            offsets (torch.Tensor): Offsets to keep track of original
                words. Not used here, but expected downstream, so
                they can not be left out.

        Returns:
            torch.Tensor: predicted values, one score per tag for
            every position returned by the transformer.
        """
        # TODO: can be improved with ** and move everything to device in a
        # single step.
        candidate_inputs = dict(
            input_ids=input_ids.to(self.device),
            masks=masks.to(self.device),
            token_type_ids=token_type_ids.to(self.device),
        )

        # keep only what the transformer's forward() takes.
        # NOTE(review): match_kwargs is defined outside this file; if
        # it filters by parameter name, the 'masks' entry may be
        # dropped for transformers whose forward() expects
        # 'attention_mask' - confirm.
        accepted_inputs = match_kwargs(self.transformer.forward, **candidate_inputs)

        # first element of the transformer output.
        hidden_states = self.transformer(**accepted_inputs)[0]

        # dropout, then scores for all labels/tags.
        return self.tags(self.dropout(hidden_states))


In [None]:
"""
This section covers functionality for computing performance
for [NERDA.models.NERDA][] models.
"""

from typing import List
from sklearn.metrics import precision_recall_fscore_support
import warnings

def flatten(l: list):
    """Concatenate the items of all sub-iterables of `l` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


def compute_f1_scores(y_pred: List[List[str]],
                      y_true: List[List[str]],
                      labels: List[str],
                      **kwargs) -> list:
    """Compute precision, recall and F1 scores for tag sequences.

    Observed sequences that are longer than their predicted
    counterparts are cut to the predicted length (with a warning),
    then everything is flattened and handed to
    `precision_recall_fscore_support`.

    Args:
        y_pred (List): predicted values, one list per sentence.
        y_true (List): observed/true values, one list per sentence.
        labels (List): all possible tags.
        kwargs: all optional arguments for precision/recall function.

    Returns:
        list: whatever `precision_recall_fscore_support` returns.

    Raises:
        AssertionError: if any predicted sequence is longer than the
            observed one.
    """
    pairs = list(zip(y_true, y_pred))

    # predictions may be shorter than the observations, never longer.
    assert not any(len(true) < len(pred) for true, pred in pairs), "Length of predictions must not exceed length of observed values"

    # count sentences whose observed tags outnumber the predictions.
    n_exceeds = sum(1 for true, pred in pairs if len(true) > len(pred))
    if n_exceeds > 0:
        warnings.warn(f'length of observed values exceeded lengths of predicted values in {n_exceeds} cases and were truncated. _Consider_ increasing max_len parameter for your model.')

    # cut the observations down to the length of the predictions;
    # needed when predictions were truncated earlier in the flow.
    truncated_true = [true[:len(pred)] for true, pred in pairs]

    return precision_recall_fscore_support(y_true = flatten(truncated_true),
                                           y_pred = flatten(y_pred),
                                           labels = labels,
                                           **kwargs)

In [None]:
"""
This section covers NERDA models that have been 'precooked' by
Ekstra Bladet and are publicly available for download.
"""
from NERDA_Con.datasets import get_dane_data, get_conll_data
from NERDA_Con.models import NERDA
import os
import urllib
from pathlib import Path
from progressbar import ProgressBar

# progress bar of the download currently in flight (None when idle).
pbar = None

# helper function to show progressbar
def show_progress(block_num, block_size, total_size):
    """Report download progress on a console progress bar.

    Has the (block_num, block_size, total_size) signature and is
    passed as the third argument of `urllib.request.urlretrieve`.
    The bar lives in the module-level `pbar`: it is created and
    started on the first call of a download and finished and reset
    to None once the downloaded amount reaches `total_size`.

    Args:
        block_num: number of blocks transferred so far.
        block_size: size of one block.
        total_size: total size of the download. When it is not
            positive (size unknown), nothing is displayed.
    """
    global pbar

    # without a known total there is no range to draw a bar for.
    if total_size <= 0:
        return

    if pbar is None:
        pbar = ProgressBar(maxval=total_size)
        # start the bar exactly once, when it is created, rather than
        # on every callback.
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

class Precooked(NERDA):
    """Precooked NERDA Model

    NERDA model specification that has been precooked/pretrained
    and is available for download.

    Inherits from [NERDA.models.NERDA][].
    """
    def __init__(self, **kwargs) -> None:
        """Initialize Precooked NERDA Model

        Args:
            kwargs: all arguments for NERDA Model.
        """
        super().__init__(**kwargs)

    def download_network(self, dir = None) -> str:
        """Download Precooked Network from Web

        The file name is derived from the name of the concrete class,
        i.e. `<ClassName>.bin`.

        Args:
            dir (str, optional): Directory where the model file
                will be saved. Defaults to None, in which case
                the model will be saved in a folder '.nerda' in
                your home directory. The directory (including any
                missing parents) is created if it does not exist.

        Returns:
            str: Message saying that the download was successful.
            Model is downloaded as a side-effect.
        """
        # Local import: a bare `import urllib` does not guarantee that
        # the `urllib.request` submodule has been loaded.
        import urllib.request

        model_name = type(self).__name__

        # url for public S3 bucket with NERDA models.
        url_s3 = 'https://nerda.s3-eu-west-1.amazonaws.com'
        url_model = f'{url_s3}/{model_name}.bin'

        if dir is None:
            dir = os.path.join(str(Path.home()), '.nerda')

        # makedirs also creates missing parent directories and does not
        # fail when the directory already exists.
        os.makedirs(dir, exist_ok=True)

        file_path = os.path.join(dir, f'{model_name}.bin')

        print(
        """
        Please make sure, that you're running the latest version of 'NERDA'
        otherwise the model is not guaranteed to work.
        """
        )
        print(f'Downloading {url_model} to {file_path}')
        urllib.request.urlretrieve(url_model, file_path, show_progress)

        return "Network downloaded successfully. Load network with 'load_network'."

    def load_network(self, file_path: str = None) -> None:
        """Load Pretrained Network

        Loads pretrained network from file.

        Args:
            file_path (str, optional): Path to model file. Defaults to None,
                in which case, the function points to the '.nerda' folder
                in the home directory.

        Raises:
            AssertionError: if the model file does not exist.
        """

        model_name = type(self).__name__

        if file_path is None:
            file_path = os.path.join(str(Path.home()), '.nerda', f'{model_name}.bin')

        assert os.path.exists(file_path), "File does not exist! You can download network with download_network()"

        # Load first, report afterwards, so the message is never shown
        # for a load that failed.
        self.load_network_from_file(file_path)
        print(
        """
        Model loaded. Please make sure, that you're running the latest version
        of 'NERDA' otherwise the model is not guaranteed to work.
        """
        )

class DA_BERT_ML(Precooked):
    """NERDA [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)
    for Danish finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import DA_BERT_ML
        >>> model = DA_BERT_ML()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Jens Hansen har en bondegård'
        >>> model.predict_text(text)
        ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize the model with its fixed transformer, tag scheme and hyperparameters.

        Args:
            device (str, optional): Computational device, forwarded
                unchanged to the parent class. Defaults to None.
        """
        entity_tags = ['B-PER', 'I-PER',
                       'B-ORG', 'I-ORG',
                       'B-LOC', 'I-LOC',
                       'B-MISC', 'I-MISC']
        training_setup = dict(epochs = 4,
                              warmup_steps = 500,
                              train_batch_size = 13,
                              learning_rate = 0.0001)
        super().__init__(transformer = 'bert-base-multilingual-uncased',
                         device = device,
                         tag_scheme = entity_tags,
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = training_setup,
                         tokenizer_parameters = dict(do_lower_case = True))

class DA_DISTILBERT_ML(Precooked):
    """NERDA [Multilingual DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased)
    for Danish finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import DA_DISTILBERT_ML
        >>> model = DA_DISTILBERT_ML()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Jens Hansen har en bondegård'
        >>> model.predict_text(text)
        ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize the model with its fixed transformer, tag scheme and hyperparameters.

        Args:
            device (str, optional): Computational device, forwarded
                unchanged to the parent class. Defaults to None.
        """
        entity_tags = ['B-PER', 'I-PER',
                       'B-ORG', 'I-ORG',
                       'B-LOC', 'I-LOC',
                       'B-MISC', 'I-MISC']
        training_setup = dict(epochs = 4,
                              warmup_steps = 500,
                              train_batch_size = 13,
                              learning_rate = 0.0001)
        super().__init__(transformer = 'distilbert-base-multilingual-cased',
                         device = device,
                         tag_scheme = entity_tags,
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = training_setup,
                         # cased checkpoint: keep the original casing.
                         tokenizer_parameters = dict(do_lower_case = False))

class DA_ELECTRA_DA(Precooked):
    """NERDA [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased)
    for Danish finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import DA_ELECTRA_DA
        >>> model = DA_ELECTRA_DA()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Jens Hansen har en bondegård'
        >>> model.predict_text(text)
        ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize the model with its fixed transformer, tag scheme and hyperparameters.

        Args:
            device (str, optional): Computational device, forwarded
                unchanged to the parent class. Defaults to None.
        """
        entity_tags = ['B-PER', 'I-PER',
                       'B-ORG', 'I-ORG',
                       'B-LOC', 'I-LOC',
                       'B-MISC', 'I-MISC']
        training_setup = dict(epochs = 5,
                              warmup_steps = 500,
                              train_batch_size = 13,
                              learning_rate = 0.0001)
        super().__init__(transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
                         device = device,
                         tag_scheme = entity_tags,
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = training_setup,
                         tokenizer_parameters = dict(do_lower_case = True))

class EN_ELECTRA_EN(Precooked):
    """NERDA [English ELECTRA](https://huggingface.co/google/electra-small-discriminator)
    for English finetuned on [CoNLL-2003 data set](https://www.clips.uantwerpen.be/conll2003/ner/).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import EN_ELECTRA_EN
        >>> model = EN_ELECTRA_EN()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Old MacDonald had a farm'
        >>> model.predict_text(text)
        ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize the model with its fixed transformer, tag scheme and hyperparameters.

        Args:
            device (str, optional): Computational device, forwarded
                unchanged to the parent class. Defaults to None.
        """
        entity_tags = ['B-PER', 'I-PER',
                       'B-ORG', 'I-ORG',
                       'B-LOC', 'I-LOC',
                       'B-MISC', 'I-MISC']
        training_setup = dict(epochs = 4,
                              warmup_steps = 250,
                              train_batch_size = 13,
                              learning_rate = 8e-05)
        super().__init__(transformer = 'google/electra-small-discriminator',
                         device = device,
                         tag_scheme = entity_tags,
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = training_setup,
                         tokenizer_parameters = dict(do_lower_case = True))


class EN_BERT_ML(Precooked):
    """NERDA [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)
    for English finetuned on [CoNLL-2003 data set](https://www.clips.uantwerpen.be/conll2003/ner/).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import EN_BERT_ML
        >>> model = EN_BERT_ML()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Old MacDonald had a farm'
        >>> model.predict_text(text)
        ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize the model with its fixed transformer, tag scheme and hyperparameters.

        Args:
            device (str, optional): Computational device, forwarded
                unchanged to the parent class. Defaults to None.
        """
        entity_tags = ['B-PER', 'I-PER',
                       'B-ORG', 'I-ORG',
                       'B-LOC', 'I-LOC',
                       'B-MISC', 'I-MISC']
        training_setup = dict(epochs = 4,
                              warmup_steps = 500,
                              train_batch_size = 13,
                              learning_rate = 0.0001)
        super().__init__(transformer = 'bert-base-multilingual-uncased',
                         device = device,
                         tag_scheme = entity_tags,
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = training_setup,
                         tokenizer_parameters = dict(do_lower_case = True))





In [None]:
"""
This section covers functionality for computing predictions
with a [NERDA.models.NERDA][] model.
"""

from NERDA_Con.preprocessing import create_dataloader
import torch
import numpy as np
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from typing import List, Callable
import transformers
import sklearn.preprocessing

def sigmoid_transform(x):
    """Apply the logistic function `1 / (1 + exp(-x))` to `x`.

    Args:
        x: Scalar or array accepted by `numpy.exp`.

    Returns:
        The element-wise logistic transform of `x`.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def predict(network: torch.nn.Module,
            sentences: List[List[str]],
            transformer_tokenizer: transformers.PreTrainedTokenizer,
            transformer_config: transformers.PretrainedConfig,
            max_len: int,
            device: str,
            tag_encoder: sklearn.preprocessing.LabelEncoder,
            tag_outside: str,
            batch_size: int = 8,
            num_workers: int = 1,
            return_tensors: bool = False,
            return_confidence: bool = False,
            pad_sequences: bool = True) -> List[List[str]]:
    """Compute predictions.

    Computes predictions for a list with word-tokenized sentences
    with a `NERDA` model.

    Args:
        network (torch.nn.Module): Network.
        sentences (List[List[str]]): List of lists with word-tokenized
            sentences.
        transformer_tokenizer (transformers.PreTrainedTokenizer):
            tokenizer for transformer model.
        transformer_config (transformers.PretrainedConfig): config
            for transformer model.
        max_len (int): Maximum length of sentence after applying
            transformer tokenizer.
        device (str): Computational device. Not used by this function
            itself; kept for interface compatibility.
        tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
            for Named-Entity tags.
        tag_outside (str): Special 'outside' NER tag.
        batch_size (int, optional): Batch Size for DataLoader.
            Defaults to 8.
        num_workers (int, optional): Number of workers. Defaults
            to 1.
        return_tensors (bool, optional): if True, return the raw
            network outputs instead of tags. Ignored when
            `return_confidence` is True. Defaults to False.
        return_confidence (bool, optional): if True, return
            confidence scores for all predicted tokens. Defaults
            to False.
        pad_sequences (bool, optional): if True, pad sequences.
            Defaults to True.

    Returns:
        List[List[str]]: List of lists with predicted Entity
        tags. With `return_confidence` a tuple
        `(predictions, probabilities)` is returned, with
        `return_tensors` the list of collected network outputs.
        All sentences (i.e. all batches) are covered in every mode.
    """
    # make sure, that input has the correct format.
    assert isinstance(sentences, list), "'sentences' must be a list of list of word-tokens"

    # nothing to predict: return the empty result for the requested mode.
    if len(sentences) == 0:
        return ([], []) if return_confidence else []

    assert isinstance(sentences[0], list), "'sentences' must be a list of list of word-tokens"
    assert isinstance(sentences[0][0], str), "'sentences' must be a list of list of word-tokens"

    # set network to appropriate mode.
    network.eval()

    # fill 'dummy' tags (expected input for dataloader).
    tag_fill = [tag_encoder.classes_[0]]
    tags_dummy = [tag_fill * len(sent) for sent in sentences]

    data_loader = create_dataloader(sentences = sentences,
                                    tags = tags_dummy,
                                    transformer_tokenizer = transformer_tokenizer,
                                    transformer_config = transformer_config,
                                    max_len = max_len,
                                    batch_size = batch_size,
                                    tag_encoder = tag_encoder,
                                    tag_outside = tag_outside,
                                    num_workers = num_workers,
                                    pad_sequences = pad_sequences)

    predictions = []
    probabilities = []
    tensors = []

    with torch.no_grad():
        for batch in data_loader:

            outputs = network(**batch)
            batch_offsets = batch.get('offsets')

            # conduct operations on sentence level.
            for i in range(outputs.shape[0]):

                # find max by row: best score and its tag index per token.
                values, indices = outputs[i].max(dim=1)

                preds = tag_encoder.inverse_transform(indices.cpu().numpy())

                if return_tensors:
                    # NOTE(review): the whole batch output is stored once
                    # per sentence (kept from the original); confirm
                    # whether callers expect `outputs[i]` instead.
                    tensors.append(outputs)

                # subset predictions for original word tokens, then
                # remove special tokens ('CLS' + 'SEP').
                preds = [prediction for prediction, offset in zip(preds.tolist(), batch_offsets[i]) if offset]
                predictions.append(preds[1:-1])

                if return_confidence:
                    probs = values.cpu().numpy()
                    probs = [prob for prob, offset in zip(probs.tolist(), batch_offsets[i]) if offset]
                    probabilities.append(probs[1:-1])

                # TODO: Move assert statement to unit tests. Does not work
                # in boundary.
                # assert len(preds) == len(sentences[i])

    # Return only after ALL batches have been processed; returning from
    # inside the batch loop would drop every batch but the first.
    if return_confidence:
        return predictions, probabilities

    if return_tensors:
        return tensors

    return predictions

def predict_text(network: torch.nn.Module,
                 text: str,
                 transformer_tokenizer: transformers.PreTrainedTokenizer,
                 transformer_config: transformers.PretrainedConfig,
                 max_len: int,
                 device: str,
                 tag_encoder: sklearn.preprocessing.LabelEncoder,
                 tag_outside: str,
                 batch_size: int = 8,
                 num_workers: int = 1,
                 pad_sequences: bool = True,
                 return_confidence: bool = False,
                 sent_tokenize: Callable = sent_tokenize,
                 word_tokenize: Callable = word_tokenize) -> tuple:
    """Compute Predictions for Text.

    Splits a raw text into sentences, splits every sentence into
    words and hands the result to `predict`.

    Args:
        network (torch.nn.Module): Network.
        text (str): text to predict entities in.
        transformer_tokenizer (transformers.PreTrainedTokenizer):
            tokenizer for transformer model.
        transformer_config (transformers.PretrainedConfig): config
            for transformer model.
        max_len (int): Maximum length of sentence after applying
            transformer tokenizer.
        device (str): Computational device.
        tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
            for Named-Entity tags.
        tag_outside (str): Special 'outside' NER tag.
        batch_size (int, optional): Batch Size for DataLoader.
            Defaults to 8.
        num_workers (int, optional): Number of workers. Defaults
            to 1.
        pad_sequences (bool, optional): if True, pad sequences.
            Defaults to True.
        return_confidence (bool, optional): if True, return
            confidence scores for predicted tokens. Defaults
            to False.
        sent_tokenize (Callable, optional): function that splits the
            text into sentences. Defaults to the `sent_tokenize`
            imported by this module.
        word_tokenize (Callable, optional): function that splits a
            sentence into words. Defaults to the `word_tokenize`
            imported by this module.

    Returns:
        tuple: sentence- and word-tokenized text together with
        whatever `predict` returned for it.
    """
    assert isinstance(text, str), "'text' must be a string."

    # sentence split first, word split second.
    tokenized = [word_tokenize(chunk) for chunk in sent_tokenize(text)]

    predicted = predict(network = network,
                        sentences = tokenized,
                        transformer_tokenizer = transformer_tokenizer,
                        transformer_config = transformer_config,
                        tag_encoder = tag_encoder,
                        tag_outside = tag_outside,
                        max_len = max_len,
                        device = device,
                        batch_size = batch_size,
                        num_workers = num_workers,
                        pad_sequences = pad_sequences,
                        return_confidence = return_confidence)

    return tokenized, predicted


In [None]:
import torch
import warnings
import transformers
import sklearn.preprocessing

class NERDADataSetReader():
    """Generic NERDA DataSetReader

    Index-able dataset that turns one word-tokenized, tagged sentence
    into the tensors returned by `__getitem__`.
    """

    def __init__(self,
                sentences: list,
                tags: list,
                transformer_tokenizer: transformers.PreTrainedTokenizer,
                transformer_config: transformers.PretrainedConfig,
                max_len: int,
                tag_encoder: sklearn.preprocessing.LabelEncoder,
                tag_outside: str,
                pad_sequences : bool = True) -> None:
        """Initialize DataSetReader

        Args:
            sentences (list): Sentences.
            tags (list): Named-Entity tags.
            transformer_tokenizer (transformers.PreTrainedTokenizer):
                tokenizer for transformer.
            transformer_config (transformers.PretrainedConfig): Config
                for transformer model; only `pad_token_id` is read.
            max_len (int): Maximum length of sentences after applying
                transformer tokenizer (special tokens included).
            tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
                for Named-Entity tags.
            tag_outside (str): Special Outside tag.
            pad_sequences (bool): Pad sequences to max_len. Defaults
                to True.
        """
        self.sentences = sentences
        self.tags = tags
        self.transformer_tokenizer = transformer_tokenizer
        self.tag_encoder = tag_encoder
        self.max_len = max_len
        self.pad_sequences = pad_sequences
        self.pad_token_id = transformer_config.pad_token_id
        # encoded id of the 'outside' tag, used for special and pad positions.
        self.tag_outside_transformed = tag_encoder.transform([tag_outside])[0]

    def __len__(self):
        """Number of sentences in the data set."""
        return len(self.sentences)

    def __getitem__(self, item):
        """Encode sentence number `item`.

        Returns:
            dict: long tensors 'input_ids', 'masks', 'token_type_ids',
            'target_tags' and 'offsets'. 'offsets' is 1 on the first
            word piece of every word and on both special tokens, 0
            elsewhere.
        """
        words = self.sentences[item]
        # encode tags
        encoded_tags = self.tag_encoder.transform(self.tags[item])

        # one tag per word is required.
        assert len(words) == len(encoded_tags)

        tokens, target_tags, offsets = [], [], []
        for word, tag in zip(words, encoded_tags):
            pieces = self.transformer_tokenizer.tokenize(word)
            n_pieces = len(pieces)
            tokens += pieces
            # mark the first piece of a word; words that produce no
            # piece contribute nothing.
            if n_pieces > 0:
                offsets.append(1)
                offsets += [0] * (n_pieces - 1)
            # every piece inherits the tag of its word.
            target_tags += [tag] * n_pieces

        # two positions are reserved for the 'CLS' and 'SEP' tokens,
        # max_len includes _all_ tokens.
        budget = self.max_len - 2
        if len(tokens) > budget:
            warnings.warn(f'Sentence #{item} length {len(tokens)} exceeds max_len {self.max_len} and has been truncated')
        tokens, target_tags, offsets = tokens[:budget], target_tags[:budget], offsets[:budget]

        tokenizer = self.transformer_tokenizer
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]

        outside = self.tag_outside_transformed
        target_tags = [outside] + target_tags + [outside]
        offsets = [1] + offsets + [1]

        seq_len = len(input_ids)
        masks = [1] * seq_len
        # single-sentence task, so every position gets segment id 0.
        token_type_ids = [0] * seq_len

        if self.pad_sequences:
            n_pad = self.max_len - seq_len
            input_ids = input_ids + [self.pad_token_id] * n_pad
            masks = masks + [0] * n_pad
            offsets = offsets + [0] * n_pad
            token_type_ids = token_type_ids + [0] * n_pad
            target_tags = target_tags + [outside] * n_pad

        as_long = dict(dtype = torch.long)
        return {'input_ids' : torch.tensor(input_ids, **as_long),
                'masks' : torch.tensor(masks, **as_long),
                'token_type_ids' : torch.tensor(token_type_ids, **as_long),
                'target_tags' : torch.tensor(target_tags, **as_long),
                'offsets': torch.tensor(offsets, **as_long)}

def create_dataloader(sentences,
                      tags,
                      transformer_tokenizer,
                      transformer_config,
                      max_len,
                      tag_encoder,
                      tag_outside,
                      batch_size = 1,
                      num_workers = 1,
                      pad_sequences = True):
    """Wrap sentences and tags in a `torch.utils.data.DataLoader`.

    A `NERDADataSetReader` is built from the arguments and handed to
    the DataLoader. Padding may only be switched off for a batch size
    of one; for larger batches `pad_sequences` is forced to True and
    a message is printed.

    Returns:
        torch.utils.data.DataLoader: loader over the data set reader.
    """
    if not pad_sequences:
        if batch_size > 1:
            print("setting pad_sequences to True, because batch_size is more than one.")
            pad_sequences = True

    reader_settings = dict(sentences = sentences,
                           tags = tags,
                           transformer_tokenizer = transformer_tokenizer,
                           transformer_config = transformer_config,
                           max_len = max_len,
                           tag_encoder = tag_encoder,
                           tag_outside = tag_outside,
                           pad_sequences = pad_sequences)
    dataset = NERDADataSetReader(**reader_settings)

    return torch.utils.data.DataLoader(dataset,
                                       batch_size = batch_size,
                                       num_workers = num_workers)


In [None]:
from typing import Callable

def match_kwargs(function: Callable, **kwargs) -> dict:
    """Matches Arguments with Function

    Match keywords arguments with the arguments of a function.

    Parameters are read from the callable's signature, so keyword-only
    parameters are recognised and decorated functions (wrapped with
    `functools.wraps`) are matched against the wrapped function rather
    than the wrapper. Only parameters that can be passed by name are
    considered; `*args`/`**kwargs` catch-alls never match.

    Args:
        function (function): Function to match arguments for.
        kwargs: keyword arguments to match against.

    Returns:
        dict: dictionary with matching arguments and their
        respective values.

    """
    import inspect

    try:
        parameters = inspect.signature(function).parameters.values()
        accepted = {p.name for p in parameters
                    if p.kind in (p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY)}
    except (TypeError, ValueError):
        # No introspectable signature: fall back to the positional
        # argument names stored on the code object.
        code = function.__code__
        accepted = set(code.co_varnames[:code.co_argcount])

    return {name: value for name, value in kwargs.items() if name in accepted}

In [None]:
import numpy as np
from sklearn import preprocessing
from transformers import AdamW, get_linear_schedule_with_warmup
import random
import torch
from IPython import get_ipython

# Pick the tqdm flavour that fits the running front end: the widget
# based progress bar for Jupyter notebooks / qtconsole, the plain text
# one for everything else (terminal IPython, other shells, or when
# `get_ipython` is not available).
try:
    shell = get_ipython().__class__.__name__
    if shell == 'ZMQInteractiveShell':
        from tqdm.notebook import tqdm
    else:
        from tqdm import tqdm
except NameError:
    from tqdm import tqdm

def train(model, data_loader, optimizer, device, scheduler, n_tags):
    """Run one training pass over `data_loader`.

    For every batch the gradients are reset, the loss is computed with
    `compute_loss` and back-propagated, and optimizer and scheduler
    each take one step.

    Returns:
        The summed batch losses divided by the number of batches.
    """
    model.train()

    n_batches = len(data_loader)
    running_loss = 0.0

    for batch in tqdm(data_loader, total=n_batches):
        optimizer.zero_grad()

        batch_loss = compute_loss(model(**batch),
                                  batch.get('target_tags'),
                                  batch.get('masks'),
                                  device,
                                  n_tags)
        batch_loss.backward()

        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / n_batches

def validate(model, data_loader, device, n_tags):
    """One Iteration of Validation

    Puts the model in evaluation mode and computes the loss of every
    batch with `compute_loss`. No parameters are updated, so the whole
    pass runs with gradient tracking disabled.

    Returns:
        The summed batch losses divided by the number of batches.
    """

    model.eval()
    final_loss = 0.0

    # Gradients are never used here; disabling autograd avoids building
    # (and holding in memory) a graph for every validation batch.
    with torch.no_grad():
        for dl in tqdm(data_loader, total=len(data_loader)):

            outputs = model(**dl)
            loss = compute_loss(outputs,
                                dl.get('target_tags'),
                                dl.get('masks'),
                                device,
                                n_tags)
            final_loss += loss.item()

    # Return average loss.
    return final_loss / len(data_loader)

def compute_loss(preds, target_tags, masks, device, n_tags):
    """Cross-entropy loss over the unmasked token positions.

    Args:
        preds: Network outputs; flattened to rows of `n_tags` scores.
        target_tags: Encoded target tags, one per token position.
        masks: Attention masks; positions whose mask is not 1 are
            excluded from the loss.
        device: Device the labels are moved to. If None, the device of
            `preds` is used.
        n_tags (int): Number of tags, i.e. scores per token.

    Returns:
        torch.Tensor: scalar loss.
    """
    # initialize loss function.
    lfn = torch.nn.CrossEntropyLoss()

    # positions that take part in the loss (mask == 1).
    active_loss = masks.reshape(-1) == 1

    # reshape (unlike view) also accepts non-contiguous inputs.
    active_logits = preds.reshape(-1, n_tags)

    # masked positions get the loss function's ignore_index as label,
    # which makes CrossEntropyLoss skip them.
    active_labels = torch.where(
        active_loss,
        target_tags.reshape(-1),
        torch.tensor(lfn.ignore_index).type_as(target_tags)
    )

    # `torch.device(None)` is not valid, so fall back to wherever the
    # logits live when no device was given.
    label_device = torch.device(device) if device is not None else active_logits.device
    active_labels = torch.as_tensor(active_labels, device = label_device, dtype = torch.long)

    # Only compute loss on actual token predictions
    loss = lfn(active_logits, active_labels)

    return loss

def enforce_reproducibility(seed = 42) -> None:
    """Enforce Reproducibility

    Seeds every random number generator used here (Python `random`,
    NumPy, PyTorch on CPU and on all CUDA devices) with the same value
    and switches cuDNN to deterministic, non-benchmarking mode.

    For atomic operations there is currently no simple way to
    enforce determinism, as the order of parallel operations
    is not known.

    Args:
        seed (int, optional): Fixed seed. Defaults to 42.
    """
    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators, CPU and CUDA.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN settings.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def train_model(network,
                tag_encoder,
                tag_outside,
                transformer_tokenizer,
                transformer_config,
                dataset_training,
                dataset_validation,
                max_len = 128,
                train_batch_size = 16,
                validation_batch_size = 8,
                epochs = 5,
                warmup_steps = 0,
                learning_rate = 5e-5,
                device = None,
                fixed_seed = 42,
                num_workers = 1):
    """Train a network and restore the weights of its best epoch.

    Builds data loaders for the training and validation sets, trains
    for `epochs` epochs with AdamW and a linear warmup schedule, and
    validates after every epoch. The parameters of the epoch with the
    lowest validation loss are loaded back into the network at the end.

    Args:
        network: Network to train (modified in place).
        tag_encoder: Fitted tag encoder; `classes_` gives the tag count.
        tag_outside: Special 'outside' tag.
        transformer_tokenizer: Tokenizer handed to the data loaders.
        transformer_config: Transformer config handed to the data loaders.
        dataset_training (dict): with keys 'sentences' and 'tags'.
        dataset_validation (dict): with keys 'sentences' and 'tags'.
        max_len (int, optional): Maximum sequence length. Defaults to 128.
        train_batch_size (int, optional): Defaults to 16.
        validation_batch_size (int, optional): Defaults to 8.
        epochs (int, optional): Defaults to 5.
        warmup_steps (int, optional): Defaults to 0.
        learning_rate (float, optional): Defaults to 5e-5.
        device (optional): Device passed on to the loss computation.
        fixed_seed (int, optional): Seed for `enforce_reproducibility`;
            None skips seeding. Defaults to 42.
        num_workers (int, optional): DataLoader workers. Defaults to 1.

    Returns:
        tuple: `(network, train_losses, best_valid_loss)`.
    """
    # local import: only needed for snapshotting the best weights.
    import copy

    if fixed_seed is not None:
        enforce_reproducibility(fixed_seed)

    # compute number of unique tags from encoder.
    n_tags = tag_encoder.classes_.shape[0]

    # prepare datasets for modelling by creating data readers and loaders
    dl_train = create_dataloader(sentences = dataset_training.get('sentences'),
                                 tags = dataset_training.get('tags'),
                                 transformer_tokenizer = transformer_tokenizer,
                                 transformer_config = transformer_config,
                                 max_len = max_len,
                                 batch_size = train_batch_size,
                                 tag_encoder = tag_encoder,
                                 tag_outside = tag_outside,
                                 num_workers = num_workers)
    dl_validate = create_dataloader(sentences = dataset_validation.get('sentences'),
                                    tags = dataset_validation.get('tags'),
                                    transformer_tokenizer = transformer_tokenizer,
                                    transformer_config = transformer_config,
                                    max_len = max_len,
                                    batch_size = validation_batch_size,
                                    tag_encoder = tag_encoder,
                                    tag_outside = tag_outside,
                                    num_workers = num_workers)

    optimizer_parameters = network.parameters()

    # NOTE(review): this rounds down, while the loader yields a final
    # partial batch; the schedule may end slightly before training does.
    num_train_steps = int(len(dataset_training.get('sentences')) / train_batch_size * epochs)

    optimizer = AdamW(optimizer_parameters, lr = learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_train_steps
    )

    train_losses = []
    best_valid_loss = np.inf
    # stays None if no epoch ever improves on `best_valid_loss`
    # (e.g. epochs == 0); the network is then returned as trained.
    best_parameters = None

    for epoch in range(epochs):

        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

        train_loss = train(network, dl_train, optimizer, device, scheduler, n_tags)
        train_losses.append(train_loss)
        valid_loss = validate(network, dl_validate, device, n_tags)

        print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}")

        if valid_loss < best_valid_loss:
            # state_dict() hands out references to the live parameter
            # tensors, which later epochs keep updating; a deep copy is
            # needed to really freeze the best epoch's weights.
            best_parameters = copy.deepcopy(network.state_dict())
            best_valid_loss = valid_loss

    # return best model
    if best_parameters is not None:
        network.load_state_dict(best_parameters)

    return network, train_losses, best_valid_loss

def on_task_update(task_id,fisher_dict,opt_param_dict, model, data_loader, optimizer, device, scheduler, n_tags, shared_model):
    """Record the current parameters and a Fisher estimate for one task.

    Every batch of ``data_loader`` is pushed through ``model`` and
    back-propagated. The optimizer is zeroed once up front and never stepped,
    so gradients accumulate over the whole loader and the weights are not
    changed here. Afterwards, for each named parameter of ``shared_model``:

    * ``opt_param_dict[task_id][name]`` receives a copy of the parameter.
    * ``fisher_dict[task_id][name]`` receives the element-wise square of the
      accumulated gradient. Parameters that received no gradient get no
      Fisher entry.

    Args:
        task_id: Key under which this task's entries are stored.
        fisher_dict: Mapping mutated in place; ``fisher_dict[task_id]`` is
            replaced with a fresh dict.
        opt_param_dict: Mapping mutated in place; ``opt_param_dict[task_id]``
            is replaced with a fresh dict.
        model: Network called as ``model(**batch)``; put into train mode.
        data_loader: Iterable of batches; each batch must support ``**``
            unpacking and ``.get('target_tags')`` / ``.get('masks')``.
        optimizer: Only used to clear gradients before accumulation.
        device: Forwarded to ``compute_loss``.
        scheduler: Unused; kept for signature compatibility.
        n_tags: Forwarded to ``compute_loss``.
        shared_model: Module whose named parameters are recorded.

    Returns:
        None. Results are written into ``fisher_dict`` and ``opt_param_dict``.
    """
    model.train()
    # Clear once only: the loop below intentionally accumulates gradients
    # across all batches instead of resetting them per batch.
    optimizer.zero_grad()

    for dl in tqdm(data_loader, total=len(data_loader), desc='Computing Fisher Score and optimal parameters'):
        outputs = model(**dl)
        loss = compute_loss(outputs,
                            dl.get('target_tags'),
                            dl.get('masks'),
                            device,
                            n_tags)
        loss.backward()

    fisher_dict[task_id] = {}
    opt_param_dict[task_id] = {}

    for name, param in shared_model.named_parameters():
        # The parameter snapshot is stored even when no gradient exists.
        opt_param_dict[task_id][name] = param.data.clone()
        if param.grad is None:
            # Parameter did not take part in the backward pass (e.g. frozen
            # or unused): there is nothing to square, so skip the Fisher entry.
            continue
        fisher_dict[task_id][name] = param.grad.data.clone().pow(2)

def train_ewc(task_id,fisher_dict,opt_param_dict, model, data_loader, optimizer, device, scheduler, n_tags, shared_model, ewc_lambda):
    """Run one training epoch with an EWC penalty towards earlier tasks.

    For every batch the task loss from ``compute_loss`` is augmented, for each
    earlier task ``t`` in ``range(task_id)`` and each named parameter ``p`` of
    ``shared_model``, with::

        ewc_lambda * sum(fisher_dict[t][name] * (opt_param_dict[t][name] - p) ** 2)

    Task/parameter pairs without a recorded entry are skipped. The optimizer
    and the scheduler are both stepped once per batch.

    Args:
        task_id: Index of the current task; tasks ``0 .. task_id - 1`` are
            the ones being protected.
        fisher_dict: Per-task mapping ``name -> Fisher tensor``.
        opt_param_dict: Per-task mapping ``name -> anchor parameter tensor``.
        model: Network called as ``model(**batch)``; put into train mode.
        data_loader: Iterable of batches; each batch must support ``**``
            unpacking and ``.get('target_tags')`` / ``.get('masks')``.
        optimizer: Optimizer stepped after each batch.
        device: Forwarded to ``compute_loss``.
        scheduler: Learning-rate scheduler stepped after each batch.
        n_tags: Forwarded to ``compute_loss``.
        shared_model: Module whose named parameters are regularised.
        ewc_lambda: Weight applied to every penalty term.

    Returns:
        The mean over batches of the total loss (task loss plus penalty).

    Raises:
        ZeroDivisionError: If ``data_loader`` has length zero.
    """
    model.train()
    final_loss = 0.0

    for dl in tqdm(data_loader, total=len(data_loader)):

        optimizer.zero_grad()
        outputs = model(**dl)
        loss = compute_loss(outputs,
                            dl.get('target_tags'),
                            dl.get('masks'),
                            device,
                            n_tags)
        for task in range(task_id):
            for name, param in shared_model.named_parameters():
                try:
                    fisher = fisher_dict[task][name]
                    opt_param = opt_param_dict[task][name]
                    penalty = (fisher * (opt_param - param).pow(2)).sum() * ewc_lambda
                except (KeyError, IndexError, RuntimeError):
                    # Best-effort skip: no entry was recorded for this
                    # task/parameter, or the stored tensors are incompatible
                    # with the current parameter. Anything else (including
                    # KeyboardInterrupt) is allowed to propagate.
                    continue
                loss += penalty

        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()

    # Return average loss
    return final_loss / len(data_loader)

def train_model_new_task(network,
                tag_encoder,
                tag_outside,
                transformer_tokenizer,
                transformer_config,
                dataset_training,
                dataset_validation,
                max_len = 128,
                train_batch_size = 16,
                validation_batch_size = 8,
                epochs = 5,
                warmup_steps = 0,
                learning_rate = 5e-5,
                device = None,
                fixed_seed = 42,
                num_workers = 1,
                task_id = 0,
                fisher_dict = {},
                opt_param_dict = {},
                shared_model = None,
                ewc_lambda = 0.2):
    """Train ``network`` on a new task with EWC, then record that task's Fisher data.

    Builds train/validation dataloaders, trains for ``epochs`` epochs with
    ``train_ewc`` (AdamW plus a linear warmup schedule), validates after each
    epoch, and finally calls ``on_task_update`` so that ``fisher_dict`` and
    ``opt_param_dict`` gain an entry for ``task_id``.

    Args:
        network: Model to train; also used as ``shared_model`` when that is
            ``None``.
        tag_encoder: Encoder whose ``classes_`` array gives the tag count.
        tag_outside: Forwarded to ``create_dataloader``.
        transformer_tokenizer: Forwarded to ``create_dataloader``.
        transformer_config: Forwarded to ``create_dataloader``.
        dataset_training: Mapping with ``'sentences'`` and ``'tags'``.
        dataset_validation: Mapping with ``'sentences'`` and ``'tags'``.
        max_len: Forwarded to ``create_dataloader``.
        train_batch_size: Batch size of the training loader; also used to
            derive the scheduler's total step count.
        validation_batch_size: Batch size of the validation loader.
        epochs: Number of training epochs.
        warmup_steps: Warmup steps for the linear schedule.
        learning_rate: AdamW learning rate.
        device: Forwarded to ``train_ewc``, ``validate`` and
            ``on_task_update``.
        fixed_seed: If not ``None``, passed to ``enforce_reproducibility``.
        num_workers: Forwarded to ``create_dataloader``.
        task_id: Index of the task being learned.
        fisher_dict: Per-task Fisher storage, mutated in place.
        opt_param_dict: Per-task parameter snapshots, mutated in place.
        shared_model: Module whose parameters are regularised and recorded.
        ewc_lambda: Weight of the EWC penalty.

    Returns:
        Tuple ``(network, train_losses, best_valid_loss)``. ``network`` holds
        the weights after the final epoch (the best-validation weights are
        not restored); ``best_valid_loss`` is the lowest validation loss seen.

    NOTE(review): ``fisher_dict`` and ``opt_param_dict`` default to shared
    mutable ``{}`` objects, so calls that omit them keep writing into the same
    dicts across calls. The defaults are left as they were because changing
    them would alter what such callers observe; pass explicit dicts to keep
    state scoped to one model.
    """

    if fixed_seed is not None:
        enforce_reproducibility(fixed_seed)

    if shared_model is None:
        shared_model = network

    # compute number of unique tags from encoder.
    n_tags = tag_encoder.classes_.shape[0]

    # Settings shared by the training and the validation dataloader.
    loader_kwargs = dict(transformer_tokenizer = transformer_tokenizer,
                         transformer_config = transformer_config,
                         max_len = max_len,
                         tag_encoder = tag_encoder,
                         tag_outside = tag_outside,
                         num_workers = num_workers)

    # prepare datasets for modelling by creating data readers and loaders
    dl_train = create_dataloader(sentences = dataset_training.get('sentences'),
                                 tags = dataset_training.get('tags'),
                                 batch_size = train_batch_size,
                                 **loader_kwargs)
    dl_validate = create_dataloader(sentences = dataset_validation.get('sentences'),
                                    tags = dataset_validation.get('tags'),
                                    batch_size = validation_batch_size,
                                    **loader_kwargs)

    optimizer_parameters = network.parameters()

    # Total scheduler steps, derived from the sentence count (truncated).
    num_train_steps = int(len(dataset_training.get('sentences')) / train_batch_size * epochs)

    optimizer = AdamW(optimizer_parameters, lr = learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_train_steps
    )

    train_losses = []
    best_valid_loss = np.inf

    for epoch in range(epochs):

        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

        train_loss = train_ewc(task_id,fisher_dict,opt_param_dict, network, dl_train, optimizer, device, scheduler, n_tags, shared_model, ewc_lambda)
        train_losses.append(train_loss)
        valid_loss = validate(network, dl_validate, device, n_tags)

        print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}")

        # Only the best loss value is tracked. The former
        # `best_parameters = network.state_dict()` was never read back, and
        # state_dict() returns references to the live tensors rather than a
        # snapshot, so it has been dropped.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss

    # Fisher information and anchor parameters are taken from the final-epoch
    # weights, which are also the weights returned to the caller.
    on_task_update(task_id,fisher_dict,opt_param_dict, network, dl_train, optimizer, device, scheduler, n_tags, shared_model)

    return network, train_losses, best_valid_loss




In [None]:
from NERDA.datasets import get_conll_data, download_conll_data
# Fetch the CoNLL archive, then load the train and validation splits
# (in that order).
download_conll_data()
training, validation = (get_conll_data(split) for split in ('train', 'valid'))

Reading https://data.deepai.org/conll2003.zip


In [None]:
# B-/I- tag pairs for the four entity types, ordered PER, ORG, LOC, MISC.
# The outside tag is not part of this list.
tag_scheme = [
    f'{prefix}-{entity}'
    for entity in ('PER', 'ORG', 'LOC', 'MISC')
    for prefix in ('B', 'I')
]

In [None]:
# Name of the pretrained transformer checkpoint handed to NERDA below.
transformer = 'bert-base-multilingual-uncased'

In [None]:
# Network hyperparameter.
dropout = 0.1

# Training hyperparameters, one setting per line.
training_hyperparameters = {
    'epochs': 4,
    'warmup_steps': 500,
    'train_batch_size': 13,
    'learning_rate': 0.0001,
}

In [None]:
from NERDA.models import NERDA
# Assemble the NERDA model from the datasets, tag scheme, transformer name
# and hyperparameters defined in the cells above.
model = NERDA(
    dataset_training=training,
    dataset_validation=validation,
    tag_scheme=tag_scheme,
    tag_outside='O',
    transformer=transformer,
    dropout=dropout,
    hyperparameters=training_hyperparameters,
)

Device automatically set to: cpu


Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

In [None]:
# Train the model built above. In the saved run the device was CPU and the
# progress bar stops at 950/1080 of epoch 1 (~17 s/it), so the recorded
# output does not show a completed training run.
model.train()




 Epoch 1 / 4


 88%|████████▊ | 950/1080 [4:29:53<37:18, 17.22s/it]

In [None]:
# Load the held-out 'test' split with the same loader used for train/valid.
test = get_conll_data('test')

In [None]:
# Evaluate the model on the test split loaded above; the result is shown as
# the cell output (no output was saved with this notebook).
model.evaluate_performance(test)

In [None]:
# Run the model on a single raw sentence as a quick sanity check.
# NOTE(review): meaningful only after model.train() has finished.
model.predict_text('Cristiano Ronaldo plays for Juventus FC')