https://towardsdatascience.com/easy-fine-tuning-of-transformers-for-named-entity-recognition-d72f2b5340e3


## Preparing dataset 

If you provide your own dataset, it must have the same structure:

It must be a dictionary
The dictionary must contain
'sentences': a list of word-tokenized sentences with one sentence per entry
'tags': a list with the corresponding named-entity tags.


https://ebanalyse.github.io/NERDA/workflow/

dataset and loading functions from https://github.com/CLARIN-PL/PolDeepNer

data source: nkjp-nested-simplified-v2.iob from git PolDeepNer/poldeepner/data

In [36]:
def load_file(data_file_path):
    ''' Function for loading data from .iob files or file with indices to such files
    :param data_file_path: path to iob
    '''

    x_data, y_data = [], []

    # Get data from iob file
    if data_file_path.endswith('.iob') or data_file_path.endswith('.tsv'):
        x_data, y_data = load_iob(data_file_path)

    return x_data, y_data


def load_iob(file_path, extra_features = False):
    """Loads data and label from a file.

    Args:
        file_path (str): path to the file.
        extra_features(bool): use dictionary features from iob

        The file format is tab-separated values.
        A blank line is required at the end of a sentence.

        For example:
        ```
        EU	B-ORG
        rejects	O
        German	B-MISC
        call	O
        to	O
        boycott	O
        British	B-MISC
        lamb	O
        .	O

        Peter	B-PER
        Blackburn	I-PER
        ...
        ```

    Returns:
        tuple(numpy array, numpy array): data and labels.

    Example:
         filename = 'conll2003/en/ner/train.txt'
         data, labels = load_data_and_labels(filename)
    """
    sents, labels = [], []
    words, tags = [], []
    with open(file_path, 'r') as f:
        for line in f:
            if "DOCSTART" in line:
                continue
            line = line.rstrip()
            if line:
                cols = line.split('\t')
                if extra_features:
                    words.append([cols[0]] + cols[3:-1])
                else:
                    words.append(cols[0])
                tags.append(cols[-1])
            else:
                sents.append(words)
                labels.append(tags)
                words, tags = [], []
        return sents, labels

In [37]:
x, y = load_file('/content/drive/MyDrive/nkjp-nested-simplified-v2.iob')

In [38]:
print(x[0])
print(y[0])

['Zatrzasnął', 'drzwi', 'od', 'mieszkania', ',', 'dwa', 'razy', 'przekręcił', 'klucz', ',', 'nacisnął', 'klamkę', ',', 'by', 'sprawdzić', ',', 'czy', 'dobrze', 'zamknięte', ',', 'zbiegł', 'po', 'schodach', ',', 'minął', 'furtkę', ',', 'także', 'ją', 'zamknął', ',', 'i', 'znalazł', 'się', 'na', 'wąskiej', 'uliczce', 'między', 'ogródkami', ',', 'gdzie', 'drzemały', 'w', 'majowym', 'słońcu', 'trójkątne', 'ciemnozielone', 'świerki', ',', 'jakich', 'nie', 'było', 'w', 'pobliżu', 'jego', 'domu', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [39]:
all_labels = []
for l in y:
  all_labels.extend(l)
print(set(all_labels))

{'I-persName#B-persName-surname', 'B-persName#B-persName-surname', 'I-placeName-settlement', 'I-placeName-bloc', 'I-placeName-country', 'I-persName', 'B-orgName#B-placeName-settlement', 'B-placeName', 'B-placeName-district', 'B-placeName-region', 'B-persName-addName', 'I-persName#I-persName-addName', 'I-orgName', 'B-time', 'I-placeName', 'I-persName#B-persName-addName', 'B-persName', 'B-placeName-country', 'B-orgName', 'B-placeName-bloc', 'I-persName#I-persName-surname', 'I-persName#I-persName-forename', 'I-placeName-district', 'B-placeName-settlement', 'I-persName-surname', 'O', 'I-time', 'B-persName-surname', 'I-orgName#I-placeName-settlement', 'B-geogName', 'B-persName#B-persName-forename', 'I-geogName', 'I-date', 'B-persName-forename', 'I-orgName#B-placeName-settlement', 'I-placeName-region', 'I-persName#B-persName-forename', 'B-date', 'B-persName#B-persName-addName', 'I-persName-forename'}


In [42]:
def transform_labels(labels_list):
  new_labels = []
  for label in labels_list:
    if label == 'O':
      new_labels.append(label)
    elif '#' in label:
      new_labels.append(label.split('#')[-1])
    else:
      new_labels.append(label)
  return new_labels

In [43]:
y_cut = [transform_labels(labels) for labels in y]

In [45]:
all_labels = []
for l in y_cut:
  all_labels.extend(l)
print(set(all_labels))

{'I-placeName-settlement', 'I-placeName-bloc', 'I-placeName-country', 'I-persName', 'B-placeName', 'B-placeName-district', 'B-placeName-region', 'B-persName-addName', 'I-orgName', 'B-time', 'I-placeName', 'B-persName', 'I-persName-addName', 'B-placeName-country', 'B-orgName', 'B-placeName-bloc', 'I-placeName-district', 'B-placeName-settlement', 'I-persName-surname', 'O', 'I-time', 'B-persName-surname', 'B-geogName', 'I-geogName', 'B-persName-forename', 'I-date', 'I-placeName-region', 'B-date', 'I-persName-forename'}


In [46]:
tag_scheme = list(set(all_labels))
tag_scheme.remove('O')

In [47]:
print(tag_scheme)

['I-placeName-settlement', 'I-placeName-bloc', 'I-placeName-country', 'I-persName', 'B-placeName', 'B-placeName-district', 'B-placeName-region', 'B-persName-addName', 'I-orgName', 'B-time', 'I-placeName', 'B-persName', 'I-persName-addName', 'B-placeName-country', 'B-orgName', 'B-placeName-bloc', 'I-placeName-district', 'B-placeName-settlement', 'I-persName-surname', 'I-time', 'B-persName-surname', 'B-geogName', 'I-geogName', 'B-persName-forename', 'I-date', 'I-placeName-region', 'B-date', 'I-persName-forename']


In [48]:
from sklearn.model_selection import train_test_split
x_train, x_rem, y_train, y_rem = train_test_split(x, y_cut, test_size=0.2, random_state=0)
x_test, x_val, y_test, y_val = train_test_split(x_rem, y_rem, test_size=0.5, random_state=42)

In [49]:
class NERDAtaset:
  def __init__(self, x, y):
    self.sentences = x
    self.tags = y
  
  def get(self, attrib):
    if attrib == 'sentences':
      return self.sentences
    elif attrib == 'tags':
      return self.tags

In [50]:
train_ds = NERDAtaset(x_train, y_train)
val_ds = NERDAtaset(x_val, y_val)
test_ds = NERDAtaset(x_test, y_test)

In [51]:
print(len(train_ds.get('tags')))

68530


In [52]:
print(len(train_ds.get('sentences')))

68530


## Preparing model

In [53]:
!pip install transformers



In [54]:
!pip install NERDA



In [55]:
transformer = 'allegro/herbert-base-cased'

In [56]:
# hyperparameters for network
dropout = 0.1
# hyperparameters for training
training_hyperparameters = {'epochs' : 4, 'warmup_steps' : 500,'train_batch_size': 13, 'learning_rate': 0.0001}

In [57]:
from NERDA.models import NERDA

In [58]:
model = NERDA(dataset_training = train_ds,
              dataset_validation = val_ds,
              tag_scheme = tag_scheme, 
              tag_outside = 'O',
              transformer = transformer,
              dropout = dropout,
              hyperparameters = training_hyperparameters,
              max_len = 128)

Device automatically set to: cuda


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.decoder.weight', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model.train()


 Epoch 1 / 4


 22%|██▏       | 1166/5272 [12:06<42:50,  1.60it/s]

In [None]:
model.evaluate_performance(test_ds)

In [None]:
model.save_network('nermodel2.bin')

In [24]:
%cp /content/nermodel.bin /content/drive/MyDrive/ 

In [25]:
from NERDA.precooked import Precooked

In [26]:

loaded_model = Precooked(tag_scheme = tag_scheme,
                     tag_outside = 'O',
                     transformer = transformer,
                     max_len = 128)

Device automatically set to: cuda


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.decoder.weight', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
loaded_model.load_network_from_file('/content/drive/MyDrive/nermodel.bin')

'Weights for network loaded from /content/drive/MyDrive/nermodel.bin'

In [29]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [30]:
loaded_model.predict_text('Adrianna Klank będzie studiować w Edynburgu.')

([['Adrianna', 'Klank', 'będzie', 'studiować', 'w', 'Edynburgu', '.']],
 [['B-persName', 'I-persName', 'O', 'O', 'O', 'B-placeName', 'O']])

In [31]:
loaded_model.evaluate_performance(test_ds)

Unnamed: 0,Level,F1-Score,Precision,Recall
0,B-geogName,0.744,0.745989,0.742021
1,I-orgName,0.891977,0.864466,0.921296
2,B-time,0.894737,0.864407,0.927273
3,I-placeName,0.8,0.78125,0.819672
4,I-geogName,0.809353,0.845865,0.775862
5,B-persName,0.946937,0.942843,0.951066
6,I-persName,0.95357,0.955956,0.951195
7,I-date,0.977001,0.972527,0.981516
8,B-date,0.959432,0.959432,0.959432
9,B-placeName,0.921805,0.915982,0.927703
