https://towardsdatascience.com/easy-fine-tuning-of-transformers-for-named-entity-recognition-d72f2b5340e3


In [None]:
import gdown

# Direct-download form of the Google Drive link. The '/file/d/<id>/view'
# share URL points at an HTML viewer page, so downloading it would save the
# page instead of the .iob corpus.
url = 'https://drive.google.com/uc?id=1s0uwihCFipTyA69Dhr_wA81cKGcQV2FA'

# Local file name the corpus is saved under (current working directory).
output = 'nkjp-nested-simplified-v2.iob'

gdown.download(url, output, quiet=False)

In [None]:
!pip install gdown 

## Preparing dataset 

If you provide your own dataset, it must have the following structure:

- It must be a dictionary.
- The dictionary must contain:
  - `'sentences'`: a list of word-tokenized sentences, with one sentence per entry.
  - `'tags'`: a list with the corresponding named-entity tags.


https://ebanalyse.github.io/NERDA/workflow/

The dataset and the loading functions below come from https://github.com/CLARIN-PL/PolDeepNer.

Data source: `nkjp-nested-simplified-v2.iob` from the PolDeepNer git repository (`PolDeepNer/poldeepner/data`).

In [14]:
def load_file(data_file_path):
    '''Load sentences and tags from an .iob or .tsv file.

    Paths with any other extension are not read at all; two empty lists are
    returned for them.

    :param data_file_path: path to the .iob / .tsv file
    :return: tuple ``(sentences, tags)`` as produced by ``load_iob``
    '''
    supported_suffixes = ('.iob', '.tsv')

    # Guard clause: unsupported extension -> nothing to load.
    if not data_file_path.endswith(supported_suffixes):
        return [], []

    sentences, tag_lists = load_iob(data_file_path)
    return sentences, tag_lists


def load_iob(file_path, extra_features = False):
    """Load sentences and their tags from a tab-separated IOB file.

    The file holds one token per line with tab-separated columns: the first
    column is the word and the last column is its tag. Sentences are
    separated by blank lines. Lines containing "DOCSTART" are ignored.

    For example (<TAB> stands for a tab character):
    ```
    EU<TAB>B-ORG
    rejects<TAB>O
    German<TAB>B-MISC
    call<TAB>O
    .<TAB>O

    Peter<TAB>B-PER
    Blackburn<TAB>I-PER
    ...
    ```

    Args:
        file_path (str): path to the file (read as UTF-8).
        extra_features (bool): when True, each token is returned as a list
            ``[word] + columns[3:-1]`` instead of the bare word.

    Returns:
        tuple(list, list): sentences (one list of tokens per sentence) and
        labels (one list of tags per sentence), in file order.

    Notes:
        * The final sentence is returned even if the file does not end with
          a blank line.
        * Runs of blank lines (or a blank line right after a DOCSTART line)
          do not produce empty sentences.

    Example:
         filename = 'conll2003/en/ner/train.txt'
         data, labels = load_iob(filename)
    """
    sents, labels = [], []
    words, tags = [], []
    # Explicit encoding: the corpus contains non-ASCII (Polish) characters and
    # the platform default encoding is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if "DOCSTART" in line:
                continue
            line = line.rstrip()
            if line:
                cols = line.split('\t')
                if extra_features:
                    words.append([cols[0]] + cols[3:-1])
                else:
                    words.append(cols[0])
                tags.append(cols[-1])
            elif words:
                # Blank line closes the current sentence; skip it when no
                # token has been collected yet.
                sents.append(words)
                labels.append(tags)
                words, tags = [], []
    # Flush the last sentence when the file lacks a terminating blank line.
    if words:
        sents.append(words)
        labels.append(tags)
    return sents, labels

In [16]:
import os

# Prefer the Kaggle-mounted copy of the corpus; when it is not there, fall back
# to the file name the gdown download cell saves into the working directory.
data_path = '../input/nkjp-ner/nkjp-nested-simplified-v2.iob'
if not os.path.exists(data_path):
    data_path = 'nkjp-nested-simplified-v2.iob'

x, y = load_file(data_path)

# load_file returns two empty lists for unsupported extensions, so fail here
# rather than deep inside training.
assert x and len(x) == len(y), 'no sentences loaded from {}'.format(data_path)

In [17]:
# Peek at the first sentence and, on the next line, its tag sequence.
print(x[0], y[0], sep='\n')

In [18]:
# Flatten the per-sentence tag lists to see which labels occur in the raw data.
all_labels = [tag for sentence_tags in y for tag in sentence_tags]
print(set(all_labels))

In [19]:
"""
def transform_labels(labels_list):
  new_labels = []
  for label in labels_list:
    if label == 'O':
      new_labels.append(label)
    elif '#' in label:
      new_labels.append(label.split('#')[0])
    else:
      new_labels.append(label.split('-')[0] + '-' + label.split('-')[1])
  return new_labels

"""
def transform_labels(labels_list):
  """Reduce each label to its last '#'-separated component.

  Labels without a '#' (including the outside tag 'O') are returned
  unchanged, because splitting them yields the label itself.

  :param labels_list: list of tag strings for one sentence
  :return: new list with one simplified tag per input tag
  """
  return [label.split('#')[-1] for label in labels_list]

In [20]:
# Simplify the tags of every sentence; y itself is left untouched.
y_cut = list(map(transform_labels, y))

In [21]:
# Same flattening as for the raw tags, now over the simplified ones.
all_labels = [tag for sentence_tags in y_cut for tag in sentence_tags]
print(set(all_labels))

In [22]:
# Every label except the outside tag 'O'. Sorting gives the same order on every
# kernel restart (iteration order of a set of strings is not stable between
# runs), and the set difference does not raise when 'O' is absent.
tag_scheme = sorted(set(all_labels) - {'O'})

In [23]:
# Show the entity labels (outside tag 'O' already removed) that go to the model.
print(tag_scheme)

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the sentences (test_size=0.1); the rest is the training set.
x_train, x_rem, y_train, y_rem = train_test_split(x, y_cut, test_size=0.1, random_state=0)
# Split the held-out 10% in half (test_size=0.5) into test and validation parts.
# Fixed random_state values keep both splits reproducible.
x_test, x_val, y_test, y_val = train_test_split(x_rem, y_rem, test_size=0.5, random_state=42)

In [25]:
class NERDAtaset:
  """Small container for sentences and tags with a dict-style ``get`` accessor."""

  def __init__(self, x, y):
    """Store the sentences ``x`` and their tag sequences ``y``."""
    self.sentences, self.tags = x, y

  def get(self, attrib):
    """Return the data stored under ``'sentences'`` or ``'tags'``.

    Any other key yields ``None``.
    """
    if attrib in ('sentences', 'tags'):
      # The key names coincide with the attribute names set in __init__.
      return getattr(self, attrib)
    return None

In [26]:
# Wrap each split in the get()-style container defined above.
train_ds = NERDAtaset(x_train, y_train)
val_ds = NERDAtaset(x_val, y_val)
test_ds = NERDAtaset(x_test, y_test)

In [27]:
# Number of tag sequences in the training split.
print(len(train_ds.get('tags')))

In [28]:
# Number of sentences in the training split; it should equal the number of tag sequences.
print(len(train_ds.get('sentences')))

## Preparing model

In [29]:
!pip install transformers

In [30]:
!pip install NERDA

In [32]:
# Identifier of the pretrained transformer handed to NERDA(transformer=...).
transformer = 'allegro/herbert-base-cased'

In [33]:
# Network hyperparameter: dropout value passed to NERDA(dropout=...).
dropout = 0.1
# Training hyperparameters, passed as one dict to NERDA(hyperparameters=...).

training_hyperparameters = {'epochs' : 4,'warmup_steps' : 500,'train_batch_size': 64,'learning_rate': 0.0001}

In [34]:
from NERDA.models import NERDA

In [35]:
# Build the NERDA model from the training/validation splits and the settings
# defined in the cells above. tag_scheme holds every label except the outside
# tag, which is passed separately as tag_outside.
# NOTE(review): max_len = 128 is presumably the maximum sequence length in
# tokens - confirm against the NERDA documentation.
model = NERDA(dataset_training = train_ds,
              dataset_validation = val_ds,
              tag_scheme = tag_scheme, 
              tag_outside = 'O',
              transformer = transformer,
              dropout = dropout,
              hyperparameters = training_hyperparameters,
              max_len = 128)

In [36]:
# Run training with the datasets and hyperparameters given to the constructor.
model.train()

In [37]:
# Evaluate on the held-out test split, which was not passed to the constructor.
model.evaluate_performance(test_ds)

In [38]:
# Quick check on a raw Polish sentence ("America is a continent").
model.predict_text('Ameryka jest kontynentem')

In [42]:
# Save the trained network. The target is the Kaggle working directory, so
# adjust the path when running the notebook elsewhere.
model.save_network('/kaggle/working/nermodel.bin')

In [None]:
!pip install bs4

In [None]:
!pip install nltk

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
import nltk

# nltk.sent_tokenize needs the Punkt sentence models, not the POS tagger.
# Recent NLTK releases look for 'punkt_tab', older ones for 'punkt', so both
# are requested. The tagger download is kept from the original cell.
for resource in ('punkt', 'punkt_tab', 'averaged_perceptron_tagger'):
    nltk.download(resource)


In [None]:
def scrap_and_label(num_articles):
    """Label random Polish Wikipedia articles with the trained NER model.

    Fetches ``num_articles`` random pages from pl.wikipedia.org, runs every
    non-empty ``<p>`` paragraph through the notebook-level ``model`` and
    writes the result to ``labeled_wiki.txt`` in the layout that ``load_iob``
    reads: one ``token<TAB>tag`` pair per line, a blank line after each
    sentence, and a ``-DOCSTART-`` line carrying the page URL in front of
    every article (``load_iob`` skips lines containing "DOCSTART").

    :param num_articles: number of random articles to download and label
    """
    file_output = 'labeled_wiki.txt'
    # Explicit UTF-8 so Polish characters are written the same on every platform.
    with open(file_output, 'w', encoding='utf-8') as file_pointer:
        for _ in range(num_articles):
            # The special page redirects to a random article. The timeout keeps
            # one stalled connection from hanging the whole run.
            response = requests.get(
                'https://pl.wikipedia.org/wiki/Specjalna:Losowa_strona',
                timeout=30,
            )
            soup = BeautifulSoup(response.content, "html.parser")

            # Article separator; response.url is the address after the redirect.
            file_pointer.write('-DOCSTART- {}\n'.format(response.url))

            for paragraph in soup.find_all('p'):
                paragraph_text = paragraph.text.strip()
                if not paragraph_text:
                    continue

                # predict_text takes a raw string (not a list of sentences) and,
                # per the NERDA documentation, returns the word-tokenized
                # sentences together with the predicted tags for each of them.
                sentences, labels = model.predict_text(paragraph_text)
                for tokens, tags in zip(sentences, labels):
                    for token, tag in zip(tokens, tags):
                        file_pointer.write('{}\t{}\n'.format(token, tag))
                    # Blank line marks the end of the sentence.
                    file_pointer.write('\n')

In [None]:
# Label 1000 random articles; this issues one HTTP request per article.
scrap_and_label(1000)