In [91]:
!pip install transformers
# !pip install tqdm



In [0]:
#!/usr/bin/env python

# Python version of the evaluation script from CoNLL'00-

# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported

import sys
import re

from collections import defaultdict, namedtuple

ANY_SPACE = '<SPACE>'

class FormatError(Exception):
    pass

Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')

class EvalCounts(object):
    def __init__(self):
        self.correct_chunk = 0    # number of correctly identified chunks
        self.correct_tags = 0     # number of correct chunk tags
        self.found_correct = 0    # number of chunks in corpus
        self.found_guessed = 0    # number of identified chunks
        self.token_counter = 0    # token counter (ignores sentence breaks)

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_found_correct = defaultdict(int)
        self.t_found_guessed = defaultdict(int)

def parse_args(argv):
    import argparse
    parser = argparse.ArgumentParser(
        description='evaluate tagging results using CoNLL criteria',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    arg = parser.add_argument
    arg('-b', '--boundary', metavar='STR', default='-X-',
        help='sentence boundary')
    arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
        help='character delimiting items in input')
    arg('-o', '--otag', metavar='CHAR', default='O',
        help='alternative outside tag')
    arg('file', nargs='?', default=None)
    return parser.parse_args(argv)

def parse_tag(t):
    m = re.match(r'^([^-]*)-(.*)$', t)
    return m.groups() if m else (t, '')

def evaluate(iterable, options=None):
    if options is None:
        options = parse_args([])    # use defaults

    counts = EvalCounts()
    num_features = None       # number of features per line
    in_correct = False        # currently processed chunks is correct until now
    last_correct = 'O'        # previous chunk tag in corpus
    last_correct_type = ''    # type of previously identified chunk tag
    last_guessed = 'O'        # previously identified chunk tag
    last_guessed_type = ''    # type of previous chunk tag in corpus

    for line in iterable:
        line = line.rstrip('\r\n')

        if options.delimiter == ANY_SPACE:
            features = line.split()
        else:
            features = line.split(options.delimiter)

        if num_features is None:
            num_features = len(features)
        elif num_features != len(features) and len(features) != 0:
            raise FormatError('unexpected number of features: %d (%d)' %
                              (len(features), num_features))

        if len(features) == 0 or features[0] == options.boundary:
            features = [options.boundary, 'O', 'O']
        if len(features) < 3:
            raise FormatError('unexpected number of features in line %s' % line)

        guessed, guessed_type = parse_tag(features.pop())
        correct, correct_type = parse_tag(features.pop())
        first_item = features.pop(0)

        if first_item == options.boundary:
            guessed = 'O'

        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            if (end_correct and end_guessed and
                last_guessed_type == last_correct_type):
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                in_correct = False

        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.found_correct += 1
            counts.t_found_correct[correct_type] += 1
        if start_guessed:
            counts.found_guessed += 1
            counts.t_found_guessed[guessed_type] += 1
        if first_item != options.boundary:
            if correct == guessed and guessed_type == correct_type:
                counts.correct_tags += 1
            counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts

def uniq(iterable):
  seen = set()
  return [i for i in iterable if not (i in seen or seen.add(i))]

def calculate_metrics(correct, guessed, total):
    tp, fp, fn = correct, guessed-correct, total-correct
    p = 0 if tp + fp == 0 else 1.*tp / (tp + fp)
    r = 0 if tp + fn == 0 else 1.*tp / (tp + fn)
    f = 0 if p + r == 0 else 2 * p * r / (p + r)
    return Metrics(tp, fp, fn, p, r, f)

def metrics(counts):
    c = counts
    overall = calculate_metrics(
        c.correct_chunk, c.found_guessed, c.found_correct
    )
    by_type = {}
    # print(c.t_found_guessed.keys())
    # print(uniq(c.t_found_correct.keys() + c.t_found_guessed.keys()))
    # dict_keys = c.t_found_correct.copy()
    # dict_keys.update(c.t_found_guessed.keys)
    list_keys = list(c.t_found_correct.keys())
    list_keys += list(c.t_found_guessed.keys())

    for t in set(list_keys):  # uniq(c.t_found_correct.keys() + c.t_found_guessed.keys()):
        by_type[t] = calculate_metrics(
            c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t]
        )
    return overall, by_type

def report(counts, out=None):
    if out is None:
        out = sys.stdout

    overall, by_type = metrics(counts)

    c = counts
    out.write('processed %d tokens with %d phrases; ' %
              (c.token_counter, c.found_correct))
    out.write('found: %d phrases; correct: %d.\n' %
              (c.found_guessed, c.correct_chunk))

    results_arr = []

    if c.token_counter > 0:
        out.write('accuracy: %6.2f%%; ' %
                  (100.*c.correct_tags/c.token_counter))
        out.write('precision: %6.2f%%; ' % (100.*overall.prec))
        out.write('recall: %6.2f%%; ' % (100.*overall.rec))
        out.write('FB1: %6.2f\n' % (100.*overall.fscore))

    for i, m in sorted(by_type.items()):
        out.write('%17s: ' % i)
        out.write('precision: %6.2f%%; ' % (100.*m.prec))
        out.write('recall: %6.2f%%; ' % (100.*m.rec))
        out.write('FB1: %6.2f  %d\n' % (100.*m.fscore, c.t_found_guessed[i]))
        results_arr.append(100.*m.fscore)

    results_arr.append(100.*overall.fscore)
    return overall.fscore, results_arr

def end_of_chunk(prev_tag, tag, prev_type, type_):
    # check if a chunk ended between the previous and current word
    # arguments: previous and current chunk tags, previous and current types
    chunk_end = False

    if prev_tag == 'E': chunk_end = True
    if prev_tag == 'S': chunk_end = True

    if prev_tag == 'B' and tag == 'B': chunk_end = True
    if prev_tag == 'B' and tag == 'S': chunk_end = True
    if prev_tag == 'B' and tag == 'O': chunk_end = True
    if prev_tag == 'I' and tag == 'B': chunk_end = True
    if prev_tag == 'I' and tag == 'S': chunk_end = True
    if prev_tag == 'I' and tag == 'O': chunk_end = True

    if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
        chunk_end = True

    # these chunks are assumed to have length 1
    if prev_tag == ']': chunk_end = True
    if prev_tag == '[': chunk_end = True

    return chunk_end

def start_of_chunk(prev_tag, tag, prev_type, type_):
    # check if a chunk started between the previous and current word
    # arguments: previous and current chunk tags, previous and current types
    chunk_start = False

    if tag == 'B': chunk_start = True
    if tag == 'S': chunk_start = True

    if prev_tag == 'E' and tag == 'E': chunk_start = True
    if prev_tag == 'E' and tag == 'I': chunk_start = True
    if prev_tag == 'S' and tag == 'E': chunk_start = True
    if prev_tag == 'S' and tag == 'I': chunk_start = True
    if prev_tag == 'O' and tag == 'E': chunk_start = True
    if prev_tag == 'O' and tag == 'I': chunk_start = True

    if tag != 'O' and tag != '.' and prev_type != type_:
        chunk_start = True

    # these chunks are assumed to have length 1
    if tag == '[': chunk_start = True
    if tag == ']': chunk_start = True

    return chunk_start

def eval_f1score(file_):

    with open(file_) as f:
        counts = evaluate(f)
    f1score, fscore_arr = report(counts)
    print(fscore_arr)

    return f1score, fscore_arr

# def main():
#     print(eval_f1score('test.txt'))
# if __name__ == '__main__':
#     main()

In [0]:
from collections import namedtuple
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
# from conlleval import eval_f1score
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel, BertModel, AdamW

In [94]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.is_available()

True

In [0]:
PATH_TRAIN = "train_dataset.txt"
PATH_VAL = "val_dataset.txt"
PATH_AQMAR_TEST = "aqmar_test_dataset.txt"
PATH_NEWS_TEST = "news_test_dataset.txt"
PATH_TWEETS_TEST = "tweets_test_dataset.txt"
FULL_FINETUNE = True

In [0]:
label_to_id = {"O":0, "B-ORG":1, "I-ORG":2, "B-PER":3, "I-PER":4, "B-LOC":5, "I-LOC":6}
id_to_label = {value: key for key, value in label_to_id.items()}

In [0]:
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv01",do_lower_case=False)

In [0]:
def clean_label(label):
    if "B-ORG" in label:
        return "B-ORG"
    elif "I-ORG" in label:
        return "I-ORG"
    elif "B-PER" in label:
        return "B-PER"
    elif "I-PER" in label:
        return "I-PER"
    elif "B-LOC" in label:
        return "B-LOC"
    elif "I-LOC" in label:
        return "I-LOC"
    elif "O" in label:
        return "O"

In [0]:
def preprocess_data(PATH_DATASET, tokenizer, max_length=512):
    data = pd.read_csv(PATH_DATASET, encoding="utf-8", delim_whitespace=True, header=None, skip_blank_lines=False)
    Instance = namedtuple("Instance", ["tokenized_text", "input_ids", "input_mask", "labels", "label_ids"])
    dataset = []
    text = ["[CLS]"]
    labels = ["O"]
    for w, l in zip(data[0], data[1]):
        if str(w) == "nan" and str(l) == "nan":
            text.append("[SEP]")
            labels.append("O")
            
            str_text = " ".join(text)
            tokenized_text = arabert_tokenizer.tokenize(str_text)
            
            cnt = 0 
            new_labels = []
            label_ids = []
            for i in tokenized_text:
                if "##" in i:
                    tok_label = labels[cnt - 1]
                    if "B-" in tok_label:
                        tok_label = tok_label.replace("B-", "I-")
                        
                    tok_label = clean_label(tok_label)
                    new_labels.append(tok_label)
                    label_ids.append(label_to_id[tok_label])
                else:
                    new_labels.append(labels[cnt])
                    label_ids.append(label_to_id[clean_label(labels[cnt])])
                    cnt += 1
                                    
            input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
                
            input_mask = [1] * len(input_ids)
            
            while len(input_ids) < max_length:
                input_ids.append(0)
                input_mask.append(0)
                label_ids.append(label_to_id["O"])
            
            dataset.append(Instance(tokenized_text, input_ids,
                            input_mask, new_labels, label_ids))

            text = ["[CLS]"]
            labels = ["O"]
            continue
            
        
        text.append(str(w))
        labels.append(str(l))
        
        
    return dataset

In [0]:
def transform_to_tensors(dataset):
    tensors_input_ids = []
    tensors_input_mask = []
    tensors_label_ids = []
    for i in dataset:
        tensors_input_ids.append(i.input_ids)
        tensors_input_mask.append(i.input_mask)
        tensors_label_ids.append(i.label_ids)
        
    return torch.tensor(tensors_input_ids), torch.tensor(tensors_input_mask), torch.tensor(tensors_label_ids)

In [0]:
class ModifiedBertForTokenClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels=7):
        super().__init__(config)
        self.num_labels = num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        outputs =  logits # (logits,) + outputs[2:] add hidden states and attention if they are here
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs =  loss # (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


In [0]:
def train(model, optimizer, train_dataloader, val_dataloader, dataset_val, accumulation_steps=32, epochs=1, device="cpu"):
    model.to(device)
    best_f1_score = 0
    best_model = None
    
    for epoch in range(epochs):
        training_loss = 0.0
        val_loss = 0.0

        model.train()
        cnt_step = 0
        for batch in tqdm(train_dataloader):
            
            input_ids, input_mask, label_ids = batch
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            label_ids = label_ids.to(device)
            
            loss = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
            training_loss += loss.data.item()
            
            loss = loss / accumulation_steps
            loss.backward()
            
            if (cnt_step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            cnt_step += 1

        training_loss /= cnt_step
        
        model.eval()
        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                input_ids, input_mask, label_ids = batch
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                label_ids = label_ids.to(device)

                loss = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
                val_loss += loss.data.item()

            val_loss /= len(val_dataloader)

            print("epoch {}: training loss {}, val loss {}".format(epoch, training_loss, val_loss))
            
        f1_score_arr = evaluatemodel(model, "val.txt", dataset_val, val_dataloader)
        
        if f1_score_arr[3] > best_f1_score:
                best_f1_score = f1_score_arr[3]
                best_model = model
                print("We have a better model with an F1 Score: {}".format(best_f1_score))
            
    return best_model

In [0]:
def evaluatemodel(model, filename, dataset, dataloader):
    global id_to_label
    model.eval()
    f1_score = 0
    
    with torch.no_grad():
        fw =  open("{}".format(filename), "w")
        cnt = 0
        for batch in tqdm(dataloader):
            input_ids, input_mask, _ = batch
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            output = model(input_ids=input_ids, attention_mask=input_mask)

            length = len(dataset[cnt].tokenized_text)
            for w in range(length):
                word = dataset[cnt].tokenized_text[w]
                true_label = clean_label(dataset[cnt].labels[w])
                pred_label = id_to_label[torch.argmax(output.squeeze(0)[w]).item()]
                fw.write("{} {} {}\n".format(word, true_label, pred_label))
            fw.write("\n")
            cnt += 1
        fw.close()

        _, f1_score_arr = eval_f1score("{}".format(filename))
        
    return f1_score_arr
    

In [0]:
arabert_model = ModifiedBertForTokenClassification.from_pretrained("aubmindlab/bert-base-arabertv01")

In [0]:
dataset_train = preprocess_data(PATH_TRAIN, arabert_tokenizer)
dataset_val = preprocess_data(PATH_VAL, arabert_tokenizer)
dataset_aqmar_test = preprocess_data(PATH_AQMAR_TEST, arabert_tokenizer)
dataset_news_test = preprocess_data(PATH_NEWS_TEST, arabert_tokenizer)
dataset_tweets_test = preprocess_data(PATH_TWEETS_TEST, arabert_tokenizer)

In [0]:
train_tensors_input_ids, train_tensors_input_mask, train_tensors_label_ids = transform_to_tensors(dataset_train)
val_tensors_input_ids, val_tensors_input_mask, val_tensors_label_ids = transform_to_tensors(dataset_val)
test_aqmar_tensors_input_ids, test_aqmar_tensors_input_mask, test_aqmar_tensors_label_ids = transform_to_tensors(dataset_aqmar_test)
test_news_tensors_input_ids, test_news_tensors_input_mask, test_news_tensors_label_ids = transform_to_tensors(dataset_news_test)
test_tweets_tensors_input_ids, test_tweets_tensors_input_mask, test_tweets_tensors_label_ids = transform_to_tensors(dataset_tweets_test)

In [0]:
train_tensor_dataset = TensorDataset(train_tensors_input_ids, train_tensors_input_mask, train_tensors_label_ids)
val_tensor_dataset = TensorDataset(val_tensors_input_ids, val_tensors_input_mask, val_tensors_label_ids)
test_aqmar_tensor_dataset = TensorDataset(test_aqmar_tensors_input_ids, test_aqmar_tensors_input_mask, test_aqmar_tensors_label_ids)
test_news_tensor_dataset = TensorDataset(test_news_tensors_input_ids, test_news_tensors_input_mask, test_news_tensors_label_ids)
test_tweets_tensor_dataset = TensorDataset(test_tweets_tensors_input_ids, test_tweets_tensors_input_mask, test_tweets_tensors_label_ids)

In [0]:
train_dataloader = DataLoader(train_tensor_dataset, batch_size=1)
val_dataloader = DataLoader(val_tensor_dataset, batch_size=1)
test_aqmar_dataloader = DataLoader(test_aqmar_tensor_dataset, batch_size=1)
test_news_dataloader = DataLoader(test_news_tensor_dataset, batch_size=1)
test_tweets_dataloader = DataLoader(test_tweets_tensor_dataset, batch_size=1)

In [109]:
optimizer_grouped_parameters = None
param_optimizer = list(arabert_model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

if FULL_FINETUNE:
    print('ALL FINETUNE')
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    print('NO ALL FINETUNE')
    optimizer_grouped_parameters = [
        {'params': arabert_model.classifier.parameters(),
         'weight_decay_rate': 0.01}
    ]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

ALL FINETUNE


In [110]:
trained_model = train(arabert_model, optimizer, train_dataloader, val_dataloader, dataset_val, epochs=1, device=device)


  0%|          | 0/4147 [00:00<?, ?it/s][A
  0%|          | 2/4147 [00:00<06:59,  9.89it/s][A
  0%|          | 3/4147 [00:00<07:51,  8.80it/s][A
  0%|          | 4/4147 [00:00<08:19,  8.30it/s][A
  0%|          | 5/4147 [00:00<08:38,  7.99it/s][A
  0%|          | 6/4147 [00:00<08:50,  7.81it/s][A
  0%|          | 7/4147 [00:00<08:59,  7.67it/s][A
  0%|          | 8/4147 [00:01<09:03,  7.61it/s][A
  0%|          | 9/4147 [00:01<09:09,  7.53it/s][A
  0%|          | 10/4147 [00:01<09:14,  7.46it/s][A
  0%|          | 11/4147 [00:01<09:14,  7.46it/s][A
  0%|          | 12/4147 [00:01<09:14,  7.45it/s][A
  0%|          | 13/4147 [00:01<09:17,  7.42it/s][A
  0%|          | 14/4147 [00:01<09:16,  7.43it/s][A
  0%|          | 15/4147 [00:01<09:16,  7.42it/s][A
  0%|          | 16/4147 [00:02<09:17,  7.41it/s][A
  0%|          | 17/4147 [00:02<09:21,  7.36it/s][A
  0%|          | 18/4147 [00:02<09:18,  7.40it/s][A
  0%|          | 19/4147 [00:02<09:17,  7.41it/s][A
  0%|    

epoch 0: training loss 0.26715418260352747, val loss 0.11117234507248429



  0%|          | 6/2556 [00:00<02:00, 21.21it/s][A
  0%|          | 9/2556 [00:00<01:59, 21.23it/s][A
  0%|          | 12/2556 [00:00<02:00, 21.14it/s][A
  1%|          | 15/2556 [00:00<02:00, 21.14it/s][A
  1%|          | 18/2556 [00:00<02:00, 21.10it/s][A
  1%|          | 21/2556 [00:00<02:00, 21.05it/s][A
  1%|          | 24/2556 [00:01<02:00, 20.94it/s][A
  1%|          | 27/2556 [00:01<02:02, 20.71it/s][A
  1%|          | 30/2556 [00:01<02:01, 20.72it/s][A
  1%|▏         | 33/2556 [00:01<02:02, 20.63it/s][A
  1%|▏         | 36/2556 [00:01<02:01, 20.71it/s][A
  2%|▏         | 39/2556 [00:01<02:01, 20.80it/s][A
  2%|▏         | 42/2556 [00:02<02:00, 20.87it/s][A
  2%|▏         | 45/2556 [00:02<02:00, 20.85it/s][A
  2%|▏         | 48/2556 [00:02<02:00, 20.88it/s][A
  2%|▏         | 51/2556 [00:02<01:59, 20.94it/s][A
  2%|▏         | 54/2556 [00:02<01:59, 20.87it/s][A
  2%|▏         | 57/2556 [00:02<01:59, 20.88it/s][A
  2%|▏         | 60/2556 [00:02<01:59, 20.81it/

processed 94170 tokens with 3118 phrases; found: 3255 phrases; correct: 2350.
accuracy:  97.08%; precision:  72.20%; recall:  75.37%; FB1:  73.75
              LOC: precision:  80.49%; recall:  94.60%; FB1:  86.97  1153
              ORG: precision:  53.26%; recall:  51.75%; FB1:  52.50  582
              PER: precision:  73.16%; recall:  72.30%; FB1:  72.73  1520
[86.97282099343954, 52.49788314987298, 72.72727272727273, 73.74862702024164]
We have a better model with an F1 Score: 73.74862702024164


In [111]:
evaluatemodel(trained_model, "val.txt", dataset_val, val_dataloader)
evaluatemodel(trained_model, "test_aqmar.txt", dataset_aqmar_test, test_aqmar_dataloader)
evaluatemodel(trained_model, "test_news.txt", dataset_news_test, test_news_dataloader)
evaluatemodel(trained_model, "test_tweets.txt", dataset_tweets_test, test_tweets_dataloader)


  0%|          | 0/2556 [00:00<?, ?it/s][A
  0%|          | 3/2556 [00:00<02:00, 21.16it/s][A
  0%|          | 6/2556 [00:00<02:00, 21.18it/s][A
  0%|          | 9/2556 [00:00<02:00, 21.20it/s][A
  0%|          | 12/2556 [00:00<02:00, 21.14it/s][A
  1%|          | 15/2556 [00:00<02:00, 21.09it/s][A
  1%|          | 18/2556 [00:00<02:00, 21.10it/s][A
  1%|          | 21/2556 [00:00<02:01, 20.91it/s][A
  1%|          | 24/2556 [00:01<02:01, 20.87it/s][A
  1%|          | 27/2556 [00:01<02:01, 20.77it/s][A
  1%|          | 30/2556 [00:01<02:01, 20.76it/s][A
  1%|▏         | 33/2556 [00:01<02:02, 20.64it/s][A
  1%|▏         | 36/2556 [00:01<02:01, 20.68it/s][A
  2%|▏         | 39/2556 [00:01<02:00, 20.80it/s][A
  2%|▏         | 42/2556 [00:02<02:00, 20.87it/s][A
  2%|▏         | 45/2556 [00:02<02:00, 20.91it/s][A
  2%|▏         | 48/2556 [00:02<01:59, 20.92it/s][A
  2%|▏         | 51/2556 [00:02<01:59, 20.95it/s][A
  2%|▏         | 54/2556 [00:02<01:59, 20.93it/s][A
  2%

processed 94170 tokens with 3118 phrases; found: 3255 phrases; correct: 2350.
accuracy:  97.08%; precision:  72.20%; recall:  75.37%; FB1:  73.75
              LOC: precision:  80.49%; recall:  94.60%; FB1:  86.97  1153
              ORG: precision:  53.26%; recall:  51.75%; FB1:  52.50  582
              PER: precision:  73.16%; recall:  72.30%; FB1:  72.73  1520
[86.97282099343954, 52.49788314987298, 72.72727272727273, 73.74862702024164]



  0%|          | 6/2456 [00:00<01:54, 21.31it/s][A
  0%|          | 9/2456 [00:00<01:55, 21.19it/s][A
  0%|          | 12/2456 [00:00<01:55, 21.20it/s][A
  1%|          | 15/2456 [00:00<01:54, 21.26it/s][A
  1%|          | 18/2456 [00:00<01:54, 21.21it/s][A
  1%|          | 21/2456 [00:00<01:55, 21.16it/s][A
  1%|          | 24/2456 [00:01<01:54, 21.16it/s][A
  1%|          | 27/2456 [00:01<01:54, 21.23it/s][A
  1%|          | 30/2456 [00:01<01:54, 21.28it/s][A
  1%|▏         | 33/2456 [00:01<01:53, 21.33it/s][A
  1%|▏         | 36/2456 [00:01<01:53, 21.24it/s][A
  2%|▏         | 39/2456 [00:01<01:53, 21.22it/s][A
  2%|▏         | 42/2456 [00:01<01:54, 21.10it/s][A
  2%|▏         | 45/2456 [00:02<01:55, 20.95it/s][A
  2%|▏         | 48/2456 [00:02<01:54, 21.03it/s][A
  2%|▏         | 51/2456 [00:02<01:53, 21.16it/s][A
  2%|▏         | 54/2456 [00:02<01:55, 20.82it/s][A
  2%|▏         | 57/2456 [00:02<01:55, 20.84it/s][A
  2%|▏         | 60/2456 [00:02<01:53, 21.07it/

processed 88841 tokens with 2886 phrases; found: 2850 phrases; correct: 1748.
accuracy:  95.43%; precision:  61.33%; recall:  60.57%; FB1:  60.95
              LOC: precision:  70.37%; recall:  60.73%; FB1:  65.19  1090
              ORG: precision:  22.68%; recall:  38.80%; FB1:  28.63  626
              PER: precision:  73.99%; recall:  66.75%; FB1:  70.18  1134
[65.19337016574586, 28.62903225806452, 70.17984107068172, 60.94839609483961]



  2%|▏         | 5/292 [00:00<00:13, 20.72it/s][A
  2%|▏         | 7/292 [00:00<00:13, 20.41it/s][A
  3%|▎         | 10/292 [00:00<00:13, 20.52it/s][A
  4%|▍         | 13/292 [00:00<00:13, 20.50it/s][A
  5%|▌         | 16/292 [00:00<00:13, 20.64it/s][A
  7%|▋         | 19/292 [00:00<00:13, 20.71it/s][A
  8%|▊         | 22/292 [00:01<00:13, 20.59it/s][A
  8%|▊         | 24/292 [00:01<00:13, 20.36it/s][A
  9%|▉         | 27/292 [00:01<00:12, 20.50it/s][A
 10%|█         | 30/292 [00:01<00:12, 20.38it/s][A
 11%|█▏        | 33/292 [00:01<00:12, 20.52it/s][A
 12%|█▏        | 36/292 [00:01<00:12, 20.58it/s][A
 13%|█▎        | 39/292 [00:01<00:12, 20.75it/s][A
 14%|█▍        | 42/292 [00:02<00:12, 20.51it/s][A
 15%|█▌        | 45/292 [00:02<00:11, 20.60it/s][A
 16%|█▋        | 48/292 [00:02<00:11, 20.42it/s][A
 17%|█▋        | 51/292 [00:02<00:11, 20.44it/s][A
 18%|█▊        | 54/292 [00:02<00:11, 20.31it/s][A
 20%|█▉        | 57/292 [00:02<00:11, 20.48it/s][A
 21%|██      

processed 17655 tokens with 1195 phrases; found: 1177 phrases; correct: 872.
accuracy:  94.64%; precision:  74.09%; recall:  72.97%; FB1:  73.52
              LOC: precision:  79.53%; recall:  73.63%; FB1:  76.46  337
              ORG: precision:  53.98%; recall:  51.84%; FB1:  52.89  339
              PER: precision:  84.03%; recall:  88.08%; FB1:  86.01  501
[76.46219686162625, 52.89017341040463, 86.0061287027579, 73.52445193929175]



  1%|          | 6/982 [00:00<00:45, 21.48it/s][A
  1%|          | 9/982 [00:00<00:45, 21.30it/s][A
  1%|          | 12/982 [00:00<00:45, 21.32it/s][A
  2%|▏         | 15/982 [00:00<00:45, 21.35it/s][A
  2%|▏         | 18/982 [00:00<00:45, 21.33it/s][A
  2%|▏         | 21/982 [00:00<00:45, 21.33it/s][A
  2%|▏         | 24/982 [00:01<00:44, 21.31it/s][A
  3%|▎         | 27/982 [00:01<00:45, 21.22it/s][A
  3%|▎         | 30/982 [00:01<00:45, 21.15it/s][A
  3%|▎         | 33/982 [00:01<00:44, 21.10it/s][A
  4%|▎         | 36/982 [00:01<00:44, 21.08it/s][A
  4%|▍         | 39/982 [00:01<00:44, 21.22it/s][A
  4%|▍         | 42/982 [00:01<00:44, 21.21it/s][A
  5%|▍         | 45/982 [00:02<00:44, 21.28it/s][A
  5%|▍         | 48/982 [00:02<00:44, 21.15it/s][A
  5%|▌         | 51/982 [00:02<00:43, 21.19it/s][A
  5%|▌         | 54/982 [00:02<00:43, 21.15it/s][A
  6%|▌         | 57/982 [00:02<00:43, 21.07it/s][A
  6%|▌         | 60/982 [00:02<00:43, 21.17it/s][A
  6%|▋       

processed 22133 tokens with 513 phrases; found: 575 phrases; correct: 290.
accuracy:  95.06%; precision:  50.43%; recall:  56.53%; FB1:  53.31
              LOC: precision:  63.19%; recall:  49.52%; FB1:  55.53  163
              ORG: precision:  31.71%; recall:  39.00%; FB1:  34.98  123
              PER: precision:  51.21%; recall:  72.20%; FB1:  59.92  289
[55.52560646900269, 34.97757847533633, 59.91902834008097, 53.30882352941176]


[55.52560646900269, 34.97757847533633, 59.91902834008097, 53.30882352941176]