# Imports and Functions

In [31]:
%%capture
!pip install datasets -q
!pip install transformers -q
!pip install seqeval -q
!pip install ray[tune] -q
!pip install numpyencoder -q
!pip install wandb -q
!pip install ray==2.2.0 -q
!pip install wandb -q

In [2]:
import os
import itertools
import pathlib
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import time
from numpyencoder import NumpyEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import load_metric

In [3]:
# Dataset functions
def pandas2json(df, fname: str):
    """Write a dataframe to disk as JSON Lines (one JSON object per row).

    Every row becomes ``{"text": ..., "tags": ...}``, read positionally from
    the ``text`` and ``tags`` columns, and is written on its own line.

    Args:
        df (pd.DataFrame): frame holding the ``text`` and ``tags`` columns.
        fname (str): output path; the file is overwritten and encoded as UTF-8.
    """
    # All records are collected before the file is opened.
    records = [
        {"text": df['text'].iloc[row], "tags": df['tags'].iloc[row]}
        for row in range(len(df))
    ]

    with open(fname, 'w', encoding='utf8') as sink:
        for record in records:
            json.dump(record, sink, ensure_ascii=False)
            sink.write('\n')


def json2dict(fname: str, mode='r', encoding='utf8'):
    """Parse a JSON file and return its content.

    Args:
        fname (str): path of the JSON file.
        mode (str): mode passed to ``open``.
        encoding (str): text encoding passed to ``open``.

    Returns:
        Whatever ``json.load`` produces for the file (a dict, a list, ...).
    """
    with open(fname, mode, encoding=encoding) as source:
        return json.load(source)


def dict2json(data: list, fname: str,
                sort_keys=False, indent=None):
    """Serialize `data` to a UTF-8 JSON file.

    NumPy values are handled by ``NumpyEncoder``; non-ASCII characters are
    written as-is rather than escaped.

    Args:
        data (list[dict]): data in NM format:
            {'text': str,
            'entities': list[{'start': int, 'end': int, 'label': str, 'value': str}],
            'anottation_status': str,
            'notes': str}
        fname (str): output file (overwritten)
        sort_keys (bool): forwarded to ``json.dump``
        indent (int | None): forwarded to ``json.dump``
    """
    with open(fname, 'w', encoding='utf8') as sink:
        dump_options = dict(ensure_ascii=False, sort_keys=sort_keys,
                            indent=indent, cls=NumpyEncoder)
        json.dump(data, sink, **dump_options)

In [4]:
# Balancing functions
def balance_datasets(d1: list, d2: list, upper_limit=0.75,
                    balancing_range=0.2, names_list=None):
    """Rebalance two NM NER datasets in place, entity by entity.

    Entities are counted in both datasets, documents are moved between them
    until each entity's share in `d1` falls inside the target band, and the
    ``None`` placeholders left behind by moved documents are removed.

    Args:
        d1, d2 (list[dict]): documents, each with an 'entities' list whose
            items carry a 'label'. Both lists are modified in place.
        upper_limit (float): upper bound for the share of an entity in `d1`.
        balancing_range (float): width of the accepted band below `upper_limit`.
        names_list (list[str] | None): entity names to register up front.
    """
    # Entity statistics of each dataset
    counts_d1 = __count_entities(d1, names_list)
    counts_d2 = __count_entities(d2, names_list)

    __realizar_correcao(d1, d2, counts_d1, counts_d2,
                        upper_limit=upper_limit,
                        balancing_range=balancing_range)

    for dataset in (d1, d2):
        __remove_null(dataset)


def __count_entities(dataset, names_list=None):
    """Count entity labels in a dataset.

    Returns a entities dict in the format:
    {
        'names': list[str]                      # labels, in order of registration
        'ent_count': {'name': count (int)},     # dataset-wise
        'doc_count': [{'name': count (int)}]    # element-wise: doc_count[i] describes dataset[i]
        'pos': {'name': list[int]}              # dataset index of every occurrence
    }

    Args:
        dataset (list[dict | None]): documents with an 'entities' list whose
            items carry a 'label'. ``None`` entries are counted as empty.
        names_list (list[str] | None): labels to register before scanning, so
            they are present (with count 0) even if they never occur.
    """
    names, ent_count, doc_count, pos = [], {}, [], {}
    if names_list:
        for name in names_list:
            names.append(name)
            ent_count[name] = 0
            pos[name] = []

    for idx, doc in enumerate(dataset):
        doc_ent_count = {k: 0 for k in names}

        # A None document still gets an (all-zero) entry: doc_count is indexed
        # with dataset positions, so the two lists must stay the same length.
        if doc is not None:
            for entity in doc['entities']:
                ent_name = entity['label']
                if ent_name not in names:
                    # First time this label is seen: register it everywhere,
                    # including the per-document counts already collected.
                    names.append(ent_name)
                    ent_count[ent_name] = 0
                    pos[ent_name] = []
                    doc_ent_count[ent_name] = 0
                    for previous_count in doc_count:
                        previous_count[ent_name] = 0

                ent_count[ent_name] += 1
                pos[ent_name].append(idx)
                doc_ent_count[ent_name] += 1

        doc_count.append(doc_ent_count)

    return {'names': names, 'ent_count': ent_count,
            'doc_count': doc_count, 'pos': pos}


def __transfer_entity(destination, source, idx):
    """Append source[idx] to `destination` and leave None in its old slot.

    The slot is blanked rather than deleted so the remaining positions in
    `source` do not shift.
    """
    moved_doc = source[idx]
    destination.append(moved_doc)
    source[idx] = None


def __balance_entity(destination, source, entities_dest, entities_src,
                    qtd, entity):
    """Move documents containing `entity` from `source` to `destination`.

    Documents are taken in dataset order until at least ``abs(qtd)``
    occurrences of `entity` have been moved, or until no document with that
    entity is left in `source`. The latter stop condition prevents the loop
    from spinning forever when fewer occurrences are available than requested.

    Args:
        destination, source (list): datasets; moved documents are appended to
            `destination` and replaced by ``None`` in `source`.
        entities_dest, entities_src (dict): statistics from __count_entities.
            Their 'ent_count' is updated for every label of a moved document,
            and the moved document's entry in the source 'doc_count' is zeroed
            so it is never picked again.
        qtd (int): number of occurrences to move (the sign is ignored).
        entity (str): label being balanced.
    """
    remaining = abs(qtd)
    for idx, doc in enumerate(entities_src['doc_count']):
        if remaining <= 0:
            break

        occurrences = doc.get(entity, 0)
        if not occurrences:
            continue

        remaining -= occurrences
        __transfer_entity(destination, source, idx)

        # The whole document moves, so every label it carries changes side.
        for entity_name, count in doc.items():
            entities_src['ent_count'][entity_name] -= count
            # .get(): the destination may never have seen this label.
            entities_dest['ent_count'][entity_name] = (
                entities_dest['ent_count'].get(entity_name, 0) + count
            )
            doc[entity_name] = 0


def __realizar_correcao(d1, d2, entities_d1, entities_d2, upper_limit=0.75,
                        balancing_range=0.10):
    """Apply the balancing correction for every entity known to `d1`.

    For each entity, the share held by `d1` is compared with the band
    ``[upper_limit - balancing_range, upper_limit]``. Outside the band,
    enough occurrences are moved to bring the share to the middle of it.

    Entities with no occurrence in either dataset are skipped (their share is
    undefined), and an entity missing from `d2`'s counts is treated as 0.

    Args:
        d1, d2 (list): datasets, modified in place.
        entities_d1, entities_d2 (dict): statistics from __count_entities.
        upper_limit (float): upper bound of the accepted share for `d1`.
        balancing_range (float): width of the accepted band.
    """
    for entity in entities_d1['names']:
        e1 = entities_d1['ent_count'].get(entity, 0)
        e2 = entities_d2['ent_count'].get(entity, 0)
        total = e1 + e2
        if total == 0:
            # Nothing to balance; also avoids dividing by zero below.
            continue

        percent = e1 / total
        unit_percent = 1 / total  # share represented by a single occurrence

        # destination = d2, source = d1
        if percent > upper_limit:
            qtd = (percent - upper_limit + balancing_range/2) / unit_percent
            __balance_entity(d2, d1, entities_d2, entities_d1, round(qtd), entity)

        # destination = d1, source = d2
        if percent < upper_limit - balancing_range:
            qtd = (upper_limit - percent - balancing_range/2) / unit_percent
            __balance_entity(d1, d2, entities_d1, entities_d2, round(qtd), entity)


def __remove_null(dataset):
    """Remove every ``None`` entry from `dataset`, in place.

    The list is rebuilt in a single pass with an identity check. This avoids
    removing items from a list while iterating over it, and avoids the
    equality comparisons of ``list.remove``.
    """
    # Slice assignment keeps the same list object, so callers see the change.
    dataset[:] = [doc for doc in dataset if doc is not None]

In [5]:
# Stats functions
def get_entities_percentage(entities_d1, entities_d2, print_results=True):
    """Report, per entity, the share of its occurrences held by the first dataset.

    Counts are matched by entity name, so the result does not depend on the
    two datasets having registered their labels in the same order. An entity
    missing from `entities_d2` counts as 0 there. An entity with no occurrence
    in either dataset is reported as ``nan``.

    Args:
        entities_d1, entities_d2 (dict): statistics from __count_entities.
        print_results (bool): also print the report.

    Returns:
        str: one ``'<share>\\t<entity>\\n'`` line per name in `entities_d1`.
    """
    counts_d1 = entities_d1['ent_count']
    counts_d2 = entities_d2['ent_count']

    lines = []
    for entity in entities_d1['names']:
        e1 = counts_d1.get(entity, 0)
        e2 = counts_d2.get(entity, 0)
        total = e1 + e2
        percent = e1 / total if total else float('nan')
        lines.append(f'{percent}\t{entity}\n')

    text = ''.join(lines)

    if print_results:
        print(text, end='')

    return text


def get_entities_count(entities_d1, print_results=True):
    """Report the number of occurrences of each entity in one dataset.

    Args:
        entities_d1 (dict): statistics from __count_entities.
        print_results (bool): also print the report.

    Returns:
        str: one ``'<count> \\t<entity>\\n'`` line per entity.
    """
    count_name_pairs = zip(entities_d1['ent_count'].values(), entities_d1['names'])
    text = ''.join(f'{count} \t{entity}\n' for count, entity in count_name_pairs)

    if print_results:
        print(text, end='')

    return text

In [6]:
# Tokenizing functions
def get_ent_label(entity_name: str) -> int:
    """Map an entity name to the label id of its first token.

    'CABECALHO' maps to 1 and 'SUBCABECALHO' to 3; any other name maps to 5.
    """
    if entity_name == 'CABECALHO':
        return 1
    if entity_name == 'SUBCABECALHO':
        return 3
    return 5


def create_label_vector(doc, input_ids, tokenizer):
    """Build the per-token label list for one tokenized window of a document.

    Each entity's text is tokenized on its own and searched for in
    `input_ids`. The first matching position receives the entity's label id
    and the following tokens of the match receive that id plus one. Tokens
    with id 101 or 102 are set to -100. Every other position stays 0.

    Args:
        doc (dict): document with 'text' and an 'entities' list whose items
            carry 'label', 'start' and 'end'.
        input_ids (list[int]): token ids of the window being labelled.
        tokenizer: callable returning a mapping with 'input_ids'.

    Returns:
        list[int]: label ids, at least 512 long (longer if `input_ids` is).
    """
    # Integer dtype: these are class ids, so the returned list holds ints.
    # The length grows with the input so long windows cannot index past the end.
    vetor = np.zeros(max(512, len(input_ids)), dtype=np.int64)

    for ent_dict in doc['entities']:
        ent_label = get_ent_label(ent_dict['label'])
        entidade = doc['text'][ent_dict['start'] : ent_dict['end']]

        # The first and last ids are dropped before matching.
        # NOTE(review): assumes the tokenizer adds exactly one leading and one
        # trailing special token - confirm for the tokenizer in use.
        entity_tokens = tokenizer(entidade, is_split_into_words=False)['input_ids'][1:-1]
        n_tokens = len(entity_tokens)
        if n_tokens == 0:
            # Nothing to look for (e.g. an empty span).
            continue

        # NOTE(review): only the first occurrence of the token sequence in the
        # window is labelled; the entity's character offsets are not used.
        for token_idx, input_id in enumerate(input_ids):
            if input_id != entity_tokens[0]:
                continue
            if entity_tokens == input_ids[token_idx : token_idx + n_tokens]:
                vetor[token_idx] = ent_label
                vetor[token_idx + 1 : token_idx + n_tokens] = ent_label + 1
                break

    # Ids 101 and 102 are excluded with -100 (presumably [CLS]/[SEP] - the
    # values are hard-coded, confirm against the tokenizer vocabulary).
    for idx, token_id in enumerate(input_ids):
        if token_id == 101 or token_id == 102:
            vetor[idx] = -100

    return vetor.tolist()


def tokenize_dataset(dataset, tokenizer, stride=0):
    """Tokenize every document into fixed-size, labelled windows.

    Each text is padded/truncated to 512 tokens. Overflowing tokens are
    returned as additional windows, with `stride` passed to the tokenizer.
    Every window becomes one training example.

    Args:
        dataset (list[dict]): documents with 'text' and 'entities'.
        tokenizer: tokenizer called on each text.
        stride (int): forwarded to the tokenizer.

    Returns:
        list[dict]: examples with 'input_ids', 'attention_mask' and 'labels'.
    """
    examples = []
    for doc in dataset:
        encoding = tokenizer(
            doc['text'],
            padding='max_length',
            truncation=True,
            stride=stride,
            max_length=512,
            is_split_into_words=False,
            return_overflowing_tokens=True,
        )

        # One entry in the overflow mapping per window produced for this text.
        window_count = len(encoding['overflow_to_sample_mapping'])
        for window in range(window_count):
            window_ids = encoding.input_ids[window]
            examples.append({
                'input_ids': window_ids,
                'attention_mask': encoding.attention_mask[window],
                'labels': create_label_vector(doc, window_ids, tokenizer),
            })

    return examples

In [40]:
# Training class
class NM_Trainer():
    """Trainer for NM dataset.
    Expects the train and test datasets to already be tokenized and balanced.

    Wraps a Hugging Face ``Trainer`` that fine-tunes
    'neuralmind/bert-base-portuguese-cased' for token classification.
    """
    def __init__(self,
                treino: list,
                teste: list,
                label_names: dict,
                metric,
                entities_names: list = None,
                tokenizer = None,
                use_wandb = False,
                wandb_run_name = None,
                learning_rate=4.076831342095183e-05,
                num_train_epochs=3,
                per_device_train_batch_size=4,
                gradient_accumulation_steps=2,
                eval_accumulation_steps=2,
                ) -> None:
        """Store the data and build the underlying ``Trainer``.

        Args:
            treino: tokenized training examples.
            teste: tokenized evaluation examples.
            label_names: mapping from label id to label string; its size is
                the number of classes of the model head.
            metric: object exposing ``compute(predictions=..., references=...)``.
            entities_names: entity names; stored only.
            tokenizer: tokenizer to use; the BERTimbau one is loaded if None.
            use_wandb: report the run to Weights & Biases.
            wandb_run_name: run name used when `use_wandb` is set
                ("huggingface" if None).
            learning_rate, num_train_epochs, per_device_train_batch_size,
            gradient_accumulation_steps, eval_accumulation_steps:
                forwarded to ``TrainingArguments``.
        """
        self.treino = treino
        self.teste = teste
        self.metric = metric
        self.label_names = label_names
        self.entities_names = entities_names
        self.tokenizer = tokenizer
        if tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased',
                                                            do_lower_case=False)
        self.trainer = self.__get_trainer(
                learning_rate=learning_rate,
                num_train_epochs=num_train_epochs,
                per_device_train_batch_size=per_device_train_batch_size,
                gradient_accumulation_steps=gradient_accumulation_steps,
                eval_accumulation_steps=eval_accumulation_steps,
                use_wandb=use_wandb,
                wandb_run_name=wandb_run_name,
        )

    def train(self):
        """Run training and return the result of ``Trainer.train``."""
        return self.trainer.train()

    def return_metrics(self) -> dict:
        """Predict on the test set and return the full metric report."""
        predictions, labels, _ = self.trainer.predict(self.teste)
        true_predictions, true_labels = self._decode_predictions(predictions, labels)

        return self.metric.compute(predictions=true_predictions, references=true_labels)

    def _decode_predictions(self, predictions, labels):
        """Turn raw scores and label ids into aligned lists of label strings.

        The class with the highest score is taken along the last axis, and
        positions whose reference label is -100 are dropped from both lists.
        """
        predicted_ids = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [self.label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predicted_ids, labels)
        ]
        true_labels = [
            [self.label_names[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predicted_ids, labels)
        ]

        return true_predictions, true_labels

    def __get_trainer(self,
                    learning_rate=4.076831342095183e-05,
                    num_train_epochs=3,
                    per_device_train_batch_size=4,
                    gradient_accumulation_steps=2,
                    eval_accumulation_steps=2,
                    use_wandb=False,
                    wandb_run_name=None,):
        """Build the Hugging Face ``Trainer`` for the stored datasets."""
        def model_init():
            # The head size follows the label map instead of a fixed 7.
            return AutoModelForTokenClassification.from_pretrained(
                "neuralmind/bert-base-portuguese-cased",
                num_labels=len(self.label_names),
            )

        def compute_metrics(p):
            predictions, labels = p
            true_predictions, true_labels = self._decode_predictions(predictions, labels)

            results = self.metric.compute(predictions=true_predictions, references=true_labels)

            return {
                    "precision": results["overall_precision"],
                    "recall": results["overall_recall"],
                    "f1": results["overall_f1"],
                    }

        # Reporting must be configured on the arguments *before* the Trainer
        # is created: the Trainer picks its reporting callbacks from
        # `args.report_to` at construction time.
        reporting_args = {}
        if use_wandb:
            reporting_args["report_to"] = "wandb"
            reporting_args["run_name"] = (
                wandb_run_name if wandb_run_name is not None else "huggingface"
            )

        data_collator = DataCollatorForTokenClassification(self.tokenizer)
        batch_size = per_device_train_batch_size
        # At least 1: a train set smaller than one batch would otherwise yield 0.
        logging_steps = max(1, len(self.treino) // batch_size)
        epochs = num_train_epochs
        training_args = TrainingArguments(
            output_dir = "results",
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            evaluation_strategy = "epoch",
            metric_for_best_model = "f1",
            disable_tqdm = False,
            logging_steps = logging_steps,
            gradient_accumulation_steps = gradient_accumulation_steps,
            eval_accumulation_steps = eval_accumulation_steps,
            learning_rate = learning_rate,
            **reporting_args,
        )
        trainer = Trainer(
            model_init=model_init,
            args=training_args,
            train_dataset=self.treino,
            eval_dataset=self.teste,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics
        )

        return trainer

In [36]:
# Pipeline
def get_trainer(
                dataset: list,
                label_names,
                metric,
                balance=True,
                stride=256,
                tokenizer=None,
                test_size=0.25,
                random_state=42,
                balancing_upper_limit=0.75,
                balancing_range=0.40,
                entities_names=None,
                use_wandb = False,
                wandb_run_name = None,
                learning_rate=4.076831342095183e-05,
                num_train_epochs=3,
                per_device_train_batch_size=4,
                gradient_accumulation_steps=2,
                eval_accumulation_steps=2,
                ):
    """Split, balance and tokenize `dataset`, then build an NM_Trainer for it.

    Args:
        dataset: documents with 'text' and 'entities'.
        label_names: mapping from label id to label string.
        metric: metric object handed to the trainer.
        balance: rebalance entities between the train and test splits.
        stride: forwarded to the tokenizer when texts overflow.
        tokenizer: tokenizer to use; the BERTimbau one is loaded if not given.
        test_size, random_state: forwarded to ``train_test_split``.
        balancing_upper_limit, balancing_range: forwarded to balance_datasets.
        entities_names: entity names registered before balancing.
        use_wandb, wandb_run_name: Weights & Biases reporting options.
        learning_rate, num_train_epochs, per_device_train_batch_size,
        gradient_accumulation_steps, eval_accumulation_steps:
            training hyperparameters, forwarded to NM_Trainer.

    Returns:
        NM_Trainer: trainer ready for ``train()``.
    """
    # Train/test split
    treino, teste = train_test_split(dataset,
                                    test_size=test_size,
                                    random_state=random_state)

    # Balancing (moves documents between the two splits in place)
    if balance:
        balance_datasets(treino, teste,
                        upper_limit=balancing_upper_limit,
                        balancing_range=balancing_range,
                        names_list=entities_names)

    # Tokenization
    if not tokenizer:
        tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased',
                                                    do_lower_case=False)
    treino = tokenize_dataset(treino, tokenizer,
                                stride=stride)
    teste = tokenize_dataset(teste, tokenizer,
                                stride=stride)

    # The caller's hyperparameters are forwarded; hard-coding them here would
    # make every configuration train with the same values.
    trainer = NM_Trainer(treino, teste,
                        label_names=label_names,
                        metric=metric,
                        entities_names=entities_names,
                        tokenizer=tokenizer,
                        use_wandb=use_wandb,
                        wandb_run_name=wandb_run_name,
                        learning_rate=learning_rate,
                        num_train_epochs=num_train_epochs,
                        per_device_train_batch_size=per_device_train_batch_size,
                        gradient_accumulation_steps=gradient_accumulation_steps,
                        eval_accumulation_steps=eval_accumulation_steps,)

    return trainer

def run_test(
        dataset: list,
        label_names,
        metric,
        balance=True,
        stride=256,
        tokenizer=None,
        test_size=0.25,
        random_state=42,
        balancing_upper_limit=0.75,
        balancing_range=0.40,
        entities_names=None,
        use_wandb = False,
        wandb_run_name = None,
        learning_rate=4.076831342095183e-05,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        eval_accumulation_steps=2,
        ):
    """Train one configuration and return its metrics on the test split.

    All arguments are forwarded unchanged to get_trainer. In particular the
    training hyperparameters are the ones received here, so different
    configurations really train differently.

    Returns:
        dict: the report produced by ``NM_Trainer.return_metrics``.
    """
    trainer = get_trainer(dataset=dataset,
                        label_names=label_names,
                        metric=metric,
                        balance=balance,
                        stride=stride,
                        tokenizer=tokenizer,
                        test_size=test_size,
                        random_state=random_state,
                        balancing_upper_limit=balancing_upper_limit,
                        balancing_range=balancing_range,
                        entities_names=entities_names,
                        use_wandb=use_wandb,
                        wandb_run_name=wandb_run_name,
                        learning_rate=learning_rate,
                        num_train_epochs=num_train_epochs,
                        per_device_train_batch_size=per_device_train_batch_size,
                        gradient_accumulation_steps=gradient_accumulation_steps,
                        eval_accumulation_steps=eval_accumulation_steps,
                        )

    trainer.train()

    return trainer.return_metrics()


def test_with_checkpoints(params_list,
                        output_name,
                        dataset: list,
                        label_names,
                        metric,
                        entities_names=None,
                        output_dir='checkpoints/',
                        step=0.1,
                        use_wandb=False,
                        wandb_config=None,
                        ):
    """Run one training per parameter set and save the collected results.

    Results are accumulated as ``{'run<N>': {'parameters': ..., 'result': ...}}``.
    They are written to `output_dir` after run 10 (intermediate checkpoint)
    and once more, as ``*_final.json``, after the last run.

    Args:
        params_list (list[dict]): keyword arguments for run_test, one dict per run.
        output_name (str): prefix of the result files.
        dataset, label_names, metric, entities_names: forwarded to run_test.
        output_dir (str): directory for the result files (created if missing).
        step (float): currently unused.
            NOTE(review): the checkpoint schedule is fixed to run 10; confirm
            whether a `step`-based schedule should be restored.
        use_wandb (bool): open a Weights & Biases run around each training.
        wandb_config (dict | None): logged as the W&B run config, except for
            'api_key', which is never logged.
            NOTE(review): 'project'/'entity' in this dict are only logged as
            config values and do not select the W&B project - confirm intent.

    Returns:
        dict: the accumulated results.
    """
    checkpoints = [10]
    test_results = {}

    if use_wandb:
        # Imported here because the notebook's import cell does not provide it.
        import wandb

        # Keep credentials out of the run config, which W&B stores with the run.
        run_config = None
        if wandb_config is not None:
            run_config = {k: v for k, v in wandb_config.items() if k != "api_key"}

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for run, parameters in enumerate(params_list, start=1):
        timestr = time.strftime("%Y%m%d-%H%M%S")
        wandb_run_name = f"{timestr}_{run}_" + "_".join([f'{k}-{v}' for k,v in parameters.items()])

        wandb_run = None
        if use_wandb:
            wandb_run = wandb.init(reinit=True, name=wandb_run_name,
                                    config=run_config)

        try:
            result = run_test(
                dataset=dataset,
                label_names=label_names,
                metric=metric,
                entities_names=entities_names,
                use_wandb=use_wandb,
                wandb_run_name=wandb_run_name,
                **parameters
            )
        finally:
            # Close the W&B run even if the training fails.
            if wandb_run is not None:
                wandb_run.finish()

        test_results[f'run{run}'] = {
            'parameters': parameters,
            'result': result,
        }
        if run in checkpoints:
            fname = os.path.join(output_dir, f"{output_name}_{timestr}_run{run}.json")
            dict2json(test_results, fname, sort_keys=False, indent=2)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    dict2json(test_results,
                os.path.join(output_dir, f"{output_name}_{timestr}_final.json"),
                sort_keys=False, indent=2)

    return test_results

# Dataset and globals

In [9]:
!gdown "1XYdcOxnr-esES8bwKezTb6MYMfhQQriW"

Downloading...
From: https://drive.google.com/uc?id=1XYdcOxnr-esES8bwKezTb6MYMfhQQriW
To: /content/NM_dataset.json
100% 11.6M/11.6M [00:00<00:00, 75.4MB/s]


In [11]:
# globals
data_path = "NM_dataset.json"
dataset = list(json2dict(data_path))
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
metric = load_metric("seqeval")
# NOTE(review): 'COMECO RECORTE' is written with a space here but with an
# underscore in label_names below - confirm which spelling the dataset uses.
entities_names = ['COMECO RECORTE', 'CABECALHO', 'SUBCABECALHO']
# Label ids: each entity has a B- (first token) id followed by its I- id.
label_names={
    0: 'O',
    1: 'B-CABECALHO',
    2: 'I-CABECALHO',
    3: 'B-SUBCABECALHO',
    4: 'I-SUBCABECALHO',
    5: 'B-COMECO_RECORTE',
    6: 'I-COMECO_RECORTE',
}
wandb_config = {
    "project": "SWNM",
    "entity": "chinagab",
    # Credentials must not live in the notebook: the key is read from the
    # WANDB_API_KEY environment variable (None if unset). The key that was
    # previously committed here should be treated as compromised and rotated.
    "api_key": os.environ.get("WANDB_API_KEY"),
    "log_config": True
}

Downloading (…)okenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

# Running tests

In [22]:
# Hyperparameter grid: each key maps to the list of candidate values to try.
# Every combination of one value per key becomes one training run.
test_hyperparameters = {
    'learning_rate': [4.076831342095183e-05],
    'num_train_epochs': [3],
    'per_device_train_batch_size': [4],
    'gradient_accumulation_steps': [2],
    'eval_accumulation_steps': [2],
}

In [23]:
# Expand the grid into one {parameter: value} dict per combination.
params_used = list(test_hyperparameters)
params_list = [
    dict(zip(params_used, combination))
    for combination in itertools.product(*test_hyperparameters.values())
]

# Per parameter: name, number of candidates, running number of combinations, candidates.
a = 1
for k, v in test_hyperparameters.items():
    a *= len(v)
    print(k, len(v), a, v)

print('\nNumber of tests:', a)

learning_rate 1 1 [4.076831342095183e-05]
num_train_epochs 1 1 [3]
per_device_train_batch_size 1 1 [4]
gradient_accumulation_steps 1 1 [2]
eval_accumulation_steps 1 1 [2]

Number of tests: 1


In [24]:
# List every parameter combination with its position in params_list.
for idx, par in enumerate(params_list):
    print(idx, par)

0 {'learning_rate': 4.076831342095183e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': 2}


In [None]:
# Train every combination in params_list, reporting to Weights & Biases, and
# write the collected results under checkpoints/.
test_with_checkpoints(params_list=params_list,
                        output_name='SWNM-dataset',
                        dataset=dataset,
                        label_names=label_names,
                        metric=metric,
                        entities_names=entities_names,
                        output_dir="checkpoints/",
                        step=0.05,
                        use_wandb=True,
                        wandb_config=wandb_config,)