# Imports and Functions

In [None]:
# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

In [None]:
# Install the third-party packages this notebook imports.
# These are shell-style installs with no version pins, so the resolved
# versions depend on when the cell is run.
!pip install datasets -q
!pip install numpyencoder
!pip install seqeval -q
!pip install tokenizers -q
!pip install transformers -q
#!pip install optuna -q
# -U upgrades ray[tune] and wandb if they are already installed; -q keeps pip quiet.
!pip install -Uq ray[tune] wandb

In [None]:
import os
import itertools
import pathlib
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import time
from numpyencoder import NumpyEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import load_metric
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune.logger import DEFAULT_LOGGERS
from ray import tune
from ray.tune.integration.wandb import WandbLoggerCallback
import wandb

In [None]:
# Dataset functions
def pandas2json(df, fname: str):
    """Write the 'text' and 'tags' columns of a dataframe as JSON Lines.

    One JSON object of the form {"text": ..., "tags": ...} is written per
    dataframe row, each followed by a newline, so the output is a sequence of
    JSON documents rather than a single JSON array.

    Args:
        df (pd.DataFrame): dataframe providing the 'text' and 'tags' columns.
        fname (str): path of the file to create (overwritten if it exists).
    """
    records = [
        {"text": df['text'].iloc[row], "tags": df['tags'].iloc[row]}
        for row in range(len(df))
    ]

    with open(fname, 'w', encoding='utf8') as out:
        for record in records:
            # ensure_ascii=False keeps accented characters readable in the file.
            json.dump(record, out, ensure_ascii=False)
            out.write('\n')


def json2dict(fname: str, mode='r', encoding='utf8'):
    """Parse the single JSON document stored in ``fname``.

    Args:
        fname (str): path of the JSON file.
        mode (str): mode passed to ``open``.
        encoding (str): text encoding passed to ``open``.

    Returns:
        The decoded JSON value (a dict, list, ... depending on the file).
    """
    with open(fname, mode, encoding=encoding) as source:
        return json.load(source)


def dict2json(data: list, fname: str,
                sort_keys=False, indent=None):
    """Serialise ``data`` to ``fname`` as one JSON document.

    Args:
        data (list[dict]): data in NM format:
            {'text': str,
            'entities': list[{'start': int, 'end': int, 'label': str, 'value': str}],
            'anottation_status': str,
            'notes': str}
        fname (str): output file (overwritten if it exists).
        sort_keys (bool): forwarded to ``json.dump``.
        indent (int | None): forwarded to ``json.dump``.
    """
    with open(fname, 'w', encoding='utf8') as out:
        # NumpyEncoder is the encoder class imported from the numpyencoder package.
        dump_options = {
            'ensure_ascii': False,
            'sort_keys': sort_keys,
            'indent': indent,
            'cls': NumpyEncoder,
        }
        json.dump(data, out, **dump_options)

In [None]:
# Tokenizing functions
def get_ent_label(entity_name: str) -> int:
    """Return the integer label used for the first token of an entity.

    Args:
        entity_name (str): entity label as found in the annotated data.

    Returns:
        int: 1 for 'CABECALHO', 3 for 'SUBCABECALHO', 5 for any other name.
    """
    if entity_name == 'CABECALHO':
        return 1
    if entity_name == 'SUBCABECALHO':
        return 3
    # Any name other than the two above shares the last label.
    return 5


def create_label_vector(doc, input_ids, tokenizer):
    """Build the per-token label vector for one tokenized window of ``doc``.

    Each entity's text is tokenized on its own and its token ids (without the
    first and last id the tokenizer adds) are searched for inside
    ``input_ids``. The first position where they occur gets the entity's label
    on its first token and that label + 1 on the remaining tokens.

    NOTE(review): matching is done on token content, not on the entity's
    character offsets, and stops at the first hit, so an entity whose text
    also appears earlier in the window is labelled at that earlier position.

    Args:
        doc (dict): document with 'text' (str) and 'entities' (list of dicts
            carrying 'start', 'end' and 'label').
        input_ids (list[int]): token ids of the window being labelled.
        tokenizer: callable used as ``tokenizer(text, is_split_into_words=False)``
            whose result exposes the token ids under 'input_ids'.

    Returns:
        list[float]: one label per entry of ``input_ids`` (floats, because the
        vector is built with ``np.zeros``); 0 for unlabelled tokens and -100 for
        tokens whose id is 101 or 102.
    """
    ids = list(input_ids)
    # Size the vector from the window itself rather than a fixed 512, so the
    # labels always line up with the ids whatever max_length the caller used.
    labels = np.zeros(len(ids))

    for entity in doc['entities']:
        begin_label = get_ent_label(entity['label'])
        surface = doc['text'][entity['start']:entity['end']]
        entity_ids = tokenizer(surface, is_split_into_words=False)['input_ids']

        # Drop the leading/trailing ids added around the entity text.
        core = list(entity_ids[1:-1])
        if not core:
            # Empty entity text: nothing to locate in the window.
            continue

        span = len(core)
        for pos in range(len(ids) - span + 1):
            if ids[pos:pos + span] == core:
                labels[pos] = begin_label
                labels[pos + 1:pos + span] = begin_label + 1
                break

    # Ids 101 and 102 are excluded from loss/metrics via the -100 label.
    # Presumably these are the [CLS]/[SEP] ids of the BERT vocabulary in use;
    # confirm before switching to a different tokenizer.
    for pos, token_id in enumerate(ids):
        if token_id == 101 or token_id == 102:
            labels[pos] = -100

    return labels.tolist()


def tokenize_dataset(dataset, tokenizer, stride=0):
    """Tokenize every document and attach a label vector to each encoding.

    Each document's text is encoded with padding and truncation to 512 tokens
    and ``return_overflowing_tokens=True``, so a single document may yield
    several encodings. One output dict is produced per entry of the encoding's
    'overflow_to_sample_mapping'.

    Args:
        dataset: iterable of documents (dicts with at least 'text', plus
            whatever ``create_label_vector`` reads from them).
        tokenizer: tokenizer callable; also forwarded to ``create_label_vector``.
        stride (int): ``stride`` value forwarded to the tokenizer.

    Returns:
        list[dict]: dicts with 'input_ids', 'attention_mask' and 'labels'.
    """
    windows = []
    for doc in dataset:
        encoded = tokenizer(
            doc['text'],
            padding='max_length',
            truncation=True,
            stride=stride,
            max_length=512,
            is_split_into_words=False,
            return_overflowing_tokens=True,
        )

        window_count = len(encoded['overflow_to_sample_mapping'])
        windows.extend(
            {
                'input_ids': encoded.input_ids[i],
                'attention_mask': encoded.attention_mask[i],
                'labels': create_label_vector(doc, encoded.input_ids[i], tokenizer),
            }
            for i in range(window_count)
        )

    return windows

# Dataset and globals

In [None]:
# Download the train and dev splits by file id with gdown.
# The full-dataset download is left disabled.
#!gdown "1XYdcOxnr-esES8bwKezTb6MYMfhQQriW" # Full dataset
!gdown "11Je1XKm0xsJE1K6sF0a7wm2FH3UWj2up" # Train
!gdown "1qJvpBiCBQKTW3zka5S5VrMPG_n2eolZY" # Dev

Downloading...
From: https://drive.google.com/uc?id=11Je1XKm0xsJE1K6sF0a7wm2FH3UWj2up
To: /content/NM_dataset-train.json
100% 8.66M/8.66M [00:00<00:00, 30.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1qJvpBiCBQKTW3zka5S5VrMPG_n2eolZY
To: /content/NM_dataset-dev.json
100% 2.91M/2.91M [00:00<00:00, 137MB/s]


In [None]:
# globals
# Absolute path (with trailing slash) of the directory holding the downloaded files.
current_dir = f"{pathlib.Path('NM_dataset-train.json').parent.resolve()}/"
train_path = f"{current_dir}NM_dataset-train.json"
dev_path = f"{current_dir}NM_dataset-dev.json"

# Raw annotated documents; later cells tokenize these.
treino = list(json2dict(train_path))
teste = list(json2dict(dev_path))

tokenizer = AutoTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    do_lower_case=False,
)
metric = load_metric("seqeval")

# NOTE(review): 'COMECO RECORTE' is spelled with a space here but with an
# underscore in label_names below; confirm which form the data uses.
entities_names = ['COMECO RECORTE', 'CABECALHO', 'SUBCABECALHO']

# Label id -> tag name, ids 0..6 in list order.
label_names = dict(enumerate([
    'O',
    'B-CABECALHO',
    'I-CABECALHO',
    'B-SUBCABECALHO',
    'I-SUBCABECALHO',
    'B-COMECO_RECORTE',
    'I-COMECO_RECORTE',
]))

# Value passed as ``stride`` to the tokenizer when the datasets are tokenized.
stride = 256

# Trainer

In [None]:
def return_metrics(trainer) -> dict:
    """Predict on the dev set with ``trainer`` and return the metric report.

    The id -> tag mapping and the metric object are the notebook-level
    ``label_names`` and ``metric``, the same ones ``compute_metrics`` uses.
    They are not read from ``trainer``: a Hugging Face ``Trainer`` uses its
    ``label_names`` attribute for the names of the label *inputs* and has no
    ``metric`` attribute.

    Args:
        trainer: object whose ``predict(dataset)`` returns a
            ``(predictions, labels, metrics)`` triple; predictions are reduced
            with ``argmax`` over axis 2.

    Returns:
        dict: whatever ``metric.compute`` returns for the decoded sequences.
    """
    logits, gold_ids, _ = trainer.predict(teste)
    pred_ids = np.argmax(logits, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Positions labelled -100 (ignored index / special tokens) are dropped
        # from both the predictions and the references.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_names[pred] for pred, _ in kept])
        true_labels.append([label_names[gold] for _, gold in kept])

    return metric.compute(predictions=true_predictions, references=true_labels)

    
def model_init():
    """Load a fresh token-classification model from the pretrained checkpoint.

    Passed to ``Trainer`` as ``model_init``. The model is created with 7
    labels, the number of entries in ``label_names``.
    """
    checkpoint = "neuralmind/bert-base-portuguese-cased"
    return AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=7)

def compute_metrics(p):
    """Turn raw predictions into overall precision / recall / F1.

    Args:
        p: pair ``(predictions, labels)``; predictions are reduced with
            ``argmax`` over axis 2 to obtain one label id per token.

    Returns:
        dict: 'precision', 'recall' and 'f1', taken from the 'overall_*'
        entries of ``metric.compute``. Accuracy is not reported.
    """
    logits, gold_ids = p
    pred_ids = np.argmax(logits, axis=2)

    decoded_predictions = []
    decoded_references = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions whose gold label is not the ignored index (-100).
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        decoded_predictions.append([label_names[pred] for pred, _ in kept])
        decoded_references.append([label_names[gold] for _, gold in kept])

    scores = metric.compute(predictions=decoded_predictions, references=decoded_references)

    summary = {}
    for short_name in ("precision", "recall", "f1"):
        summary[short_name] = scores["overall_" + short_name]
    return summary


In [None]:
# Tokenization
# `treino` / `teste` are rebound from raw documents to tokenized windows.
# Tokenized windows carry 'input_ids' and no longer have a 'text' key, so the
# guards below let this cell be re-run without crashing inside tokenize_dataset.
if treino and 'input_ids' not in treino[0]:
    treino = tokenize_dataset(treino, tokenizer,
                                stride=stride)
if teste and 'input_ids' not in teste[0]:
    teste = tokenize_dataset(teste, tokenizer,
                                stride=stride)

data_collator = DataCollatorForTokenClassification(tokenizer)

# Starting hyperparameters for the Trainer.
hyperparameters={
    'learning_rate': 4.076831342095183e-05,
    'num_train_epochs': 3,
    'per_device_train_batch_size': 4
}

batch_size = hyperparameters['per_device_train_batch_size']
# Never let this reach 0 (dataset smaller than one batch): step-based logging
# needs a positive interval.
logging_steps = max(1, len(treino) // batch_size)
epochs = hyperparameters['num_train_epochs']

training_args = TrainingArguments(
    output_dir = "results",
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    evaluation_strategy = "epoch",
    metric_for_best_model = "f1",
    disable_tqdm = False,
    logging_steps = logging_steps,
    gradient_accumulation_steps = 2,
    eval_accumulation_steps = 2,
    learning_rate = hyperparameters['learning_rate'],
)

# model_init (rather than a model instance) is passed so the Trainer can build
# a fresh model itself, which hyperparameter_search relies on.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=treino,
    eval_dataset=teste,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# PBT

In [None]:
def my_objective(metrics):
    """Objective maximised by the search: the evaluation F1 ('eval_f1')."""
    return metrics["eval_f1"]

pbt_scheduler = PopulationBasedTraining(
    time_attr='training_iteration',
    # The scheduler takes the *name* of a reported metric, not a callable.
    # The objective function is handed to hyperparameter_search below instead.
    metric="eval_f1",
    mode='max',
    # NOTE(review): with time_attr='training_iteration' this means "every 600
    # reported iterations". If 600 seconds was intended, time_attr should
    # presumably be 'time_total_s' - confirm.
    perturbation_interval=600.0,
    hyperparam_mutations={
        "learning_rate": tune.loguniform(6e-6, 1e-3),
        "num_train_epochs": tune.choice(range(5, 15)),
        # "seed": tune.choice(range(1, 41)),
        "per_device_train_batch_size": tune.choice([4, 8, 16]),
    })

# Credentials must not live in the notebook. wandb reads the API key from the
# WANDB_API_KEY environment variable (or prompts for it), so export it before
# running this cell. The key that used to be hard-coded here has been exposed
# and should be revoked.
# `project` and `entity` are wandb.init arguments, not run-config entries.
# NOTE(review): 'log_config' is kept in the run config as before; it looks
# like an option meant for Ray's W&B logger - confirm whether it is needed.
wandb.init(
    reinit=True,
    project="PBT_Optimization_Project",
    entity="chinagab",
    config={"log_config": True},
)

best_trial = trainer.hyperparameter_search(
    backend="ray",
    direction="maximize",
    compute_objective=my_objective,
    keep_checkpoints_num=1,
    scheduler=pbt_scheduler,
)

In [None]:
# Display the trainer's current TrainingArguments.
trainer.args

In [None]:
# Display the result returned by the hyperparameter search.
best_trial

In [None]:
# Values applied to the trainer for the final training run.
hyperparameters = dict(
    learning_rate=4.076831342095183e-05,
    num_train_epochs=12,
    per_device_train_batch_size=4,
)

In [None]:
# Copy every entry of `hyperparameters` onto the existing TrainingArguments,
# echoing each name as it is applied, then train with the updated arguments.
for n in hyperparameters:
    v = hyperparameters[n]
    print(n)
    setattr(trainer.args, n, v)

trainer.train()

In [None]:
# Evaluate the trained model on the dev set and show the complete metric report.
predictions, labels, _ = trainer.predict(teste)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens): positions labelled -100 are dropped
# from both the predicted and the reference sequences.
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

results