<a href="https://colab.research.google.com/github/FabianaAndrade/ner-pii/blob/main/treino_ner_pii.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers seqeval faker

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faker
  Downloading faker-37.11.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.11.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=f62af457023ea918ca0c1006cb6329c45a9a11936f5ce6c7d7cd094033853294
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: faker, seqeval
Successfully installed faker-37.11.0 seqeval-1.2.2


In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from sklearn import preprocessing
from seqeval.metrics import precision_score, recall_score, f1_score

In [None]:
# Prefer the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("device: ", device)

device:  cuda:0


In [None]:
# Read the raw name table and export its second column (without missing
# values) as a one-name-per-line text file.
df_names = pd.read_csv('/content/all-pt-br-names.csv')
coluna = df_names.columns[1]
nomes_sem_nulos = df_names[coluna].dropna()
nomes_sem_nulos.to_csv("nomes.txt", index=False, header=False)

In [None]:
# Row count of the raw CSV table. df_names itself is never filtered, so rows
# whose name column is missing are still counted here.
print("numero de exemplos: ", len(df_names))

numero de exemplos:  103453


In [None]:
def load_lexicon(path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        path: Location of a text file with one entry per line.

    Returns:
        A list with every line stripped of surrounding whitespace; lines that
        are empty after stripping are left out. File order is preserved.
    """
    entries = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                entries.append(cleaned)
    return entries
# Lexicons sampled by the synthetic sentence generator: surnames come from a
# pre-existing file, first names from the nomes.txt export.
sobrenomes = load_lexicon("/content/portuguese_surnames.txt")
nomes = load_lexicon("/content/nomes.txt")

In [None]:
import random
import json
from faker import Faker

# --- Synthetic corpus configuration ---------------------------------------
NUM_EXAMPLES = 5000              # number of sentences to generate
VAL_RATIO = 0.1                  # fraction of generated sentences written to OUTPUT_VAL
OUTPUT_TRAIN = "train.json"      # training examples (list of {"tokens", "tags"})
OUTPUT_VAL = "validation.json"   # held-out examples, same format

# Brazilian-Portuguese Faker instance; used to draw city names.
fake = Faker("pt_BR")

# Sentence templates filled with str.format. Placeholders:
#   {nome}   - person name (always provided)
#   {nome2}  - second person name (only some templates use it)
#   {cidade} - city name; it is a location, not a person
TEMPLATES = [
    "Ontem encontrei {nome}.",
    "O {nome} foi ao mercado.",
    "A {nome} chegou cedo.",
    "Hoje {nome} está no trabalho.",
    "Você conhece {nome}?",
    "{nome} comprou um carro novo.",
    "Falei com {nome} ontem.",
    "O relatório foi entregue por {nome}.",
    "A mãe de {nome} chegou agora.",
    "Ontem vi {nome} no shopping.",
    "{nome} e {nome2} viajaram juntos.",
    "Fui ao cinema com {nome}.",
    "O {nome} e {nome2} participaram da reunião.",
    "{nome} mora em {cidade}.",
    "{nome} enviou uma mensagem para {nome2} ontem."
]

def tokenize(sentence):
    """Split a sentence into whitespace-delimited tokens, detaching punctuation.

    Each of the marks . , ? ! ; : becomes a token of its own, so a word
    directly followed by punctuation (for example "Kewry?") no longer stays
    fused with it. Results for sentences that only contain "." and "," are
    unchanged from the previous implementation.

    Args:
        sentence: The text to tokenize.

    Returns:
        A list of token strings; an empty list for empty or blank input.
    """
    for mark in ".,?!;:":
        # Put a space before the mark so str.split() isolates it.
        sentence = sentence.replace(mark, " " + mark)
    return sentence.split()

import string

# Seed both random sources so the generated corpus is reproducible.
SEED = 42
random.seed(SEED)
fake.seed_instance(SEED)

# Template placeholders whose values are person names (tagged B-PER / I-PER).
PERSON_FIELDS = ("nome", "nome2")


def _random_person():
    """Return a title-cased first name, followed by a surname half of the time."""
    person = random.choice(nomes).title()
    if random.random() < 0.5:
        person += " " + random.choice(sobrenomes).title()
    return person


def _template_fields(template):
    """Return the set of placeholder names that appear in `template`."""
    return {field for _, field, _, _ in string.Formatter().parse(template) if field}


def _render_example(template, values):
    """Fill `template` with `values` and return aligned (tokens, tags) lists.

    The template is walked piece by piece, so every tag is assigned at the
    exact position where a placeholder is substituted. This avoids searching
    the finished sentence for the name, which could match the wrong token or
    miss a name fused with punctuation such as "Kewry?".
    """
    tokens, tags = [], []
    for literal, field, _, _ in string.Formatter().parse(template):
        literal_tokens = tokenize(literal)
        tokens.extend(literal_tokens)
        tags.extend(["O"] * len(literal_tokens))
        if field is None:
            continue
        value_tokens = tokenize(values[field])
        tokens.extend(value_tokens)
        if value_tokens and field in PERSON_FIELDS:
            tags.extend(["B-PER"] + ["I-PER"] * (len(value_tokens) - 1))
        else:
            # Non-person placeholders (e.g. the city) stay outside any entity.
            tags.extend(["O"] * len(value_tokens))
    return tokens, tags


dataset = []

for _ in range(NUM_EXAMPLES):
    template = random.choice(TEMPLATES)
    needed = _template_fields(template)

    # Only draw the values this template uses; a template that mentions
    # {nome2} always gets a second name, so no sentence is left with a gap
    # such as "O X e participaram da reunião.".
    values = {"nome": _random_person()}
    if "nome2" in needed:
        values["nome2"] = _random_person()
    if "cidade" in needed:
        values["cidade"] = fake.city()

    tokens, tags = _render_example(template, values)
    assert len(tokens) == len(tags), "tokens and tags must stay aligned"

    dataset.append({"tokens": tokens, "tags": tags})

# Shuffle, then hold out the first VAL_RATIO share as validation data.
random.shuffle(dataset)
n_val = int(len(dataset) * VAL_RATIO)
val = dataset[:n_val]
train = dataset[n_val:]

with open(OUTPUT_TRAIN, "w", encoding="utf-8") as f:
    json.dump(train, f, indent=2, ensure_ascii=False)
with open(OUTPUT_VAL, "w", encoding="utf-8") as f:
    json.dump(val, f, indent=2, ensure_ascii=False)

print(f"Gerado {len(train)} frases de treino / {len(val)} frases de validação")


Gerado 4500 frases de treino / 500 frases de validação


In [None]:
# Preview a handful of generated examples instead of dumping the whole list.
train[:5]

[{'tokens': ['Você', 'conhece', 'Kewry?'], 'tags': ['O', 'O', 'O']},
 {'tokens': ['Hoje', 'Vadenildo', 'está', 'no', 'trabalho', '.'],
  'tags': ['O', 'B-PER', 'O', 'O', 'O', 'O']},
 {'tokens': ['A', 'mãe', 'de', 'Nauani', 'Madeira', 'chegou', 'agora', '.'],
  'tags': ['O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O']},
 {'tokens': ['Fui', 'ao', 'cinema', 'com', 'Eleude', '.'],
  'tags': ['O', 'O', 'O', 'O', 'B-PER', 'O']},
 {'tokens': ['Falei', 'com', 'Anatil', 'Vilarinho', 'ontem', '.'],
  'tags': ['O', 'O', 'B-PER', 'I-PER', 'O', 'O']},
 {'tokens': ['A', 'Anadilia', 'Meira', 'chegou', 'cedo', '.'],
  'tags': ['O', 'B-PER', 'I-PER', 'O', 'O', 'O']},
 {'tokens': ['Você', 'conhece', 'Sirso?'], 'tags': ['O', 'O', 'O']},
 {'tokens': ['O', 'Dayane', 'foi', 'ao', 'mercado', '.'],
  'tags': ['O', 'B-PER', 'O', 'O', 'O', 'O']},
 {'tokens': ['O',
   'Cleudsom',
   'Barroqueiro',
   'e',
   'participaram',
   'da',
   'reunião',
   '.'],
  'tags': ['O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O']

In [None]:
from datasets import load_dataset
# Load the generated training file, then carve a 10% validation split out of
# it with a fixed seed. The result is a plain dict keyed by split name.
train_file = load_dataset("json", data_files={"train": "/content/train.json"})["train"]
split_parts = train_file.train_test_split(test_size=0.1, seed=42)
dataset = {"train": split_parts["train"], "validation": split_parts["test"]}

In [None]:
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"  #BERTimbau
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Tag set used throughout the notebook; list position is the class id.
LABELS = ["O", "B-PER", "I-PER"]
id2label = dict(enumerate(LABELS))
label2id = {label: idx for idx, label in id2label.items()}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Args:
        examples: A batch with "tokens" (list of word lists) and "tags"
            (list of tag-string lists, parallel to "tokens").

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example. Positions with no source word (word id None)
        get -100. The first sub-word of a word gets the word's tag; later
        sub-words of the same word get the same tag, except that a "B-X" tag
        is continued as "I-X" so each entity has a single beginning.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    labels = []
    for i, word_tags in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                tag = word_tags[word_idx]
                if word_idx == previous_word_idx and tag.startswith("B-"):
                    # Continuation piece of a word that opens an entity:
                    # keep the entity type, switch the prefix to "I-".
                    tag = "I-" + tag[2:]
                label_ids.append(label2id[tag])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization/alignment function to both splits in batched mode.
tokenized_datasets = {
    split_name: dataset[split_name].map(tokenize_and_align_labels, batched=True)
    for split_name in ("train", "validation")
}

Map:   0%|          | 0/4050 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [None]:
# Pretrained encoder with a token-classification head sized to LABELS; the
# label maps are stored in the model config.
label_config = dict(
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, **label_config)


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

def compute_metrics(p):
    """Compute seqeval precision, recall and F1 for a batch of predictions.

    Args:
        p: A pair (predictions, labels). Predictions are reduced with argmax
            over axis 2; labels hold label ids, with -100 marking positions
            to ignore.

    Returns:
        A dict with "precision", "recall" and "f1".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        gold_tags = []
        predicted_tags = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                # Ignored position: skip it in both sequences.
                continue
            gold_tags.append(id2label[gold_id])
            predicted_tags.append(id2label[predicted_id])
        true_labels.append(gold_tags)
        true_predictions.append(predicted_tags)

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }

In [None]:
from transformers import TrainingArguments

# Training configuration.
# - eval_strategy="steps" is what makes eval_steps take effect; without it the
#   validation set is never evaluated during training.
# - fp16 is enabled only when a CUDA device is available, so the notebook can
#   still run on CPU.
training_args = TrainingArguments(
    output_dir="./bert-ner-pt",
    do_train=True,
    do_eval=True,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=200,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),
)


In [None]:
# Build the Trainer from the model, the training arguments and the tokenized
# splits, then fine-tune. compute_metrics supplies precision/recall/F1.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics
)

trainer.train()

# SECURITY(review): a 40-character hex string (presumably the W&B API key
# requested by the login prompt) was pasted here as a comment and has been
# redacted. Revoke that key; never store credentials in the notebook.

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfabianaandrad12[0m ([33mfabianaandrad12-studant[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
200,0.0663
400,0.0018
600,0.0002
800,0.0001
1000,0.0029
1200,0.0001
1400,0.0001


TrainOutput(global_step=1521, training_loss=0.009416440357304105, metrics={'train_runtime': 213.4795, 'train_samples_per_second': 56.914, 'train_steps_per_second': 7.125, 'total_flos': 793696074278400.0, 'train_loss': 0.009416440357304105, 'epoch': 3.0})

In [None]:
# Evaluate on the Trainer's eval_dataset (the "validation" split).
# NOTE(review): that split comes from the same template-generated data as the
# training set, so the scores presumably overstate real-world quality.
trainer.evaluate()

{'eval_loss': 3.801159982685931e-05,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 1.2224,
 'eval_samples_per_second': 368.128,
 'eval_steps_per_second': 23.724,
 'epoch': 3.0}

In [None]:
# Persist the fine-tuned model to the output directory.
trainer.save_model("./bert-ner-pt")

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

base_model = "neuralmind/bert-base-portuguese-cased"
# Load from the directory written by trainer.save_model rather than from a
# numbered checkpoint folder: the checkpoint step number changes with the
# dataset size, batch size and number of epochs.
model_path = "./bert-ner-pt"

# The tokenizer is loaded from the base checkpoint the model was fine-tuned from.
tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForTokenClassification.from_pretrained(model_path)

# aggregation_strategy="simple" merges sub-word predictions into entity spans.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cuda:0


In [None]:
# Sanity check: a short sentence that starts with a first name.
text = "Fabiana foi a escola"
print(ner_pipeline(text))

[{'entity_group': 'PER', 'score': np.float32(0.99995244), 'word': 'Fabiana', 'start': 0, 'end': 7}]


In [None]:
# Probe: a name followed by a word that is not part of it. In the recorded
# output the whole span "Andre and" is returned as PER, i.e. "and" is a
# false positive.
text = "Andre and"
print(ner_pipeline(text))

[{'entity_group': 'PER', 'score': np.float32(0.9998152), 'word': 'Andre and', 'start': 0, 'end': 9}]


In [None]:
# Negative probe: text without any person name; the recorded output is empty.
text = "rua rua"
print(ner_pipeline(text))

[]
