<a href="https://colab.research.google.com/github/FabianaAndrade/ner-pii/blob/main/treino_ner_pii.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title
!pip install -U transformers seqeval faker

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from sklearn import preprocessing
from seqeval.metrics import precision_score, recall_score, f1_score

In [None]:
device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
print("device: ", device)

In [None]:
df_names = pd.read_csv('/content/all-pt-br-names.csv')
coluna = df_names.columns[1]
df_names[coluna].dropna().to_csv("nomes.txt", index=False, header=False)

In [None]:
print("numero de exemplos: ", len(df_names))

In [None]:
def load_lexicon(path):
    with open(path, encoding="utf-8") as f:
        return [l.strip() for l in f if l.strip()]
sobrenomes = load_lexicon("/content/portuguese_surnames.txt")
nomes = load_lexicon("/content/nomes.txt")

In [None]:
import random
import json
from faker import Faker

NUM_EXAMPLES = 5000
VAL_RATIO = 0.1
OUTPUT_TRAIN = "train.json"
OUTPUT_VAL = "validation.json"

fake = Faker("pt_BR")

TEMPLATES = [
    "Ontem encontrei {nome}.",
    "O {nome} foi ao mercado.",
    "A {nome} chegou cedo.",
    "Hoje {nome} está no trabalho.",
    "Você conhece {nome}?",
    "{nome} comprou um carro novo.",
    "Falei com {nome} ontem.",
    "O relatório foi entregue por {nome}.",
    "A mãe de {nome} chegou agora.",
    "Ontem vi {nome} no shopping.",
    "{nome} e {nome2} viajaram juntos.",
    "Fui ao cinema com {nome}.",
    "O {nome} e {nome2} participaram da reunião.",
    "{nome} mora em {cidade}.",
    "{nome} enviou uma mensagem para {nome2} ontem."
]

def tokenize(sentence):
    return sentence.replace(".", " .").replace(",", " ,").split()

dataset = []

for _ in range(NUM_EXAMPLES):
    nome = random.choice(nomes).title()
    if random.random() < 0.5:
        nome += " " + random.choice(sobrenomes).title()

    nome2 = None
    if random.random() < 0.3:
        nome2 = random.choice(nomes).title()
        if random.random() < 0.5:
            nome2 += " " + random.choice(sobrenomes).title()

    cidade = fake.city()

    template = random.choice(TEMPLATES)
    sentence = template.format(nome=nome, nome2=nome2 or "", cidade=cidade)
    sentence = " ".join(sentence.split())
    tokens = tokenize(sentence)

    tags = ["O"] * len(tokens)

    nome_tokens = nome.split()
    try:
        start_idx = tokens.index(nome_tokens[0])
        for i, t in enumerate(nome_tokens):
            tags[start_idx + i] = "B-PER" if i == 0 else "I-PER"
    except ValueError:
        pass

    if nome2:
        nome2_tokens = nome2.split()
        try:
            start_idx2 = tokens.index(nome2_tokens[0])
            for i, t in enumerate(nome2_tokens):
                tags[start_idx2 + i] = "B-PER" if i == 0 else "I-PER"
        except ValueError:
            pass

    dataset.append({"tokens": tokens, "tags": tags})

random.shuffle(dataset)
n_val = int(len(dataset) * VAL_RATIO)
val = dataset[:n_val]
train = dataset[n_val:]

with open(OUTPUT_TRAIN, "w", encoding="utf-8") as f:
    json.dump(train, f, indent=2, ensure_ascii=False)
with open(OUTPUT_VAL, "w", encoding="utf-8") as f:
    json.dump(val, f, indent=2, ensure_ascii=False)

print(f"Gerado {len(train)} frases de treino / {len(val)} frases de validação")


In [52]:
train

[{'tokens': ['Você', 'conhece', 'Kewry?'], 'tags': ['O', 'O', 'O']},
 {'tokens': ['Hoje', 'Vadenildo', 'está', 'no', 'trabalho', '.'],
  'tags': ['O', 'B-PER', 'O', 'O', 'O', 'O']},
 {'tokens': ['A', 'mãe', 'de', 'Nauani', 'Madeira', 'chegou', 'agora', '.'],
  'tags': ['O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O']},
 {'tokens': ['Fui', 'ao', 'cinema', 'com', 'Eleude', '.'],
  'tags': ['O', 'O', 'O', 'O', 'B-PER', 'O']},
 {'tokens': ['Falei', 'com', 'Anatil', 'Vilarinho', 'ontem', '.'],
  'tags': ['O', 'O', 'B-PER', 'I-PER', 'O', 'O']},
 {'tokens': ['A', 'Anadilia', 'Meira', 'chegou', 'cedo', '.'],
  'tags': ['O', 'B-PER', 'I-PER', 'O', 'O', 'O']},
 {'tokens': ['Você', 'conhece', 'Sirso?'], 'tags': ['O', 'O', 'O']},
 {'tokens': ['O', 'Dayane', 'foi', 'ao', 'mercado', '.'],
  'tags': ['O', 'B-PER', 'O', 'O', 'O', 'O']},
 {'tokens': ['O',
   'Cleudsom',
   'Barroqueiro',
   'e',
   'participaram',
   'da',
   'reunião',
   '.'],
  'tags': ['O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O']

In [None]:
from datasets import load_dataset
dataset = load_dataset("json", data_files={"train": "/content/train.json"})

dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
dataset = {
    "train": dataset["train"],
    "validation": dataset["test"]
}

In [None]:
# @title
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', use_fast=True)
LABELS = ["O", "B-PER", "I-PER"]
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"  #BERTimbau
label2id = {label: i for i, label in enumerate(LABELS)}
id2label = {i: label for i, label in enumerate(LABELS)}

In [None]:

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                if label[word_idx].startswith("B-"):
                    label_ids.append(label2id["I-PER"])
                else:
                    label_ids.append(label2id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = {}
tokenized_datasets["train"] = dataset["train"].map(tokenize_and_align_labels, batched=True)
tokenized_datasets["validation"] = dataset["validation"].map(tokenize_and_align_labels, batched=True)

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id
)


In [None]:

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [
        [id2label[l] for l in label if l != -100] for label in labels
    ]
    true_predictions = [
        [id2label[pred] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert-ner-pt",
    do_train=True,
    do_eval=True,
    eval_steps=500,
    logging_steps=200,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics
)

trainer.train()

#691359bd154d080a0237473b0c555d1dccb4eda5

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model("./bert-ner-pt")

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

base_model = "neuralmind/bert-base-portuguese-cased"
model_path = "/content/bert-ner-pt/checkpoint-1521"

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForTokenClassification.from_pretrained(model_path)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [53]:
text = "Fabiana foi a escola"
print(ner_pipeline(text))

[{'entity_group': 'PER', 'score': np.float32(0.99995244), 'word': 'Fabiana', 'start': 0, 'end': 7}]


In [54]:
text = "Andre and"
print(ner_pipeline(text))

[{'entity_group': 'PER', 'score': np.float32(0.9998152), 'word': 'Andre and', 'start': 0, 'end': 9}]


In [55]:
text = "rua rua"
print(ner_pipeline(text))

[]
