# BERT Tutorial

## 1. Instalação



In [1]:
!pip install torch
!pip install transformers
!pip install pandas



## 2. Bibliotecas

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## 3. Ingestão

In [3]:
# Carrega o dataset
df = pd.read_csv('meus-dados-UTF8.csv', sep=';')
df.head()

Unnamed: 0,texto,sentimento
0,Confira os resultados dos nossos fundos no mês...,NEUTRAL
1,A Alvarez & Marsal estará conosco no Sportainm...,NEUTRAL
2,#Repost btgpactual with make_repost ・・・ Entend...,NEUTRAL
3,Minuto touro de ouro,POSITIVE
4,@ricktolledo Sim,NEUTRAL


## 4. Preparação dos dados

In [4]:
# Mapeia os sentimentos para valores numéricos
df['sentimento'] = df['sentimento'].map({'POSITIVE': 0, 'NEUTRAL': 1, 'NEGATIVE': 2})
df.head()

Unnamed: 0,texto,sentimento
0,Confira os resultados dos nossos fundos no mês...,1
1,A Alvarez & Marsal estará conosco no Sportainm...,1
2,#Repost btgpactual with make_repost ・・・ Entend...,1
3,Minuto touro de ouro,0
4,@ricktolledo Sim,1


In [5]:
# Divide o dataset em treino e teste
train_text, temp_text, train_labels, temp_labels = train_test_split(df['texto'], df['sentimento'],
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=df['sentimento'])

## 5. Tokenização

In [6]:
# Carrega o BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
tokenizer

BertTokenizer(name_or_path='neuralmind/bert-base-portuguese-cased', vocab_size=29794, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [7]:
# Tokeniza os datasets
train_text = tokenizer(list(train_text), padding=True, truncation=True, max_length=256, return_tensors="pt")
temp_text = tokenizer(list(temp_text), padding=True, truncation=True, max_length=256, return_tensors="pt")

# 6. DataLoaders

In [8]:
# Converte labels para tensores
train_labels = torch.tensor(list(train_labels))
temp_labels = torch.tensor(list(temp_labels))

# Cria o TensorDataset e defina o DataLoader
train_data = TensorDataset(train_text['input_ids'], train_text['attention_mask'], train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(temp_text['input_ids'], temp_text['attention_mask'], temp_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

# 7. Criação do Modelo

In [9]:
# Carrega o modelo BERT
model = BertForSequenceClassification.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    num_labels = 3, # Número de classes de saída (sentimentos)
    output_attentions = False,
    output_hidden_states = False,
)

# Move o modelo para GPU, se disponível
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

# 8. Treinamento



In [10]:
# Define a taxa de aprendizado
learning_rate = 2e-5
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
epochs = 4

In [11]:
for _ in range(epochs):
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print('Loss: ', avg_train_loss)

## 9. Avaliação do Modelo

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)

In [None]:
model.eval()

# Listas para armazenar previsões e rótulos verdadeiros
predictions, true_labels = [], []

eval_loss = 0
nb_eval_steps = 0

for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Armazena as previsões e rótulos verdadeiros
    batch_preds = np.argmax(logits, axis=1)
    predictions.extend(batch_preds)
    true_labels.extend(label_ids)

    # Calcula a acurácia para este lote de dados de teste e acumule-a sobre todos os lotes.
    eval_loss += flat_accuracy(logits, label_ids)
    nb_eval_steps += 1

print('Acurácia: ', eval_loss/nb_eval_steps)

# Calcula a matriz de confusão
conf_mat = confusion_matrix(true_labels, predictions)

# Imprime a matriz de confusão
print('Matriz de Confusão:')
print(conf_mat)


# Calcula e imprime a precisão, recall e F1
report = classification_report(true_labels, predictions, target_names=['POSITIVE', 'NEUTRAL', 'NEGATIVE'])
print(report)


Acurácia:  0.8633152173913043
Matriz de Confusão:
[[1126  130   73]
 [ 142 1387   50]
 [  50   58  642]]
              precision    recall  f1-score   support

    POSITIVE       0.85      0.85      0.85      1329
     NEUTRAL       0.88      0.88      0.88      1579
    NEGATIVE       0.84      0.86      0.85       750

    accuracy                           0.86      3658
   macro avg       0.86      0.86      0.86      3658
weighted avg       0.86      0.86      0.86      3658



# 10. Persistência do Modelo

In [None]:
# Salva o modelo inteiro
torch.save(model, '/content/drive/MyDrive/INTELI/model.pt')

# 11. Uso do Modelo

In [None]:
# Carrega o modelo inteiro
meu_modelo = torch.load('/content/drive/MyDrive/INTELI/model.pt')

# 12. Previsões

In [None]:
# Criado anteriormente
# tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

In [None]:
def predict(model, tokenizer, sentence):
    model.eval()

    inputs = tokenizer.encode_plus(
        sentence,
        None,
        add_special_tokens=True,
        max_length=200,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True
    )

    ids = inputs['input_ids']
    mask = inputs['attention_mask']

    ids = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    mask = torch.tensor(mask, dtype=torch.long).unsqueeze(0)

    ids = ids.to(device)
    mask = mask.to(device)

    with torch.no_grad():
        outputs = model(ids, token_type_ids=None, attention_mask=mask)

    outputs = outputs[0].detach().cpu().numpy()
    predict_code = np.argmax(outputs, axis=1)[0]

    inversed_map = {0: 'POSITIVE', 1: 'NEUTRAL', 2: 'NEGATIVE'}
    return inversed_map[predict_code]

In [None]:
# Testa com uma sentença de exemplo
sentença = "Estou horrível hoje!"

# Realiza a previsão
predict(model, tokenizer, sentença)

'NEGATIVE'