# BERTimbau

In [1]:
! pip install datasets transformers seqeval evaluate transformers[torch]



# Imports iniciais

In [1]:
model_checkpoint = "neuralmind/bert-large-portuguese-cased"

In [2]:
import random
import pandas as pd
from datasets import load_dataset, load_metric, ClassLabel, Features, Value
from IPython.display import display, HTML
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from transformers.trainer_callback import EarlyStoppingCallback
from evaluate import load

random.seed(42)

# Tratando os dados

In [3]:
PATH_DATASET = "../../datasets/conversas_filmes.csv"

In [4]:
df = pd.read_csv(PATH_DATASET)
df.head()

Unnamed: 0,Conversa,Intenção
0,Quais são os filmes em cartaz?,Assistir filme
1,Onde posso comprar ingressos para os filmes de...,Comprar ingresso
2,"Gostaria de saber mais sobre o filme ""Aventura...",Detalhes do filme
3,"Quando é a próxima sessão de ""Viagem Espacial""?",Checar sessões
4,Qual é o melhor filme em exibição agora?,Assistir filme


In [5]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,Conversa,Intenção
0,Quero saber mais sobre os detalhes técnicos na...,Detalhes do filme
1,"Estou a fim de um clássico,qual é o filme mais...",Assistir filme
2,Estou procurando por um filme romântico para a...,Assistir filme
3,"Estou curioso sobre os prêmios recebidos por ""...",Detalhes do filme
4,Gostaria de comprar ingressos para a sessão da...,Comprar ingresso


In [6]:
intetions_to_id = {list(df["Intenção"].unique())[i]: i for i in range(len(df["Intenção"].unique()))}

In [7]:
intetions_to_id

{'Detalhes do filme': 0,
 'Assistir filme': 1,
 'Comprar ingresso': 2,
 'Checar sessões': 3}

In [8]:
id_to_intentions = {it: key for key, it in intetions_to_id.items()}
id_to_intentions

{0: 'Detalhes do filme',
 1: 'Assistir filme',
 2: 'Comprar ingresso',
 3: 'Checar sessões'}

In [9]:
def to_dict(elements: list, hash:dict=None) -> dict:
    if hash:
        return {"text": [item for item in elements[0]], "labels": [hash[item] for item in elements[1]]}
    return {"text": [item for item in elements], "labels": [item for item in elements[1]]}

In [10]:
def to_csv(di:dict, filename:str) -> None:
    pd.DataFrame.from_dict(di).to_csv(filename, index=False)

In [12]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
to_csv(to_dict([
    df[:int(len(df)*0.6)]["Conversa"].values,
    df[:int(len(df)*0.6)]["Intenção"].values
], intetions_to_id), "../../datasets/class_train.csv")

to_csv(to_dict([
    df[int(len(df)*0.6):int(len(df)*0.8)]["Conversa"].values,
    df[int(len(df)*0.6):int(len(df)*0.8)]["Intenção"].values
], intetions_to_id), "../../datasets/class_test.csv")

to_csv(to_dict([
    df[int(len(df)*0.8):]["Conversa"].values,
    df[int(len(df)*0.8):]["Intenção"].values
], intetions_to_id), "../../datasets/class_val.csv")

In [14]:
data_files = {
    "train": "class_train.csv",
    "validation": "class_test.csv",
    "test": "class_val.csv"
}

In [15]:
ft = Features({
    "text": Value(dtype="string", id=None), 
    "labels": ClassLabel(num_classes=len(intetions_to_id), names=list(intetions_to_id.keys()), id=None)
})

In [16]:
datasets = load_dataset(path="../../datasets",
                        data_files=data_files,
                        features=ft)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [17]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 246
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 82
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 83
    })
})

In [18]:
datasets["train"][0]

{'text': 'Pode me dizer as sessões disponíveis para "Mistério nas Estrelas" nesta semana?',
 'labels': 3}

In [19]:
datasets["train"].features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['Detalhes do filme', 'Assistir filme', 'Comprar ingresso', 'Checar sessões'], id=None)}

# Preprocessamento

In [20]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [21]:
def preprocess_function(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True)
    return tokenized_inputs

In [22]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
tokenized_dataset = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/246 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/82 [00:00<?, ? examples/s]

Map:   0%|          | 0/83 [00:00<?, ? examples/s]

In [24]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 246
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 82
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 83
    })
})

In [25]:
tokenized_dataset["train"].features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['Detalhes do filme', 'Assistir filme', 'Comprar ingresso', 'Checar sessões'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [26]:
# tokenized_dataset = tokenized_dataset.remove_columns(["text", "token_type_ids", "attention_mask", "label"])

In [27]:
tokenized_dataset["train"].features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['Detalhes do filme', 'Assistir filme', 'Comprar ingresso', 'Checar sessões'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

# Fine-tuning

In [28]:
accuracy = load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [29]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=len(id_to_intentions), id2label=id_to_intentions, label2id=intetions_to_id
)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
training_args = TrainingArguments(
    output_dir="classificador_transformers",
    overwrite_output_dir=True,
    do_train = True,
    do_eval = True,
    do_predict = True,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=5,
    learning_rate=2e-5,
    label_names=["labels"],
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=50,
    weight_decay=0.01,
    save_total_limit=3,
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_checkpointing = False,
    eval_steps = 250,
    logging_steps = 250,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [31]:
trainer.train()

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.24578857421875, 'eval_accuracy': 0.5609756097560976, 'eval_runtime': 0.5055, 'eval_samples_per_second': 162.229, 'eval_steps_per_second': 1.978, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1415328979492188, 'eval_accuracy': 0.6219512195121951, 'eval_runtime': 0.5165, 'eval_samples_per_second': 158.77, 'eval_steps_per_second': 1.936, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.97991544008255, 'eval_accuracy': 0.7439024390243902, 'eval_runtime': 0.5035, 'eval_samples_per_second': 162.874, 'eval_steps_per_second': 1.986, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.914230465888977, 'eval_accuracy': 0.7560975609756098, 'eval_runtime': 0.4814, 'eval_samples_per_second': 170.324, 'eval_steps_per_second': 2.077, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.8426656126976013, 'eval_accuracy': 0.8170731707317073, 'eval_runtime': 0.4814, 'eval_samples_per_second': 170.323, 'eval_steps_per_second': 2.077, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7630676627159119, 'eval_accuracy': 0.8658536585365854, 'eval_runtime': 0.4914, 'eval_samples_per_second': 166.854, 'eval_steps_per_second': 2.035, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6765323877334595, 'eval_accuracy': 0.9146341463414634, 'eval_runtime': 0.4804, 'eval_samples_per_second': 170.678, 'eval_steps_per_second': 2.081, 'epoch': 7.0}


  0%|          | 0/1 [00:00<?, ?it/s]

Checkpoint destination directory classificador_transformers\checkpoint-10 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.5081140398979187, 'eval_accuracy': 0.975609756097561, 'eval_runtime': 0.4884, 'eval_samples_per_second': 167.88, 'eval_steps_per_second': 2.047, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

Checkpoint destination directory classificador_transformers\checkpoint-11 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.43156957626342773, 'eval_accuracy': 0.9878048780487805, 'eval_runtime': 0.5195, 'eval_samples_per_second': 157.852, 'eval_steps_per_second': 1.925, 'epoch': 9.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.3594989478588104, 'eval_accuracy': 0.9878048780487805, 'eval_runtime': 0.4734, 'eval_samples_per_second': 173.204, 'eval_steps_per_second': 2.112, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.2920590937137604, 'eval_accuracy': 0.9878048780487805, 'eval_runtime': 0.4754, 'eval_samples_per_second': 172.475, 'eval_steps_per_second': 2.103, 'epoch': 11.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.2374422252178192, 'eval_accuracy': 0.9878048780487805, 'eval_runtime': 0.4764, 'eval_samples_per_second': 172.112, 'eval_steps_per_second': 2.099, 'epoch': 12.0}
{'train_runtime': 196.3588, 'train_samples_per_second': 62.64, 'train_steps_per_second': 0.255, 'train_loss': 0.27528752599443707, 'epoch': 12.0}


TrainOutput(global_step=14, training_loss=0.27528752599443707, metrics={'train_runtime': 196.3588, 'train_samples_per_second': 62.64, 'train_steps_per_second': 0.255, 'train_loss': 0.27528752599443707, 'epoch': 12.0})

In [32]:
trainer.evaluate()

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.43156957626342773,
 'eval_accuracy': 0.9878048780487805,
 'eval_runtime': 0.4454,
 'eval_samples_per_second': 184.103,
 'eval_steps_per_second': 2.245,
 'epoch': 12.0}

In [33]:
label_list = list(id_to_intentions.keys())

In [34]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=1)
predictions

  0%|          | 0/1 [00:00<?, ?it/s]

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 0, 3, 0, 1, 0, 1, 3, 0, 1, 2, 1, 3,
       1, 1, 0, 3, 0, 3, 0, 2, 3, 0, 0, 1, 1, 2, 1, 0, 0, 0, 1, 1, 3, 0,
       3, 3, 1, 1, 1, 0, 1, 3, 3, 1, 2, 3, 3, 3, 2, 1, 0, 3, 0, 0, 3, 3,
       3, 1, 2, 0, 3, 0, 0, 2, 0, 1, 0, 0, 3, 0, 2, 2, 1], dtype=int64)

In [35]:
f1 = load("f1")
recall = load("recall")
precision = load("precision")

In [36]:
results_acc = accuracy.compute(predictions=predictions, references=tokenized_dataset["test"]["labels"])
results_f1 = f1.compute(predictions=predictions, references=tokenized_dataset["test"]["labels"],average='micro')
results_recall = recall.compute(predictions=predictions, references=tokenized_dataset["test"]["labels"], average='micro')
results_precision = precision.compute(predictions=predictions, references=tokenized_dataset["test"]["labels"], average='micro')

results_acc, results_f1, results_recall, results_precision

({'accuracy': 0.9879518072289156},
 {'f1': 0.9879518072289156},
 {'recall': 0.9879518072289156},
 {'precision': 0.9879518072289156})