## Setup

In [18]:
%%capture
!pip install datasets
!pip install evaluate

In [19]:
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("WB")

wandb.login(key=secret_value_0)
run = wandb.init(
    project='BertKaggle', 
    job_type="training", 
    anonymous="allow"
)



## Carregando e processando dataset

In [20]:
import pandas as pd
from datasets import load_dataset

dataset = load_dataset("csv", data_files="/kaggle/input/nlp-getting-started/train.csv")
dataset = dataset.remove_columns(['id', 'keyword', 'location'])
dataset = dataset.rename_column("target", "labels")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 7613
    })
})

In [21]:
dataset = dataset['train'].train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 6851
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 762
    })
})

## Tokenizer Dataset
Como você saber a tokenização é uma etapa fundamental para treinar modelos de NLP. Esta etapa consiste em transformar os dados de um modo que o nosso LLM possa compreender. Neste caso o BERT. Também observe que cada LLM tem um Tokenizer específico.

In [22]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

In [23]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6851
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 762
    })
})

In [24]:
# Crie versões menores do dataset (para um treinamento mais rápido)
small_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["test"]

## Treinar com Pytorch Trainer


In [25]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=2,
    torch_dtype="auto"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Avaliar seu modelo
Como a class Trainer não avalia nosso modelo, precisamos fazer isto manualmente.

In [26]:
import numpy as np
import evaluate

from transformers import TrainingArguments, Trainer

metric = evaluate.load("accuracy")

# Crie uma função para computar a métrica
def compute_metrics(eval_pred):
    logits, target = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=target)

# Escolha a estratégia para avaliar seu modelo
# training_args = TrainingArguments(eval_strategy="epoch")

training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy = "epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    save_strategy = "epoch",
    learning_rate=3e-5,
    num_train_epochs=2,
    weight_decay=1e-5,
    logging_steps=1
)




In [27]:
# Execute o treino
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.1051,0.39473,0.83727
2,0.0592,0.396483,0.822835




TrainOutput(global_step=216, training_loss=0.41047511199765185, metrics={'train_runtime': 786.6156, 'train_samples_per_second': 17.419, 'train_steps_per_second': 0.275, 'total_flos': 3605147680542720.0, 'train_loss': 0.41047511199765185, 'epoch': 2.0})

## Carregar e Inferência

In [28]:
trainer.save_model("myModel")

In [187]:
from transformers import pipeline
pipe = pipeline('text-classification', model='/kaggle/working/myModel', tokenizer=tokenizer, device=0)
print(f'{round(pipe("Fire!")[0]["score"], 3)}')

Device set to use cuda:0


0.724


In [188]:
import datasets
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

test = datasets.load_dataset("csv", data_files="/kaggle/input/nlp-getting-started/test.csv", split='train')
test = test.remove_columns(['id', 'keyword', 'location'])
test

Dataset({
    features: ['text'],
    num_rows: 3263
})

In [189]:
test_tk = test.map(tokenize_function)
test_tk

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3263
})

In [190]:
preds = []
for out in tqdm(pipe(KeyDataset(test, "text"))):
    preds.append(out['score'])

  0%|          | 0/3263 [00:00<?, ?it/s]

In [191]:
sub = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sub['target'] = preds
sub

Unnamed: 0,id,target
0,0,0.974353
1,2,0.978115
2,3,0.969103
3,9,0.976470
4,11,0.984125
...,...,...
3258,10861,0.703833
3259,10865,0.991611
3260,10868,0.990959
3261,10874,0.780585
