In [None]:
!pip install datasets

In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
import numpy as np
import random
import json

In [None]:
# Checkpoint option 1 of 3. Every option cell rebinds MODEL and BATCH_SIZE,
# so the values in effect are those of whichever option cell ran last.
MODEL = "neuralmind/bert-large-portuguese-cased"
BATCH_SIZE = 35

In [None]:
# Checkpoint option 2 of 3. Every option cell rebinds MODEL and BATCH_SIZE,
# so the values in effect are those of whichever option cell ran last.
MODEL = "ricardoz/BERTugues-base-portuguese-cased"
BATCH_SIZE = 100

In [None]:
# Checkpoint option 3 of 3. Every option cell rebinds MODEL and BATCH_SIZE,
# so the values in effect are those of whichever option cell ran last.
MODEL = "google-bert/bert-base-multilingual-cased"
BATCH_SIZE = 100

In [None]:
# Number of output classes; passed as `num_labels` when the model is loaded.
# NOTE(review): convert_to_huggingface_2_classes() only emits labels after
# folding class 2 into class 1 — confirm 3 is intended when that dataset is used.
NUM_LABELS = 3

In [None]:
# JSON files opened by the convert_to_huggingface* loaders.
# NOTE(review): "string" looks like a placeholder — set real paths before running.
TRAIN_ARTICLES_PATH = "string"
TEST_ARTICLES_PATH = "string"

In [None]:
def convert_to_huggingface_2_classes(seed=None):
    """Load the train/test JSON files into a ``DatasetDict`` with class 2 merged into class 1.

    Reads ``TRAIN_ARTICLES_PATH`` and ``TEST_ARTICLES_PATH``. Each file must
    decode to an iterable of records exposing ``"text"`` and ``"class_label"``.
    A ``class_label`` equal to 2 becomes label 1; any other value is kept as
    its integer form. The training records are shuffled and split 90/10 into
    train/validation; the test file is used unchanged.

    Args:
        seed: Optional seed for the shuffle. With ``None`` (the default) the
            module-level ``random`` state is used, exactly as before.

    Returns:
        DatasetDict with the splits ``"train"``, ``"validation"`` and ``"test"``.

    Raises:
        OSError: If either file cannot be opened.
        KeyError, ValueError: If a record lacks a field or has a non-integer label.
    """
    def load_samples(path):
        # Explicit UTF-8 so decoding does not depend on the platform's
        # default locale encoding.
        with open(path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        samples = []
        for record in records:
            label = int(record["class_label"])
            samples.append({"text": record["text"], "label": 1 if label == 2 else label})
        return samples

    train_samples = load_samples(TRAIN_ARTICLES_PATH)
    test_samples = load_samples(TEST_ARTICLES_PATH)

    # A dedicated generator keeps a seeded split reproducible without
    # touching the global random state.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(train_samples)

    split_index = int(len(train_samples) * 0.9)

    return DatasetDict({
        "train": Dataset.from_list(train_samples[:split_index]),
        "validation": Dataset.from_list(train_samples[split_index:]),
        "test": Dataset.from_list(test_samples)
    })

In [None]:
def convert_to_huggingface(seed=None):
    """Load the train/test JSON files into a ``DatasetDict`` keeping the original labels.

    Reads ``TRAIN_ARTICLES_PATH`` and ``TEST_ARTICLES_PATH``. Each file must
    decode to an iterable of records exposing ``"text"`` and ``"class_label"``;
    the label is ``int(class_label)``. The training records are shuffled and
    split 90/10 into train/validation; the test file is used unchanged.

    Args:
        seed: Optional seed for the shuffle. With ``None`` (the default) the
            module-level ``random`` state is used, exactly as before.

    Returns:
        DatasetDict with the splits ``"train"``, ``"validation"`` and ``"test"``.

    Raises:
        OSError: If either file cannot be opened.
        KeyError, ValueError: If a record lacks a field or has a non-integer label.
    """
    def load_samples(path):
        # Explicit UTF-8 so decoding does not depend on the platform's
        # default locale encoding.
        with open(path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        return [
            {"text": record["text"], "label": int(record["class_label"])}
            for record in records
        ]

    train_samples = load_samples(TRAIN_ARTICLES_PATH)
    test_samples = load_samples(TEST_ARTICLES_PATH)

    # A dedicated generator keeps a seeded split reproducible without
    # touching the global random state.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(train_samples)

    split_index = int(len(train_samples) * 0.9)

    return DatasetDict({
        "train": Dataset.from_list(train_samples[:split_index]),
        "validation": Dataset.from_list(train_samples[split_index:]),
        "test": Dataset.from_list(test_samples)
    })

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1/precision/recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; the last three use ``average="weighted"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 produces the same values as scikit-learn's default
    # ("warn" also scores an undefined metric as 0) but does not emit an
    # UndefinedMetricWarning whenever a class is never predicted.
    weighted = {"average": "weighted", "zero_division": 0}

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, **weighted),
        "precision": precision_score(labels, predictions, **weighted),
        "recall": recall_score(labels, predictions, **weighted)
    }

In [None]:
# Build the train/validation/test splits with the original integer class labels.
dataset = convert_to_huggingface()

In [None]:
# Report how many examples ended up in each split, one count per line.
for split_name in ("train", "validation", "test"):
    print(len(dataset[split_name]))

In [None]:
# Rebind `dataset` to the variant where class 2 is merged into class 1.
# NOTE(review): when the notebook is run top to bottom this replaces the
# dataset built by convert_to_huggingface(), while NUM_LABELS stays at 3 —
# run only the dataset cell that matches the intended experiment.
dataset = convert_to_huggingface_2_classes()

In [None]:
# Tokenizer that belongs to the selected checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded and truncated to 512 tokens.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``; only
            ``examples["text"]`` is read.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenize the whole dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the fine-tuning Trainer; configured to pad to 512 tokens,
# the same length already applied in tokenize_function.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length", max_length=512)

# Splits used for fine-tuning and for the per-epoch evaluation.
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['validation']

In [None]:
# Load the selected checkpoint with a classification head of NUM_LABELS outputs.
model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=NUM_LABELS)

In [None]:
import inspect

# Newer transformers releases call this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. Use whichever name the installed
# TrainingArguments accepts so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; log every 100 steps to wandb.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="wandb",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=100,
    **{_eval_strategy_arg: "epoch"},
)

# Trainer used for fine-tuning and, later, for predicting on the test split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune `model` with the settings in `training_args`.
trainer.train()

In [None]:
# Zero-shot evaluation
# `model` is the object that trainer.train() fine-tunes, so when this cell is
# reached after training it no longer holds the pretrained weights. Load a
# fresh copy of the checkpoint so the numbers below describe the model before
# any fine-tuning, regardless of the order in which cells were run.
zero_shot_model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=NUM_LABELS)
zeroShotTrainer = Trainer(
  model=zero_shot_model,
  eval_dataset=tokenized_datasets["test"],
  compute_metrics=compute_metrics
)
zero_shot_results = zeroShotTrainer.evaluate()
# compute_metrics returns fractions; they are shown as percentages.
print(f"Zero-shot results for {MODEL}:\n")
print(f'Accuracy: {zero_shot_results.get("eval_accuracy") * 100:.1f}%')
print(f'F1: {zero_shot_results.get("eval_f1")* 100:.1f}%')
print(f'Recall: {zero_shot_results.get("eval_recall")* 100:.1f}%')
print(f'Precision: {zero_shot_results.get("eval_precision")* 100:.1f}%')

In [None]:
import time

# Evaluate on the test split and time the prediction pass. perf_counter() is
# a monotonic, high-resolution clock, so the measured interval is not affected
# by system clock adjustments the way time.time() differences can be.
start_time = time.perf_counter()
test_results = trainer.predict(tokenized_datasets["test"])
elapsed_seconds = time.perf_counter() - start_time
print(f"Tempo de avaliação: {elapsed_seconds} segundos")

# Extract the predictions and the metrics
predictions = test_results.predictions
metrics = test_results.metrics

# Show the metrics (compute_metrics returns fractions; shown as percentages)
print(f"Métricas de avaliação no conjunto de teste no {MODEL}:\n")
print(f'Accuracy: {metrics.get("test_accuracy") * 100:.1f}%')
print(f'F1: {metrics.get("test_f1")* 100:.1f}%')
print(f'Recall: {metrics.get("test_recall")* 100:.1f}%')
print(f'Precision: {metrics.get("test_precision")* 100:.1f}%')

In [None]:
# IGNORE: trainer.evaluate(tokenized_datasets["test"])