In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount and echo its full path.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-ptbr/imdb-reviews-pt-br.csv


# Imports do Projeto

## Ajuste para trabalhar com a TPU do Kaggle

In [None]:
!pip install --upgrade torch_xla
!pip install --upgrade torch
!pip install --upgrade transformers

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl (766.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m632.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cusparselt-cu12==0.6.2
  Downloading nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl (150.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.1/150.1 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting triton==3.2.0
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (253.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.1/253.1 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01

In [None]:
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.runtime as xr

## Imports Modelo

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import re

# Pré-processamento dos dados

In [None]:
# Fix the random seed for NumPy and PyTorch so that results are consistent between runs
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Read the Portuguese IMDB reviews CSV from the Kaggle input directory and preview the first rows
reviews_train_df = pd.read_csv('/kaggle/input/imdb-ptbr/imdb-reviews-pt-br.csv')
reviews_train_df.head()

In [None]:
# Convert the 'sentiment' column to binary labels (neg -> 0, pos -> 1).
# The guard keeps the cell re-runnable: mapping an already-converted column a
# second time would turn every value into NaN.
sentiment_to_label = {'neg': 0, 'pos': 1}
if not reviews_train_df['sentiment'].isin(list(sentiment_to_label.values())).all():
    mapped_sentiment = reviews_train_df['sentiment'].map(sentiment_to_label)
    if mapped_sentiment.isna().any():
        # Fail loudly instead of silently passing NaN labels downstream.
        unexpected = reviews_train_df.loc[mapped_sentiment.isna(), 'sentiment'].unique()
        raise ValueError(f"Unexpected sentiment values: {unexpected!r}")
    reviews_train_df['sentiment'] = mapped_sentiment.astype('int64')

In [None]:
# Keep only the columns that matter downstream. `.copy()` makes this an
# independent DataFrame, so adding a column to it later does not raise pandas'
# SettingWithCopyWarning.
reviews_train_df_novo = reviews_train_df[['text_pt', 'sentiment']].copy()
reviews_train_df_novo.head()

In [None]:
# Removal of words without semantic meaning using spaCy, to keep the context

!pip install spacy
!python -m spacy download pt_core_news_sm

import spacy

nlp = spacy.load("pt_core_news_sm")

def preprocess_text(text):
    """Lower-case, lemmatize and filter a review with the spaCy pipeline `nlp`.

    Args:
        text: Raw review text. Any value that is not a `str` (for example
            None, or the float NaN of a missing cell) is treated as an
            empty document.

    Returns:
        One string with the lemmas of the kept tokens joined by single spaces.
        A token is kept only when it is alphabetic, longer than two
        characters, and is not punctuation, whitespace, a stop word, a URL
        or an e-mail address.
    """
    # A non-string value has no `.lower()`; return an empty document instead
    # of raising AttributeError in the middle of a DataFrame `.apply`.
    if not isinstance(text, str):
        return ""

    doc = nlp(text.lower())

    cleaned_tokens = [
        token.lemma_ for token in doc
        if not token.is_punct
        and not token.is_space
        and not token.is_stop
        and not token.like_url
        and not token.like_email
        and token.is_alpha
        and len(token.text) > 2
    ]

    return " ".join(cleaned_tokens)

In [None]:
# `assign` returns a new DataFrame that carries the extra column. This avoids
# pandas' SettingWithCopyWarning, which the item assignment would raise because
# this frame was created as a column selection of `reviews_train_df`.
reviews_train_df_novo = reviews_train_df_novo.assign(
    text_pt_processed=reviews_train_df_novo['text_pt'].apply(preprocess_text)
)

In [None]:
# Preview the first rows of the frame
reviews_train_df_novo.head()

In [None]:
# Save the processed DataFrame.
# /kaggle/input is read-only, so the file goes to the writable working
# directory (/kaggle/working), which is preserved as notebook output.
reviews_train_df_novo.to_csv('/kaggle/working/reviews_train_df_novo.csv', index=False)

In [None]:
# Drop the raw text column. `errors='ignore'` keeps the cell re-runnable once
# the column is already gone (a plain drop would raise KeyError).
reviews_train_df_novo = reviews_train_df_novo.drop(columns='text_pt', errors='ignore')

In [None]:
# Reorder the columns (purely cosmetic)
reviews_train_df_novo = reviews_train_df_novo[['text_pt_processed', 'sentiment']]
reviews_train_df_novo.head()

# Treinos e testes

In [None]:
# Load the tokenizer of the 'neuralmind/bert-base-portuguese-cased' checkpoint
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [None]:
# Split the data into training and validation sets (20% held out), seeded with SEED
train_texts, val_texts, train_labels, val_labels = train_test_split(
    reviews_train_df_novo['text_pt_processed'].values,
    reviews_train_df_novo['sentiment'].values,
    test_size=0.2,
    random_state=SEED
    )

In [None]:
# Coerce every entry to `str`, producing plain Python lists of strings
train_texts = list(map(str, train_texts))
val_texts = list(map(str, val_texts))

In [None]:
# Sanity check: show the container types of the texts and of the labels
print(type(train_texts))
print(type(train_labels))

In [None]:
# A PyTorch Dataset lets a DataLoader batch the samples and load them in parallel

class TextClassificationDataset(Dataset):
    """Tokenized texts paired with integer class labels.

    All texts are tokenized once, in the constructor; `__getitem__` only
    indexes into the resulting tensors.

    Args:
        texts: Sequence of strings to tokenize.
        labels: Sequence or array of class indices, one per text.
        tokenizer: Callable invoked as
            `tokenizer(texts, padding=True, truncation=True,
            max_length=max_length, return_tensors="pt")`; its result must be
            indexable by "input_ids" and "attention_mask".
        max_length: Maximum sequence length handed to the tokenizer.

    Raises:
        ValueError: If `texts` and `labels` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(
            texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        )
        # Class-index targets must be int64 for cross-entropy; set the dtype
        # explicitly instead of inheriting whatever dtype `labels` has.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the token ids, attention mask and label of sample `idx`."""
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx],
        }

In [None]:
# Build the datasets for training and validation.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer)

In [None]:
# Build the DataLoaders that feed the model in batches of `batch_size`.
# Each loader draws its indices from a DistributedSampler configured with the
# XLA world size and this process's ordinal.
batch_size = 16


def _make_distributed_loader(dataset, shuffle, drop_last):
    """Return `(sampler, loader)` for `dataset`, partitioned across XLA processes.

    Args:
        dataset: Dataset to iterate over.
        shuffle: Whether the sampler shuffles the indices.
        drop_last: Whether the loader drops an incomplete final batch.
    """
    # torch_xla.runtime supplies the world size and ordinal; the older
    # xm.xrt_world_size()/xm.get_ordinal() helpers are deprecated.
    sampler = DistributedSampler(
        dataset,
        num_replicas=xr.world_size(),
        rank=xr.global_ordinal(),
        shuffle=shuffle,
    )
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        drop_last=drop_last,
        num_workers=4,
    )
    return sampler, loader


# Training: shuffled, incomplete final batch dropped.
train_sampler, train_loader = _make_distributed_loader(train_dataset, shuffle=True, drop_last=True)

# Validation: not shuffled, final partial batch kept.
val_sampler, val_loader = _make_distributed_loader(val_dataset, shuffle=False, drop_last=False)


In [None]:
# Load the pretrained BERT model for classification.
# `device` was referenced throughout the notebook but never defined; bind it to
# the XLA device here, before its first use.
device = xm.xla_device()
num_labels = len(set(train_labels))  # Number of distinct classes in the training labels
model = BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=num_labels).to(device)


In [None]:
# Loss function for classification
# NOTE(review): the training loop in this notebook reads `outputs.loss` from the model, so `criterion` looks unused — confirm
criterion = torch.nn.CrossEntropyLoss()

# AdamW optimizer, recommended for transformers
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
# Number of epochs and early-stopping settings
num_epochs = 5
best_val_loss = float("inf")  # Initial best validation loss
patience = 2  # Epochs without improvement tolerated before stopping

# The learning-rate scheduler is created inside the training function


def train_loop_fn(loader, model, optimizer, device):
    """Fine-tune `model` with per-epoch validation and early stopping.

    Reads the module-level `num_epochs`, `patience` and `val_loader`.

    Args:
        loader: Training DataLoader; its sampler must provide `set_epoch`.
        model: Model called as `model(**batch)` whose output exposes `.loss`.
        optimizer: Optimizer stepped through `xm.optimizer_step`.
        device: Device every batch tensor is moved to.
    """
    num_training_steps = len(loader) * num_epochs
    # `get_scheduler` was never imported in this notebook; the imported
    # `get_linear_schedule_with_warmup` builds the same linear schedule.
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    best_val_loss = float("inf")
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        loader.sampler.set_epoch(epoch)

        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            xm.optimizer_step(optimizer)
            optimizer.zero_grad()
            lr_scheduler.step()

        avg_loss = xm.mesh_reduce('avg_train_loss', total_loss / len(loader), lambda x: sum(x) / len(x))

        if xm.is_master_ordinal():
            print(f"🔹 Época {epoch+1}: Loss médio: {avg_loss:.4f}")

        # Validation
        model.eval()
        total_val_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)
                total_val_loss += outputs.loss.item()

        avg_val_loss = xm.mesh_reduce('avg_val_loss', total_val_loss / len(val_loader), lambda x: sum(x) / len(x))

        if xm.is_master_ordinal():
            print(f"🔸 Época {epoch+1}: Loss de Validação: {avg_val_loss:.4f}")

        # Early stopping. `avg_val_loss` is the mesh-reduced value, so it is
        # the same on every process; taking the decision on all of them makes
        # them leave the loop together. Breaking only on the master process
        # would leave the others waiting in the next `mesh_reduce`.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if xm.is_master_ordinal():
                print(f"⚠️ EarlyStopping: Nenhuma melhora na validação ({patience_counter}/{patience})")

        if patience_counter >= patience:
            if xm.is_master_ordinal():
                print("⏹️ EarlyStopping ativado! Interrompendo o treinamento.")
            break

# Per-process entry point for XLA multiprocessing
TRAINED_STATE_PATH = "/kaggle/working/bert_finetuned_state.pt"


def _mp_fn(index):
    """Train one model replica inside a process started by `xmp.spawn`.

    Reads the module-level `num_labels` and `train_loader`.

    Args:
        index: Process index supplied by `xmp.spawn` (not used here).
    """
    # `device` must be resolved inside the spawned process.
    device = xm.xla_device()
    model = BertForSequenceClassification.from_pretrained(
        "neuralmind/bert-base-portuguese-cased", num_labels=num_labels
    ).to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # The training call belongs inside this function: at module level it ran
    # outside the spawned processes and left `_mp_fn` building a model and
    # optimizer that were never used.
    # NOTE(review): `train_loader` (and the `val_loader` used by the training
    # loop) are built at notebook level, so their samplers' rank/world size
    # were computed there — confirm each process gets its own shard.
    train_loop_fn(train_loader, model, optimizer, device)

    # This replica disappears when the process exits, so persist its weights.
    # `xm.save` is called by every process and writes from the master only.
    xm.save(model.state_dict(), TRAINED_STATE_PATH)


# Start distributed training
xmp.spawn(_mp_fn, args=())

# Copy the trained weights into the notebook-level `model`, so that the cells
# that use it afterwards work with the fine-tuned parameters.
model.load_state_dict(torch.load(TRAINED_STATE_PATH, map_location="cpu"))


In [None]:
def evaluate(model, val_loader):
    """Compute the classification accuracy of `model` over `val_loader`.

    Each process predicts on its own batches. Predictions and labels are then
    gathered across processes with one `xm.mesh_reduce` call each, so every
    process computes and returns the same accuracy; only the master prints it.

    Reads the module-level `device`.

    Args:
        model: Model called as `model(**batch)` whose output exposes `.logits`.
        val_loader: DataLoader yielding dict batches that include "labels".

    Returns:
        The accuracy computed by `accuracy_score`.
    """
    model.eval()
    local_preds, local_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            batch_preds = torch.argmax(outputs.logits, dim=1)

            local_preds.append(batch_preds.cpu().numpy())
            local_labels.append(inputs['labels'].cpu().numpy())

    # One cross-process gather per array instead of one per batch.
    predictions = xm.mesh_reduce('preds', np.concatenate(local_preds), lambda parts: np.concatenate(parts))
    true_labels = xm.mesh_reduce('labels', np.concatenate(local_labels), lambda parts: np.concatenate(parts))

    # Computed on every process: previously `acc` was assigned only on the
    # master, so `return acc` raised UnboundLocalError everywhere else.
    acc = accuracy_score(true_labels, predictions)
    if xm.is_master_ordinal():
        print(f"🎯 Acurácia: {acc:.4f}")
    return acc

# Salvando Modelo

In [None]:
# Path where the model will be saved
save_path = "/kaggle/working/imdb-ptbrmodelo_bert_treinado"

# Save the trained model
model.save_pretrained(save_path)

# Save the tokenizer as well (important for loading the model later)
tokenizer.save_pretrained(save_path)