In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports do Projeto

In [None]:
import torch
# Sanity check: prints whether PyTorch reports a usable CUDA device in this session.
print(torch.cuda.is_available())

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import time
import re
from transformers import (
    DistilBertTokenizer,  # Tokenizador específico do DistilBERT
    DistilBertForSequenceClassification,  # Modelo para classificação
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.cuda.amp import GradScaler, autocast
from sklearn.metrics import accuracy_score

# Pré-processamento dos dados

In [None]:
# Fix the random seed so that runs are reproducible.
# SEED is also reused as `random_state` for the train/validation split.
SEED = 42
np.random.seed(SEED)  # seeds NumPy's global RNG
torch.manual_seed(SEED)  # seeds PyTorch's RNG

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Usando dispositivo: {device}")

In [None]:
# Load the reviews CSV from the Kaggle input dataset and preview the first rows.
reviews_train_df = pd.read_csv('/kaggle/input/imdb-ptbr/imdb-reviews-pt-br.csv')
reviews_train_df.head()

In [None]:
# Encode the 'sentiment' column as binary integer labels (neg -> 0, pos -> 1).
SENTIMENT_TO_LABEL = {'neg': 0, 'pos': 1}
_sentiment_raw = reviews_train_df['sentiment']

# Skip the mapping when the column already holds only 0/1, so re-running this
# cell does not turn every label into NaN (Series.map yields NaN for unknown keys).
if not _sentiment_raw.isin(list(SENTIMENT_TO_LABEL.values())).all():
    _sentiment_encoded = _sentiment_raw.map(SENTIMENT_TO_LABEL)
    # Fail loudly instead of silently carrying NaN labels into training.
    if _sentiment_encoded.isna().any():
        _unexpected = _sentiment_raw[_sentiment_encoded.isna()].unique().tolist()
        raise ValueError(f"Valores inesperados na coluna 'sentiment': {_unexpected}")
    reviews_train_df['sentiment'] = _sentiment_encoded

In [None]:
# Preview the frame again to check the encoded 'sentiment' column.
reviews_train_df.head()

In [None]:
# Keep only the columns that matter for training: the Portuguese text and the label.
# .copy() makes this an independent DataFrame rather than a slice of
# reviews_train_df, so adding columns to it later does not trigger
# SettingWithCopyWarning or write through to the original frame.
reviews_train_df_novo = reviews_train_df[['text_pt', 'sentiment']].copy()
reviews_train_df_novo.head()

In [None]:
# Remove words that carry little semantic meaning using spaCy, keeping the context.

# Install spaCy and download the small Portuguese pipeline used below.
# NOTE(review): versions are not pinned; consider `%pip install -q spacy==<version>`
# so the install targets the kernel environment reproducibly.
!pip install spacy
!python -m spacy download pt_core_news_sm

import spacy
# Ask spaCy to use the GPU if one is available.
spacy.prefer_gpu()
# Load the pipeline once at module level; preprocess_text reuses this `nlp` object.
nlp = spacy.load("pt_core_news_sm")

def preprocess_text(text):
    """Lower-case, lemmatize and filter a review with the module-level spaCy pipeline.

    Args:
        text: The raw review. ``None`` and float NaN (pandas' missing-value
            marker) are treated as empty text; any other non-string value is
            converted with ``str()`` before processing.

    Returns:
        str: The lemmas of the kept tokens joined by single spaces. A token is
        kept only if it is purely alphabetic, longer than two characters, and
        is not punctuation, whitespace, a stop word, a URL or an e-mail address.
        Returns ``""`` for missing input.
    """
    # Missing cells have no text to clean; NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    doc = nlp(text.lower())

    cleaned_tokens = [
        token.lemma_ for token in doc
        if not token.is_punct
        and not token.is_space
        and not token.is_stop
        and not token.like_url
        and not token.like_email
        and token.is_alpha
        and len(token.text) > 2
    ]

    return " ".join(cleaned_tokens)

In [None]:
# Clean every review with preprocess_text and store the result in a new
# 'text_pt_processed' column.
# NOTE(review): this calls the spaCy pipeline once per row; presumably the slowest
# cell of the notebook — TODO confirm and consider batching.
reviews_train_df_novo.loc[:, 'text_pt_processed'] = reviews_train_df_novo['text_pt'].apply(preprocess_text)

In [None]:
# Preview the frame with the new 'text_pt_processed' column.
reviews_train_df_novo.head()

In [None]:
# Drop the raw text column; only the processed text is used from here on.
# errors='ignore' keeps this cell re-runnable: a second execution finds the
# column already gone and would otherwise raise KeyError.
reviews_train_df_novo = reviews_train_df_novo.drop(columns='text_pt', errors='ignore')

In [None]:
# Reorder the columns so the text comes before the label (purely cosmetic).
reviews_train_df_novo = reviews_train_df_novo[['text_pt_processed', 'sentiment']]
reviews_train_df_novo.head()

# Treinos e testes

In [None]:
# Load the tokenizer used to tokenize the dataset.
model_name = "adalbertojunior/distilbert-portuguese-cased"  # DistilBERT in Portuguese
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [None]:
# Split the data into training and validation sets: 20% is held out for
# validation, and SEED makes the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    reviews_train_df_novo['text_pt_processed'].values,
    reviews_train_df_novo['sentiment'].values,
    test_size=0.2,
    random_state=SEED
    )

In [None]:
# Convert both splits to plain Python lists whose items are guaranteed to be str.
train_texts = list(map(str, train_texts))
val_texts = list(map(str, val_texts))

In [None]:
# Show the container types of the training texts and labels.
print(type(train_texts))
print(type(train_labels))

In [None]:
# PyTorch Dataset wrapper: lets a DataLoader batch (and optionally shuffle) the
# tokenized reviews together with their labels.

class TextClassificationDataset(Dataset):
    """Tokenized texts paired with integer class labels.

    All texts are tokenized once, in the constructor, with ``padding=True``,
    ``truncation=True`` and ``return_tensors="pt"``; ``__getitem__`` then only
    indexes the pre-computed tensors.

    Args:
        texts: Iterable of texts (list, NumPy array, pandas Series, ...). Every
            item is converted with ``str()`` and passed to the tokenizer as a list.
        labels: Sequence of class ids, one per text. Stored as a ``torch.long``
            tensor regardless of the input dtype.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            whose result is indexed with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Maximum sequence length forwarded to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Normalise to list[str] so arrays/Series and stray non-str items are accepted.
        texts = [str(text) for text in texts]
        # Force an integer (long) dtype for the labels whatever dtype the input
        # array or list carries (e.g. int32 or float).
        self.labels = torch.tensor(labels, dtype=torch.long)

        # Validate before tokenizing so a mismatch fails fast.
        if len(texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(self.labels)}"
            )

        self.encodings = tokenizer(
            texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        )

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors of example ``idx``."""
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx],
        }

In [None]:
# Build the training and validation datasets; each one tokenizes its texts on construction.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer)

In [None]:
# Wrap the datasets in DataLoaders so the model is fed in mini-batches of `batch_size`.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # training batches are shuffled
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)  # validation order stays fixed

In [None]:
# Load the pretrained checkpoint with a sequence-classification head.
num_labels = len(set(train_labels))  # number of distinct classes (normally 2: negative/positive)

# DistilBertConfig names its dropout settings `dropout` and `attention_dropout`.
# The BERT-style names `hidden_dropout_prob` / `attention_probs_dropout_prob`
# used previously are not read by the DistilBERT layers, so the intended 0.3
# never took effect.
# NOTE(review): confirm that this checkpoint ships a DistilBERT-type config
# (model_type "distilbert"); if it is a BERT-type checkpoint the model class and
# these argument names need revisiting — TODO verify.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    attention_dropout=0.3,  # dropout on attention probabilities, to curb overfitting
    dropout=0.3,  # dropout on embeddings / fully connected layers
)
model.to(device)  # move to the GPU when available

In [None]:
# Loss function for classification.
# NOTE(review): train_loop_fn in this notebook reads `outputs.loss` from the model,
# so `criterion` appears to be unused — confirm before removing.
criterion = torch.nn.CrossEntropyLoss()

# AdamW optimizer, recommended for transformers.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
# Number of epochs and early-stopping settings.
num_epochs = 7  # maximum number of passes over the training data
best_val_loss = float("inf")  # lowest validation loss so far; written by train_loop_fn via `global`
patience = 3  # epochs without validation-loss improvement tolerated before stopping

def train_loop_fn(loader, model, optimizer, device, tokenizer):
    scaler = torch.amp.GradScaler('cuda')
    global best_val_loss
    patience_counter = 0
    best_model_path = None

    save_dir = "/content/drive/MyDrive/Colab Notebooks/imdb-nlp-analise-sentimento/final_model"
    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in loader:
            optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items()}

            with autocast():
                outputs = model(**inputs)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        avg_loss = total_loss / len(loader)
        print(f"🏋️ Época {epoch+1}/{num_epochs} | Loss Treino: {avg_loss:.4f}")

        # Validação
        model.eval()
        total_val_loss = 0
        predictions, true_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)

                total_val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                predictions.extend(preds.cpu().numpy())
                true_labels.extend(inputs['labels'].cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        accuracy = accuracy_score(true_labels, predictions)
        print(f"Validação | Loss: {avg_val_loss:.4f} | Acurácia: {accuracy:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0

            best_model_path = os.path.join(save_dir, f"modelo_epoca_{epoch+1}.pth")
            torch.save(model.state_dict(), best_model_path)
            print(f"Melhor modelo salvo em: {best_model_path}")
        else:
            patience_counter += 1
            print(f"Paciência: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("Early Stopping ativado!")
            break

    return best_model_path

In [None]:
# Run the training loop; it returns the path of the best checkpoint it saved.
best_model_path = train_loop_fn(train_loader, model, optimizer, device, tokenizer)

# train_loop_fn returns None when no epoch improved on the best validation loss;
# fail with a clear message instead of passing None to torch.load.
if best_model_path is None:
    raise RuntimeError(
        "Nenhum checkpoint foi salvo durante o treinamento; não há melhor modelo para carregar."
    )

# Restore the best weights. map_location keeps the tensors on the device in use,
# and weights_only=True restricts unpickling to tensor data (the checkpoint is a
# plain state_dict).
best_state_dict = torch.load(best_model_path, map_location=device, weights_only=True)
model.load_state_dict(best_state_dict)

# Export model and tokenizer in the save_pretrained format for MLOps use.
mlops_save_dir = "/kaggle/working/imdb-ptbrmodelo_bert_treinado"
os.makedirs(mlops_save_dir, exist_ok=True)

model.save_pretrained(mlops_save_dir)
tokenizer.save_pretrained(mlops_save_dir)

print(f"Modelo e tokenizer salvos para MLOps em: {mlops_save_dir}")