<a href="https://colab.research.google.com/github/Bmartins25/LLM-Engineering/blob/main/Otimizidador_ModeloNeural_SacodePalavras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import torch
import random
from torch.utils.data import Dataset, DataLoader, random_split
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
import numpy as np
import re



In [7]:
# Carregar dataset
dataset = load_dataset("imdb")



In [8]:
# Melhorar o tokenizador
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remover pontuação
    return text.split()



In [9]:
# Construção do vocabulário
def build_vocab(texts, max_size=20000):
    counter = Counter()
    for text in texts:
        counter.update(tokenize(text))
    return {word: idx for idx, (word, _) in enumerate(counter.most_common(max_size))}



In [10]:
# Criar conjunto de treino e validação
data = dataset["train"]
test_data = dataset["test"]
train_size = int(0.85 * len(data))  # Melhor divisão treino/validação
val_size = len(data) - train_size
train_data, val_data = random_split(data, [train_size, val_size])



In [11]:
# Definir dataset personalizado
class IMDBDataset(Dataset):
    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        label = self.data[idx]["label"]
        tokens = tokenize(text)
        indices = [self.vocab.get(token, 0) for token in tokens]
        return indices, label



In [12]:
# Função para converter batch em Bag of Words
def collate_fn(batch):
    texts, labels = zip(*batch)
    vocab_size = len(dataset_vocab)
    bow_vectors = torch.zeros(len(texts), vocab_size)
    for i, text in enumerate(texts):
        for idx in text:
            if idx < vocab_size:
                bow_vectors[i, idx] += 1  # Contagem de palavras
    labels = torch.tensor(labels, dtype=torch.float32)
    return bow_vectors, labels



In [13]:
# Criar vocabulário
dataset_vocab = build_vocab([x["text"] for x in train_data])



In [14]:
# Criar loaders corrigidos
train_loader = DataLoader(IMDBDataset(train_data, dataset_vocab), batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(IMDBDataset(val_data, dataset_vocab), batch_size=64, collate_fn=collate_fn)
test_loader = DataLoader(IMDBDataset(test_data, dataset_vocab), batch_size=64, collate_fn=collate_fn)



In [15]:
# Modelo Neural
class SentimentModel(nn.Module):
    def __init__(self, vocab_size):
        super(SentimentModel, self).__init__()
        self.fc1 = nn.Linear(vocab_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)



In [16]:
# Instanciar modelo
model = SentimentModel(len(dataset_vocab))
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9)  # Ajuste do learning rate e momentum



In [20]:
# Loop de treinamento
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validação
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}, Accuracy: {accuracy:.2f}%")


Epoch 1/5, Train Loss: 0.6566, Val Loss: 0.5811, Accuracy: 69.63%
Epoch 2/5, Train Loss: 0.6528, Val Loss: 0.6919, Accuracy: 51.79%
Epoch 3/5, Train Loss: 0.6856, Val Loss: 0.6614, Accuracy: 60.64%
Epoch 4/5, Train Loss: 0.6660, Val Loss: 0.7264, Accuracy: 66.99%
Epoch 5/5, Train Loss: 0.5852, Val Loss: 0.5140, Accuracy: 79.92%
