<a href="https://colab.research.google.com/github/Bmartins25/LLM-Engineering/blob/main/Otimizidador_ModeloNeural_SacodePalavras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

In [None]:
import torch
import random
from torch.utils.data import Dataset, DataLoader, random_split
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
import numpy as np
import re



In [None]:
# Load the dataset registered under the name "imdb" via `load_dataset`.
dataset = load_dataset("imdb")



In [None]:
# Tokenizer shared by vocabulary construction and dataset encoding.
# Patterns are compiled once because the tokenizer runs on every review.
_BR_TAG_RE = re.compile(r"<br\s*/?>")
_WORD_RE = re.compile(r"\w+(?:'\w+)*")


def tokenize(text):
    """Split *text* into lowercase word tokens.

    Compared with a plain ``text.lower().split()`` this:

    * replaces HTML line breaks (``<br>``, ``<br/>``, ``<br />``) with a
      space so the markup never becomes a token;
    * drops punctuation attached to words, so ``"great!"`` and ``"great"``
      map to the same token;
    * keeps inner apostrophes, so contractions such as ``"don't"`` stay whole.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of lowercase tokens in order of appearance (empty for text with
        no word characters).
    """
    lowered = text.lower()
    without_breaks = _BR_TAG_RE.sub(" ", lowered)
    return _WORD_RE.findall(without_breaks)



In [None]:
# Vocabulary construction
def build_vocab(texts, max_size=20000):
    """Build a token -> index mapping from the most frequent tokens.

    Args:
        texts: Iterable of raw strings; each one is split with ``tokenize``.
        max_size: Maximum number of distinct tokens to keep.

    Returns:
        Dict mapping each kept token to its frequency rank, starting at 1
        for the most frequent token (index 0 is never assigned).
    """
    frequencies = Counter(token for text in texts for token in tokenize(text))
    ranked = frequencies.most_common(max_size)
    return {word: rank for rank, (word, _count) in enumerate(ranked, start=1)}



In [None]:
# Create the training and validation sets (80% / 20% of the "train" split).
# The split uses an explicitly seeded generator so that the same examples land
# in each subset on every run; without it the partition changes from run to
# run and results are not comparable.
SPLIT_SEED = 42
data = dataset["train"]
test_data = dataset["test"]
train_size = int(0.8 * len(data))
val_size = len(data) - train_size  # remainder, so the two sizes always sum to len(data)
train_data, val_data = random_split(
    data,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)




In [None]:
# Custom dataset: encodes each review as a bag-of-words count vector.
class IMDBDataset(Dataset):
    """Map-style dataset yielding ``(bag_of_words, label)`` pairs.

    Every item is a float vector of fixed length ``len(vocab)``, so samples of
    different text lengths can be stacked into a batch by the default
    ``DataLoader`` collate function and fed to a ``Linear(len(vocab), ...)``
    layer.

    Args:
        data: Indexable collection whose items expose ``"text"`` and
            ``"label"`` keys.
        vocab: Mapping from token to a 1-based index. Indices are assumed to
            lie in ``1..len(vocab)``; token index ``i`` is counted in slot
            ``i - 1`` of the output vector.
    """

    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        """Return the number of records in the underlying data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(counts, label)`` for record *idx*.

        ``counts`` is a float32 tensor of shape ``(len(vocab),)`` holding how
        many times each vocabulary token occurs in the text; tokens missing
        from the vocabulary are ignored. ``label`` is a float32 scalar tensor.
        """
        record = self.data[idx]  # fetch the row once instead of once per field
        indices = (self.vocab.get(token, 0) for token in tokenize(record["text"]))
        # 0 means "not in vocabulary"; shift the rest to 0-based slots.
        slots = [index - 1 for index in indices if index > 0]
        counts = torch.bincount(
            torch.tensor(slots, dtype=torch.long),
            minlength=len(self.vocab),
        )
        label = torch.tensor(record["label"], dtype=torch.float32)
        return counts.to(torch.float32), label



In [None]:
# Build the vocabulary from the texts of the training subset.
dataset_vocab = build_vocab(sample["text"] for sample in train_data)



In [None]:
# Create the loaders: wrap each split in an IMDBDataset, then batch it.
# Only the training loader shuffles its samples.
BATCH_SIZE = 32
train_set = IMDBDataset(train_data, dataset_vocab)
val_set = IMDBDataset(val_data, dataset_vocab)
test_set = IMDBDataset(test_data, dataset_vocab)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)



In [None]:
# Neural model
class SentimentModel(nn.Module):
    """Feed-forward binary classifier over fixed-size feature vectors.

    Architecture: ``Linear(vocab_size, 128) -> ReLU -> Linear(128, 1) -> Sigmoid``.

    The ReLU between the two linear layers is what makes the hidden layer
    useful: two linear layers applied back to back are equivalent to a single
    linear map. ReLU has no parameters, so the ``state_dict`` layout is
    unchanged.

    Args:
        vocab_size: Number of input features per sample.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.fc1 = nn.Linear(vocab_size, 128)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return probabilities of shape ``(batch, 1)`` for input ``(batch, vocab_size)``.

        The input is cast to float before the first layer.
        """
        hidden = torch.relu(self.fc1(x.float()))
        return self.sigmoid(self.fc2(hidden))



In [None]:
# Instantiate the model, the loss function and the optimizer.
input_dim = len(dataset_vocab)  # one input feature per vocabulary entry
model = SentimentModel(input_dim)
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)  # adjusted learning rate



In [None]:
# Training loop: one optimisation pass over train_loader per epoch, followed by
# a gradient-free pass over val_loader. The reported losses are the mean of the
# per-batch losses.
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # Drop only the trailing feature dimension. A bare squeeze() would also
        # remove the batch dimension when a batch holds a single sample, and
        # the loss would then reject the shape mismatch with the labels.
        outputs = model(inputs).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")



In [None]:
# Compute the accuracy of the model
def calculate_accuracy(model, dataloader, device=None):
    """Return the fraction of samples in *dataloader* the model classifies correctly.

    The model is expected to emit one probability per sample (for example a
    ``(batch, 1)`` sigmoid output); a probability of at least 0.5 counts as
    class 1. Taking an argmax over a single output column would always
    predict class 0.

    Args:
        model: ``torch.nn.Module`` to evaluate; it is switched to eval mode.
        dataloader: Iterable of ``(inputs, labels)`` tensor pairs, with labels
            holding 0/1 values.
        device: Device the batches are moved to. When ``None``, the device of
            the model's first parameter is used (CPU if the model has no
            parameters). The model itself is not moved.

    Returns:
        Accuracy in ``[0, 1]`` as a float; ``0.0`` when the loader yields no
        samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            targets = labels.to(device).reshape(-1).long()
            # Flatten (batch, 1) -> (batch,) so predictions align with targets.
            probabilities = model(inputs).reshape(-1)
            predicted = (probabilities >= 0.5).long()
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    return correct / total if total else 0.0



In [None]:
# Compute the final accuracy on the test loader.
# Evaluate on the device the model's parameters already live on, since the
# model is never moved explicitly.
device = next(model.parameters()).device
accuracy = calculate_accuracy(model, test_loader, device)



In [None]:
# Print only the final accuracy, separately from the per-epoch output
print(f"Final Accuracy: {accuracy * 100:.2f}%")



In [None]:
# Fail loudly when the final accuracy is below the 65% minimum.
min_accuracy = 0.65
if accuracy < min_accuracy:
    raise ValueError(
        "Modelo não atingiu a acurácia mínima de 65%. Ajuste os hiperparâmetros e tente novamente."
    )