In [8]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

# Cargar los datos
train = pd.read_csv('train.csv')

# Limpiar y tokenizar textos
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

train['message'] = train['message'].apply(preprocess_text)

# Crear vocabulario
all_words = [word for tokens in train['message'] for word in tokens]
vocab = sorted(set(all_words))
word2idx = {word: idx+1 for idx, word in enumerate(vocab)} # +1 para reservar el índice 0 para padding

# Convertir palabras a índices
def encode_text(tokens):
    return [word2idx[word] for word in tokens]

train['message'] = train['message'].apply(encode_text)

# Codificar etiquetas
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['label'])

# Padding de secuencias
def pad_sequences(seq, maxlen):
    if len(seq) < maxlen:
        seq = [0]*(maxlen-len(seq)) + seq
    else:
        seq = seq[:maxlen]
    return seq

MAXLEN = 100  # Longitud máxima de las secuencias
train['message'] = train['message'].apply(lambda x: pad_sequences(x, MAXLEN))

# Dividir en conjunto de entrenamiento y validación
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train['message'].tolist(), train['label'].tolist(), test_size=0.2, random_state=42
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vilch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vilch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                             message label
0  [saw, movie, new, york, city, waiting, bus, ne...   neg
1  [german, film, 1974, something, women, come, c...   neg
2  [attempted, watching, movie, twice, even, fast...   neg
3  [birthday, small, boys, tells, mother, son, wa...   neg
4  [person, wrote, review, enough, sweating, spit...   pos


In [9]:
# Dataset y DataLoader
class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SentimentGRU(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=2, dropout=0.5):
        super(SentimentGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Parámetros ajustados
VOCAB_SIZE = len(word2idx) + 1
EMBED_SIZE = 128
HIDDEN_SIZE = 256  # Aumentar tamaño de la capa oculta
OUTPUT_SIZE = 1
NUM_LAYERS = 2
DROPOUT = 0.5

model = SentimentGRU(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, DROPOUT).to(device)


In [18]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=20):
    model.train()
    for epoch in range(num_epochs):
        train_loss = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs.squeeze(), labels.float())
            train_loss += loss.item()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        train_loss /= len(train_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}')
        
        # Evaluación en el conjunto de validación
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for val_texts, val_labels in val_loader:
                val_texts, val_labels = val_texts.to(device), val_labels.to(device)
                val_outputs = model(val_texts)
                val_loss += criterion(val_outputs.squeeze(), val_labels.float()).item()
                predicted = torch.round(torch.sigmoid(val_outputs.squeeze()))
                total += val_labels.size(0)
                correct += (predicted == val_labels).sum().item()
        
        val_loss /= len(val_loader)
        val_accuracy = correct / total
        print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy * 100:.2f}%')
        model.train()

train_model(model, train_loader, val_loader, criterion, optimizer)


Epoch 1/20, Train Loss: 0.6630
Validation Loss: 0.6855, Validation Accuracy: 51.94%
Epoch 2/20, Train Loss: 0.6678
Validation Loss: 0.5997, Validation Accuracy: 69.48%
Epoch 3/20, Train Loss: 0.4324
Validation Loss: 0.4203, Validation Accuracy: 81.56%
Epoch 4/20, Train Loss: 0.2688
Validation Loss: 0.4475, Validation Accuracy: 81.44%
Epoch 5/20, Train Loss: 0.1716
Validation Loss: 0.4480, Validation Accuracy: 83.60%
Epoch 6/20, Train Loss: 0.1185
Validation Loss: 0.4949, Validation Accuracy: 83.56%
Epoch 7/20, Train Loss: 0.0708
Validation Loss: 0.5771, Validation Accuracy: 83.00%
Epoch 8/20, Train Loss: 0.0502
Validation Loss: 0.7489, Validation Accuracy: 83.10%
Epoch 9/20, Train Loss: 0.0290
Validation Loss: 0.7539, Validation Accuracy: 81.90%
Epoch 10/20, Train Loss: 0.0249
Validation Loss: 0.7373, Validation Accuracy: 83.16%
Epoch 11/20, Train Loss: 0.0207
Validation Loss: 0.7721, Validation Accuracy: 82.28%
Epoch 12/20, Train Loss: 0.0189
Validation Loss: 0.9889, Validation Accura

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            predictions = torch.round(torch.sigmoid(outputs.squeeze()))
            all_labels.extend(labels.cpu().tolist())
            all_predictions.extend(predictions.cpu().tolist())
    
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    cm = confusion_matrix(all_labels, all_predictions)
    
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print('Confusion Matrix:')
    print(cm)

evaluate_model(model, val_loader)


Accuracy: 82.44%
Precision: 0.8560
Recall: 0.7890
F1 Score: 0.8211
Confusion Matrix:
[[2107  339]
 [ 539 2015]]
