In [2]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Cargar los datos preprocesados
train = pd.read_csv('train_clean.csv')

# Descargar recursos necesarios de NLTK
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Tokenización y lematización
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

train['clean_message'] = train['clean_message'].apply(preprocess_text)

# Construir el vocabulario
all_tokens = [token for message in train['clean_message'] for token in message]
vocab = list(set(all_tokens))
vocab_to_int = {word: i+1 for i, word in enumerate(vocab)}

# Convertir los textos a secuencias de enteros
def text_to_sequence(text):
    return [vocab_to_int[word] for word in text]

train['clean_message_seq'] = train['clean_message'].apply(text_to_sequence)

# Padding de secuencias
def pad_sequences(sequences, maxlen):
    padded = np.zeros((len(sequences), maxlen), dtype=int)
    for i, seq in enumerate(sequences):
        if len(seq) > maxlen:
            padded[i, :maxlen] = seq[:maxlen]
        else:
            padded[i, :len(seq)] = seq
    return padded

maxlen = max(train['clean_message_seq'].apply(len))
X = pad_sequences(train['clean_message_seq'], maxlen)

# Codificación de etiquetas
encoder = LabelEncoder()
y = encoder.fit_transform(train['label'])

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convertir a tensores
X_train = torch.tensor(X_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Crear DataLoader
train_data = torch.utils.data.TensorDataset(X_train, y_train)
test_data = torch.utils.data.TensorDataset(X_test, y_test)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=32)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vilch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vilch\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vilch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Program Files\WindowsApps\PythonSoftwareFoundatio

In [8]:
import torch.nn as nn

class ImprovedRNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, n_layers=2, dropout=0.5):
        super(ImprovedRNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers=n_layers, 
                          batch_first=True, bidirectional=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # *2 para bidireccionalidad
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = out[:, -1, :]  # Obtener la salida del último paso de tiempo
        out = self.fc(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab_to_int) + 1
embed_size = 100
hidden_size = 128  # Tamaño de la capa oculta
output_size = 1
n_layers = 2  # Número de capas recurrentes
dropout = 0.5  # Dropout

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ImprovedRNNModel(vocab_size, embed_size, hidden_size, output_size, n_layers, dropout).to(device)



In [9]:
import torch.optim as optim

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Regularización L2

# Entrenar el modelo
n_epochs = 10
model.train()

for epoch in range(n_epochs):
    total_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Mover datos a la GPU
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluar el modelo
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Mover datos a la GPU
        outputs = model(inputs)
        y_pred.extend(outputs.squeeze().cpu().tolist())  # Mover predicciones a la CPU
        y_true.extend(labels.cpu().tolist())  # Mover etiquetas a la CPU

# Convertir predicciones a etiquetas binarias
y_pred = [1 if pred >= 0.5 else 0 for pred in y_pred]

# Evaluar el modelo
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(classification_report(y_true, y_pred))


Epoch 1, Loss: 0.701944545173645
Epoch 2, Loss: 0.697131266117096
Epoch 3, Loss: 0.6958057615280151
Epoch 4, Loss: 0.69754060754776
Epoch 5, Loss: 0.6976643171310425
Epoch 6, Loss: 0.6973668911933899
Epoch 7, Loss: 0.6972004951477051
Epoch 8, Loss: 0.6978647116661072
Epoch 9, Loss: 0.6979470928192139
Epoch 10, Loss: 0.6981397317886353
Accuracy: 0.4892
Classification Report:
              precision    recall  f1-score   support

         0.0       0.49      1.00      0.66      2446
         1.0       0.00      0.00      0.00      2554

    accuracy                           0.49      5000
   macro avg       0.24      0.50      0.33      5000
weighted avg       0.24      0.49      0.32      5000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
