In [2]:
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Cargar el dataset
df = pd.read_csv('youtoxic_english_1000.csv')

# Preparar los datos
X = df['Text']
y = df['IsToxic'].astype(int)  # Asegurarse de que las etiquetas sean enteros

# Dividir los datos en conjuntos de entrenamiento, validación y prueba
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Cargar el modelo de SpaCy en inglés
nlp = spacy.load('en_core_web_sm')

# Función para procesar el texto con SpaCy
def process_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct])

# Aplicar el procesamiento de texto
X_train_processed = X_train.apply(process_text)
X_val_processed = X_val.apply(process_text)
X_test_processed = X_test.apply(process_text)

# Vectorizar el texto
vectorizer = CountVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train_processed)
X_val_vectorized = vectorizer.transform(X_val_processed)
X_test_vectorized = vectorizer.transform(X_test_processed)

# Convertir a tensores de PyTorch
X_train_tensor = torch.FloatTensor(X_train_vectorized.toarray())
y_train_tensor = torch.FloatTensor(y_train.values)
X_val_tensor = torch.FloatTensor(X_val_vectorized.toarray())
y_val_tensor = torch.FloatTensor(y_val.values)
X_test_tensor = torch.FloatTensor(X_test_vectorized.toarray())
y_test_tensor = torch.FloatTensor(y_test.values)

# Definir el modelo de red neuronal
class NeuralNet(nn.Module):
    def __init__(self, input_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.relu(self.fc1(x))
        out = self.relu(self.fc2(out))
        out = self.sigmoid(self.fc3(out))
        return out

# Crear el modelo
input_size = X_train_vectorized.shape[1]
model = NeuralNet(input_size)

# Definir la función de pérdida y el optimizador
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Entrenar el modelo
num_epochs = 100
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_train_tensor), batch_size):
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]

        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor.unsqueeze(1))
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Val Loss: {val_loss.item():.4f}')

# Evaluación final
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_preds = (test_outputs > 0.5).float()
    test_accuracy = (test_preds.squeeze() == y_test_tensor).float().mean()

print(f'Final Test Accuracy: {test_accuracy.item():.4f}')
print('\nTest Classification Report:')
print(classification_report(y_test.tolist(), test_preds.squeeze().tolist()))

# Guardar el modelo y el vectorizador
joblib.dump(model.state_dict(), 'nn_model.pth')
joblib.dump(vectorizer, 'nn_vectorizer.joblib')

print("Modelo de red neuronal y vectorizador guardados exitosamente.")

# Función para predecir
def predict_hate_speech(text):
    processed_text = process_text(text)
    vectorized_text = vectorizer.transform([processed_text])
    input_tensor = torch.FloatTensor(vectorized_text.toarray())
    with torch.no_grad():
        output = model(input_tensor)
    prediction = (output > 0.5).float().item()
    return "Frase odiosa" if prediction == 1 else "Frase no odiosa"

# Ejemplo de uso
texto_ejemplo = "You are an idiot and I hate you"
resultado = predict_hate_speech(texto_ejemplo)
print(f"Predicción: {resultado}")

Epoch [10/100], Val Loss: 1.8729
Epoch [20/100], Val Loss: 2.6904
Epoch [30/100], Val Loss: 4.2521
Epoch [40/100], Val Loss: 5.1044
Epoch [50/100], Val Loss: 5.3306
Epoch [60/100], Val Loss: 5.5255
Epoch [70/100], Val Loss: 5.6962
Epoch [80/100], Val Loss: 6.3992
Epoch [90/100], Val Loss: 6.5268
Epoch [100/100], Val Loss: 7.1963
Final Test Accuracy: 0.7333

Test Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.88      0.78        80
           1       0.80      0.57      0.67        70

    accuracy                           0.73       150
   macro avg       0.75      0.72      0.72       150
weighted avg       0.75      0.73      0.73       150

Modelo de red neuronal y vectorizador guardados exitosamente.
Predicción: Frase odiosa
