In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import joblib

# Load the dataset
df = pd.read_csv('youtoxic_english_1000.csv')

# Inputs come from the 'Text' column; the target is the 'IsToxic' column cast to int
X, y = df['Text'], df['IsToxic'].astype(int)

# Hold out 20% of the rows for testing; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# DistilBERT tokenizer ('distilbert-base-uncased' checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_texts(texts, tokenizer, max_length=512):
    """Tokenize a collection of texts and return the tokenizer's batch output.

    Args:
        texts: Iterable of texts; it is materialized into a list before being
            handed to the tokenizer.
        tokenizer: Callable invoked as ``tokenizer(list_of_texts, **options)``.
        max_length: Value forwarded as the tokenizer's ``max_length`` option
            (padding and truncation are both switched on).

    Returns:
        Whatever ``tokenizer`` returns for the batch, requested with
        ``return_tensors="pt"``.
    """
    options = {
        "max_length": max_length,
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    batch = list(texts)
    return tokenizer(batch, **options)

# Seed PyTorch's global RNG so the randomly initialized classification head,
# the DataLoader shuffling and dropout start from the same state on every
# "Restart & Run All" (same seed value as the train/test split).
torch.manual_seed(42)

# Tokenize both splits
train_encodings = tokenize_texts(X_train, tokenizer)
test_encodings = tokenize_texts(X_test, tokenizer)

# Build PyTorch datasets of (input_ids, attention_mask, label) triples
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(y_train.values),
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    torch.tensor(y_test.values),
)

# Load DistilBERT with a 2-label sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Pick the device (GPU when available, otherwise CPU) and move the model
# there BEFORE creating the optimizer, as the PyTorch optimizer docs recommend.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer: use PyTorch's AdamW instead of the deprecated `transformers.AdamW`.
# eps and weight_decay are set explicitly to the values the deprecated class
# used by default, so the training setup stays the same.
# NOTE(review): the `AdamW` name imported from `transformers` at the top of the
# notebook is no longer used by this cell and can be dropped from that import.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Función de entrenamiento
def train_model(model, train_loader, optimizer, device, epochs=3):
    """Train ``model`` in place for ``epochs`` passes over ``train_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``loss`` attribute.
        train_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches of tensors.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called once
            per batch.
        device: Device every tensor of a batch is moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress and the latest batch loss are shown through a tqdm bar.
    """
    model.train()
    for epoch_idx in range(epochs):
        progress = tqdm(train_loader, leave=True)
        for batch in progress:
            # Move the three tensors of the batch to the training device
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            # Standard step: clear gradients, forward, backward, update
            optimizer.zero_grad()
            step_loss = model(
                input_ids,
                attention_mask=attention_mask,
                labels=labels,
            ).loss
            step_loss.backward()
            optimizer.step()

            # Show the epoch number and the loss of the batch just processed
            progress.set_description(f'Epoch {epoch_idx}')
            progress.set_postfix(loss=step_loss.item())

# Batch iterators for training and testing (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Fine-tune the model
train_model(model, train_loader, optimizer, device)

# Evaluate on the held-out split: collect predicted and true labels batch by batch
y_preds, y_true = [], []
model.eval()

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        y_preds.extend(logits.argmax(dim=1).cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Classification report for the test split
print(classification_report(y_true, y_preds))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 50/50 [03:37<00:00,  4.34s/it, loss=0.634]
Epoch 1: 100%|██████████| 50/50 [03:30<00:00,  4.22s/it, loss=0.162] 
Epoch 2: 100%|██████████| 50/50 [03:29<00:00,  4.19s/it, loss=0.264] 


              precision    recall  f1-score   support

           0       0.80      0.75      0.78        93
           1       0.80      0.84      0.82       107

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.80      0.80      0.80       200



In [4]:
# Save the fine-tuned model object to the file 'nn_model.joblib'.
# NOTE(review): this serializes the whole Python object; loading it back
# presumably needs compatible library versions (and the same device
# availability) — confirm. Hugging Face models also provide
# `model.save_pretrained(...)` for persisting weights and config.
joblib.dump(model, 'nn_model.joblib')

['nn_model.joblib']

In [5]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertTokenizer

# Initialize the tokenizer (same 'distilbert-base-uncased' checkpoint that the training cell loads)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def prepare_input(text_list):
    """Encode ``text_list`` with the module-level ``tokenizer``.

    Args:
        text_list: Texts to encode; passed to the tokenizer unchanged.

    Returns:
        The tokenizer output for the batch, requested with padding,
        truncation and ``return_tensors="pt"``.
    """
    return tokenizer(
        text_list,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

def get_predictions(model, data, batch_size=32):
    """Return the predicted class index for every row of ``data``.

    The inputs are moved to the device the model's parameters live on, so the
    call also works when the model has been moved to a GPU, and they are fed
    through the model in mini-batches so a large split is not pushed through
    the network in a single forward pass.

    Args:
        model: Model called as ``model(**inputs)`` whose output exposes a
            ``logits`` attribute with one row of class scores per input row.
            It is switched to evaluation mode.
        data: Mapping of keyword name to tensor (for example the tokenizer
            output); every tensor is sliced along its first dimension.
        batch_size: Number of rows per forward pass. Must be positive.

    Returns:
        1-D NumPy array holding the argmax class index of each row; an empty
        array when ``data`` contains no rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()  # evaluation mode (e.g. disables dropout)

    # Run on whatever device the model lives on; fall back to the CPU for a
    # model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    columns = dict(data.items())
    n_rows = len(next(iter(columns.values()))) if columns else 0

    chunks = []
    with torch.no_grad():
        for start in range(0, n_rows, batch_size):
            stop = start + batch_size
            batch = {name: tensor[start:stop].to(target_device) for name, tensor in columns.items()}
            logits = model(**batch).logits
            chunks.append(torch.argmax(logits, dim=1).cpu())

    if not chunks:
        # No rows: return an empty int64 array instead of failing in torch.cat
        return torch.empty(0, dtype=torch.long).numpy()
    return torch.cat(chunks).numpy()

# Training split: encode the texts, predict, and score against the true labels
X_train_encoded = prepare_input(list(X_train))
y_train_pred = get_predictions(model, X_train_encoded)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)

# Test split: same three steps
X_test_encoded = prepare_input(list(X_test))
y_test_pred = get_predictions(model, X_test_encoded)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# "Overfitting" here is the gap between training accuracy and test accuracy
overfitting = train_accuracy - test_accuracy

# Print the three figures, one per line
for caption, value in (
    ("Precisión en entrenamiento:", train_accuracy),
    ("Precisión en prueba:", test_accuracy),
    ("Overfitting:", overfitting),
):
    print(caption, value)

# Per-class report for the test split
print("\nInforme de clasificación en prueba:")
print(classification_report(y_test, y_test_pred))

Precisión en entrenamiento: 0.9825
Precisión en prueba: 0.8
Overfitting: 0.1825

Informe de clasificación en prueba:
              precision    recall  f1-score   support

           0       0.80      0.75      0.78        93
           1       0.80      0.84      0.82       107

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.80      0.80      0.80       200



In [6]:
# Hand-written examples; the trailing comment on each line is the expected label
test_sentences = [
    "I love how diverse and multicultural our community is!",  # No hate
    "You're stupid and worthless because of your race",  # Hate
    "Everyone deserves to be treated with respect and dignity",  # No hate
    "All [ethnic group] should be banned from this country!",  # Hate
    "Had a great time learning about different cultures today",  # No hate
]

# Encode the sentences for the model
test_encoded = prepare_input(test_sentences)

# Predicted class index per sentence
predictions = get_predictions(model, test_encoded)

# Show each sentence with its predicted label.
# NOTE(review): the target the model was trained on is the 'IsToxic' column,
# while class 1 is displayed as "HATE SPEECH" — confirm this wording is intended.
for sentence, pred in zip(test_sentences, predictions):
    if pred == 1:
        label = "HATE SPEECH"
    else:
        label = "NOT HATE SPEECH"
    print(f"\nText: {sentence}")
    print(f"Prediction: {label}")


Text: I love how diverse and multicultural our community is!
Prediction: NOT HATE SPEECH

Text: You're stupid and worthless because of your race
Prediction: HATE SPEECH

Text: Everyone deserves to be treated with respect and dignity
Prediction: NOT HATE SPEECH

Text: All [ethnic group] should be banned from this country!
Prediction: HATE SPEECH

Text: Had a great time learning about different cultures today
Prediction: NOT HATE SPEECH
