In [54]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, RobertaForSequenceClassification
from sklearn.model_selection import KFold, StratifiedGroupKFold, train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
from torch.nn import functional as F 
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
import pickle
import os
import random

In [14]:
df = pd.read_csv("../data/youtoxic_english_processed.csv")
df = df[["clean_text", "IsToxic"]]
df.head()


Unnamed: 0,clean_text,IsToxic
0,people would take step back make case anyone e...,0
1,law enforcement trained shoot apprehend traine...,1
2,dont reckon black lives matter banners held wh...,1
3,large number people like police officers calle...,0
4,arab dude absolutely right shot extra time sho...,0


In [40]:
# Tokenización
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(clean_text):
    return tokenizer(
        clean_text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
tokenized_data = tokenize_data(df["clean_text"].tolist())



In [41]:
# Crear dataset
class ToxicDataset(Dataset):
    def __init__(self, tokenized_data, labels, augment=False):
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.augment = augment
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }


In [57]:
# Función de entrenamiento y evaluación
def train_one_epoch(model, dataloader, optimizer, device, scheduler, class_weights):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    accumulation_steps = 2
    optimizer.zero_grad()
    
    progress_bar = tqdm(dataloader, desc="Training", total=len(dataloader))
    for i, batch in enumerate(progress_bar):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs.loss
        weighted_loss = loss * class_weights[labels]
        loss = weighted_loss.mean()
        
        loss.backward()
        
        if (i + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        total_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)
        
        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}', 
            'accuracy': f'{correct / total:.4f}'
            })
        
    return total_loss / len(dataloader), correct / total

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            total_loss += loss.item()
            
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += len(labels)
        if total == 0:
            return 0.0, 0.0
        
    return total_loss / len(dataloader), correct / total

        


In [61]:
# Entrenamiento con cross validation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Parámetros
batch_size = 16
num_epochs = 8
learning_rate = 2e-5
patience = 5
dropout_rate = 0.2
weight_decay = 0.01

# Calcular pesos de las clases
class_weights = compute_class_weight(
    'balanced', 
    classes=np.unique(df['IsToxic']),
    y=df['IsToxic']
)
class_weights = torch.tensor(
    class_weights, 
    dtype=torch.float
    ).to(device)

# Verificar la distribución de los datos
print("\nDistribución de clases:")
print(df['IsToxic'].value_counts())

# Preparamos K-Fold
train_idx, val_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    stratify=df['IsToxic'],
    random_state=42
)

print(f"\nTamaño del conjunto de entrenamiento: {len(train_idx)}")
print(f"Tamaño del conjunto de validación: {len(val_idx)}")
    
# Verificar distribución de clases en cada split
train_dist = df.iloc[train_idx]['IsToxic'].value_counts()
val_dist = df.iloc[val_idx]['IsToxic'].value_counts()
print("\nDistribución de clases en entrenamiento:")
print(train_dist)
print("\nDistribución de clases en validación:")
print(val_dist)

# Ajustar batch_size basado en el tamaño de los datasets
actual_batch_size = min(
    batch_size,
    len(train_idx) // 10,
    len(val_idx)
    )
actual_batch_size = max(1, actual_batch_size)  # Asegurar que no sea 0
print(f"Batch size ajustado: {actual_batch_size}")
# Crear data set para este fold
train_dataset = ToxicDataset({
    "input_ids": tokenized_data["input_ids"][train_idx],
    "attention_mask": tokenized_data["attention_mask"][train_idx],
    }, df["IsToxic"].iloc[train_idx].values, augment=True)

val_dataset = ToxicDataset({
    "input_ids": tokenized_data["input_ids"][val_idx],
    "attention_mask": tokenized_data["attention_mask"][val_idx],
    }, df["IsToxic"].iloc[val_idx].values, augment=False)
    
    # Verificar que los datasets no estén vacíos
if len(train_dataset) == 0 or len(val_dataset) == 0:
    print("Error: Dataset vacío")

# Crear dataloaders
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=actual_batch_size, 
    shuffle=True,
    drop_last=False
)
val_dataloader = DataLoader(
    val_dataset, 
    batch_size=actual_batch_size,
    shuffle=False,
    drop_last=False
)

# Inicializar modelo para cada fold
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    hidden_dropout_prob=dropout_rate,
    attention_probs_dropout_prob=dropout_rate,
    classifier_dropout=dropout_rate,
    hidden_act="gelu"
    ).to(device)

# Optimizador
optimizer = AdamW(
    model.parameters(), 
    lr=learning_rate,
    weight_decay=weight_decay,
    eps=1e-8,
    betas=(0.9, 0.999)
    )
# Scheduler
num_training_steps = len(train_dataloader) * num_epochs
num_warmup_steps = num_training_steps // 10
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

#Early stopping setup
best_val_accuracy = 0
patience_counter = 0

# Training loop
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Train
    train_loss, train_accuracy = train_one_epoch(
        model, 
        train_dataloader, 
        optimizer, 
        device, 
        scheduler,
        class_weights
        )
    
    # Evaluate
    val_loss, val_accuracy = evaluate(model, val_dataloader, device)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    
    # Early stopping check
    if abs(train_accuracy - val_accuracy) > 0.1: # 10% de diferencia
        print("Warning: Posible overfitting detectado")
        if patience_counter == 0:
            # Reducir learning rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr'] * 0.5
            print(f"Learning rate reducida a {optimizer.param_groups[0]['lr']:.6f}")

    if val_accuracy > best_val_accuracy:
        current_overfitting = abs(train_accuracy - val_accuracy) * 100
        
        if current_overfitting <= 5:  # Solo guardamos si el overfitting es menor al 5%
            best_val_accuracy = val_accuracy
            best_train_accuracy = train_accuracy
            best_overfitting = current_overfitting
            patience_counter = 0
            
            # Guardar el mejor modelo
            if not os.path.exists('../models'):
                os.makedirs('../models')
            
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_accuracy': train_accuracy,
                'val_accuracy': val_accuracy,
                'overfitting': current_overfitting,
                'epoch': epoch
            }, '../models/toxic_comment_model.pt')
            
            print(f"\nGuardado nuevo mejor modelo:")
            print(f"Accuracy de validación: {val_accuracy:.4f}")
            print(f"Accuracy de entrenamiento: {train_accuracy:.4f}")
            print(f"Overfitting: {current_overfitting:.2f}%")
        else:
            print(f"No se guardó el modelo - Overfitting ({current_overfitting:.2f}%) > 5%")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Al final del entrenamiento (después del loop)
try:
    # Cargar el mejor modelo guardado
    checkpoint = torch.load('../models/toxic_comment_model.pt')
    print("\nMejor modelo guardado:")
    print(f"Accuracy de validación: {checkpoint['val_accuracy']:.4f}")
    print(f"Accuracy de entrenamiento: {checkpoint['train_accuracy']:.4f}")
    print(f"Overfitting: {checkpoint['overfitting']:.2f}%")
except FileNotFoundError:
    print("No se encontró ningún modelo con overfitting < 5%")
# Calcular el porcentaje de overfitting
overfitting_percentage = abs(train_accuracy - val_accuracy) * 100

print(f"\nMejor accuracy del modelo: {best_val_accuracy:.4f}")

# Resultados finales
print("\nResultados finales:")
print(f"Mejor Accuracy: {best_val_accuracy:.4f}")
print(f"Loss final: {val_loss:.4f}")
print(f"Mejor Accuracy de entrenamiento: {best_train_accuracy:.4f}")
print(f"Mejor Accuracy de validación: {best_val_accuracy:.4f}")
print(f"Porcentaje de overfitting: {overfitting_percentage:.2f}%")

# Evaluación del overfitting
if overfitting_percentage > 10:
    print("WARNING: Alto nivel de overfitting detectado!")
elif overfitting_percentage > 5:
    print("AVISO: Nivel moderado de overfitting")
else:
    print("BIEN: Nivel de overfitting aceptable")



Using device: cpu

Distribución de clases:
IsToxic
0    538
1    462
Name: count, dtype: int64

Tamaño del conjunto de entrenamiento: 800
Tamaño del conjunto de validación: 200

Distribución de clases en entrenamiento:
IsToxic
0    430
1    370
Name: count, dtype: int64

Distribución de clases en validación:
IsToxic
0    108
1     92
Name: count, dtype: int64
Batch size ajustado: 16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/8


Training: 100%|██████████| 50/50 [02:00<00:00,  2.41s/it, loss=0.6432, accuracy=0.5238]


Train Loss: 0.6955, Train Accuracy: 0.5238
Validation Loss: 0.6860, Validation Accuracy: 0.5250

Guardado nuevo mejor modelo:
Accuracy de validación: 0.5250
Accuracy de entrenamiento: 0.5238
Overfitting: 0.12%

Epoch 2/8


Training: 100%|██████████| 50/50 [01:57<00:00,  2.36s/it, loss=0.7407, accuracy=0.5425]


Train Loss: 0.6875, Train Accuracy: 0.5425
Validation Loss: 0.6627, Validation Accuracy: 0.7200
Learning rate reducida a 0.000010
No se guardó el modelo - Overfitting (17.75%) > 5%

Epoch 3/8


Training: 100%|██████████| 50/50 [01:56<00:00,  2.33s/it, loss=0.5806, accuracy=0.6238]


Train Loss: 0.6494, Train Accuracy: 0.6238
Validation Loss: 0.6030, Validation Accuracy: 0.7700
Learning rate reducida a 0.000009
No se guardó el modelo - Overfitting (14.62%) > 5%

Epoch 4/8


Training: 100%|██████████| 50/50 [01:54<00:00,  2.30s/it, loss=0.5852, accuracy=0.7100]


Train Loss: 0.5971, Train Accuracy: 0.7100
Validation Loss: 0.5460, Validation Accuracy: 0.7450

Guardado nuevo mejor modelo:
Accuracy de validación: 0.7450
Accuracy de entrenamiento: 0.7100
Overfitting: 3.50%

Epoch 5/8


Training: 100%|██████████| 50/50 [01:58<00:00,  2.37s/it, loss=0.3825, accuracy=0.7612]


Train Loss: 0.5142, Train Accuracy: 0.7612
Validation Loss: 0.5065, Validation Accuracy: 0.7800

Guardado nuevo mejor modelo:
Accuracy de validación: 0.7800
Accuracy de entrenamiento: 0.7612
Overfitting: 1.88%

Epoch 6/8


Training: 100%|██████████| 50/50 [01:56<00:00,  2.34s/it, loss=0.2996, accuracy=0.7562]


Train Loss: 0.4835, Train Accuracy: 0.7562
Validation Loss: 0.5333, Validation Accuracy: 0.7450

Epoch 7/8


Training: 100%|██████████| 50/50 [01:56<00:00,  2.33s/it, loss=0.2532, accuracy=0.8125]


Train Loss: 0.4293, Train Accuracy: 0.8125
Validation Loss: 0.4971, Validation Accuracy: 0.7800

Epoch 8/8


Training: 100%|██████████| 50/50 [01:55<00:00,  2.31s/it, loss=0.4163, accuracy=0.8425]


Train Loss: 0.3654, Train Accuracy: 0.8425
Validation Loss: 0.5292, Validation Accuracy: 0.7550


  checkpoint = torch.load('../models/toxic_comment_model.pt')



Mejor modelo guardado:
Accuracy de validación: 0.7800
Accuracy de entrenamiento: 0.7612
Overfitting: 1.88%

Mejor accuracy del modelo: 0.7800

Resultados finales:
Mejor Accuracy: 0.7800
Loss final: 0.5292
Mejor Accuracy de entrenamiento: 0.7612
Mejor Accuracy de validación: 0.7800
Porcentaje de overfitting: 8.75%
AVISO: Nivel moderado de overfitting


In [60]:
# Guardar el mejor modelo
print(f"Mejor accuracy alcanzado: {best_val_accuracy:.4f}")

# Asegurarnos de que el directorio existe
if not os.path.exists('../models'):
    os.makedirs('../models')

# Calcular overfitting final
final_overfitting = abs(best_train_accuracy - best_val_accuracy) * 100

try:
    if final_overfitting <= 5:  # Solo guardamos si el overfitting es menor al 5%
        # Guardar el modelo completo en formato .pt
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_accuracy': best_train_accuracy,
            'val_accuracy': best_val_accuracy,
            'overfitting': final_overfitting,
            'epoch': epoch
        }, '../models/toxic_comment_model.pt')
        
        # Guardar el tokenizer
        tokenizer.save_pretrained('../models/toxic_comment_tokenizer')
        print("\nModelo y tokenizer guardados exitosamente")
        print(f"Accuracy de validación: {best_val_accuracy:.4f}")
        print(f"Accuracy de entrenamiento: {best_train_accuracy:.4f}")
        print(f"Overfitting: {final_overfitting:.2f}%")
    else:
        print(f"\nNo se guardó el modelo porque el overfitting ({final_overfitting:.2f}%) es mayor al 5%")
        print("Considera reentrenar el modelo con diferentes hiperparámetros")

except Exception as e:
    print(f"Error al guardar el modelo: {e}")

# Verificar que se puede cargar el modelo si fue guardado
if final_overfitting <= 5:
    try:
        # Cargar el modelo para verificar
        checkpoint = torch.load('../models/toxic_comment_model.pt')
        model.load_state_dict(checkpoint['model_state_dict'])
        print("Modelo verificado correctamente")
    except Exception as e:
        print(f"Error al verificar el modelo: {e}")



Mejor accuracy alcanzado: 0.7350

No se guardó el modelo porque el overfitting (10.13%) es mayor al 5%
Considera reentrenar el modelo con diferentes hiperparámetros
