# Detección de toxicidad en comentarios de YouTube. Modelado avanzado

Este notebook implementa un pipeline completo que incluye:
- Data Augmentation con EDA
- Modelos baseline mejorados
- VotingClassifier (ensemble)
- Fine-tuning de DistilBERT
- Evaluación detallada con F1, ROC-AUC y curvas PR

In [None]:
# Download eda.py (the EDA text-augmentation script) into the working
# directory; a later cell loads it from the relative path "eda.py".
# NOTE(review): the URL points at the repository's master branch (unpinned),
# so the downloaded code may change between runs.
!wget https://raw.githubusercontent.com/jasonwei20/eda_nlp/master/code/eda.py

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import torch

# Load the downloaded eda.py as a module named "eda" straight from its file
# path, so it does not need to be on sys.path.
import importlib.util
import sys
spec = importlib.util.spec_from_file_location("eda", "eda.py")
eda_module = importlib.util.module_from_spec(spec)
# Register the module in sys.modules before executing it, so any later
# `import eda` resolves to this same object.
sys.modules["eda"] = eda_module
spec.loader.exec_module(eda_module)

# Download the NLTK resources ('wordnet', 'omw-1.4', 'stopwords').
# NOTE(review): presumably required by eda.py's synonym lookup — confirm.
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

In [None]:
# ===========================================
# FUNCIONES DE DATA AUGMENTATION
# ===========================================

def eda_pipeline(sentence, num_aug=4, alpha=0.1):
    """Generate up to ``num_aug`` augmented variants of ``sentence`` with EDA.

    The four operations of the upstream ``eda.py`` module (synonym
    replacement, random insertion, random swap, random deletion) are applied
    round-robin to the whitespace-split words of the input.  Per upstream
    ``eda.py``, every call returns ONE augmented sentence as a list of words,
    so each result is stored as a single candidate and joined back into a
    string (extending the candidate list with it would turn every word into
    its own "sentence").

    Args:
        sentence: Text to augment; converted with ``str()`` before splitting.
        num_aug: Maximum number of augmented sentences to return.
        alpha: Strength of each operation.  Replacement, insertion and swap
            edit ``max(1, int(alpha * n_words))`` words; random deletion
            receives ``alpha`` as its per-word deletion probability (passing
            an integer >= 1 there would delete every word).

    Returns:
        A list of non-empty strings.  Falls back to ``[sentence]`` (the
        original object, untouched) when the input has fewer than two words,
        when no variant could be produced, or on an unexpected error.
    """
    try:
        words = str(sentence).split()
        if len(words) < 2:  # Too short to augment meaningfully
            return [sentence]

        n_changes = max(1, int(alpha * len(words)))
        operations = (
            ("synonym_replacement", n_changes),
            ("random_insertion", n_changes),
            ("random_swap", n_changes),
            ("random_deletion", alpha),
        )
        # Ceiling division: enough rounds to yield at least num_aug candidates.
        rounds = max(1, -(-num_aug // len(operations)))

        candidates = []
        failures = {}
        for _ in range(rounds):
            for op_name, op_arg in operations:
                # Best effort: one failing technique must not block the others,
                # but the failure is reported below instead of being swallowed.
                try:
                    operation = getattr(eda_module, op_name)
                    candidates.append(operation(words.copy(), op_arg))
                except Exception as exc:
                    failures[op_name] = exc

        for op_name, exc in failures.items():
            print(f"Error en EDA ({op_name}): {exc}")

        # Join word lists back into text and discard empty results.
        result = []
        for candidate in candidates:
            if isinstance(candidate, list):
                text = " ".join(candidate)
            else:
                text = str(candidate)
            if text.strip():
                result.append(text)

        result = result[:max(num_aug, 0)]
        return result if result else [sentence]

    except Exception as e:
        print(f"Error en EDA: {e}")
        return [sentence]

def apply_eda_safe(df, label_column='IsToxic', text_column='text_processed', num_aug=3):
    """Oversample the positive class of ``df`` with EDA-augmented texts.

    Every row whose ``label_column`` equals 1 and whose ``text_column`` is
    neither missing nor blank is passed to ``eda_pipeline``.  Each returned
    variant that is non-empty and differs from its source text becomes a new
    row with label 1.  Variants identical to the source (for example the
    ``[sentence]`` fallback of ``eda_pipeline``) are skipped so that exact
    duplicates are not counted as augmentations.

    Args:
        df: Input DataFrame; it is not modified.
        label_column: Name of the label column.
        text_column: Name of the text column to augment.
        num_aug: Maximum number of variants requested per source text.

    Returns:
        A new DataFrame with the augmented rows appended (they carry only
        ``text_column`` and ``label_column``; all other columns are NaN), or
        ``df`` itself when nothing was generated or an error occurred.
    """
    try:
        df_minority = df[df[label_column] == 1]
        augmented_rows = []
        failed_rows = 0

        print(f"üìä Aplicando EDA a {len(df_minority)} muestras t√≥xicas...")

        for raw_text in df_minority[text_column]:
            try:
                # Skip missing or blank source texts.
                if pd.isna(raw_text):
                    continue
                original = str(raw_text)
                if not original.strip():
                    continue

                for variant in eda_pipeline(original, num_aug=num_aug):
                    variant = str(variant)
                    # Keep only real augmentations: non-empty and different
                    # from the text they were derived from.
                    if variant.strip() and variant.strip() != original.strip():
                        augmented_rows.append({
                            text_column: variant,
                            label_column: 1
                        })

            except Exception:
                # Best effort per row, but keep count so failures are visible.
                failed_rows += 1

        if failed_rows:
            print(f"Error en EDA: {failed_rows} filas no se pudieron aumentar")

        if augmented_rows:
            successful_augmentations = len(augmented_rows)
            df_augmented = pd.DataFrame(augmented_rows)
            df_final = pd.concat([df, df_augmented], ignore_index=True)
            print(f"‚úÖ EDA completado. {successful_augmentations} textos aumentados generados")
        else:
            print("‚ö†Ô∏è No se generaron textos aumentados, continuando con dataset original")
            df_final = df

        print(f"üìä Dataset final: {len(df_final)} muestras")
        return df_final

    except Exception as e:
        print(f"‚ùå Error en EDA: {str(e)}")
        print("Continuando sin data augmentation...")
        return df

In [None]:
# ===========================================
# FUNCIONES DE ENSEMBLE
# ===========================================

def train_voting_ensemble(df, label_column='IsToxic', text_column='text_processed'):
    """Train and evaluate a soft-voting ensemble on TF-IDF features.

    The texts are split 80/20 (stratified, ``random_state=42``) BEFORE
    vectorization.  The TF-IDF vocabulary and IDF weights are then fitted on
    the training texts only and merely applied to the test texts, so no
    information from the held-out set leaks into the features.  The ensemble
    combines logistic regression, a random forest and a linear SVM, all with
    balanced class weights.

    Args:
        df: DataFrame holding the text and label columns.
        label_column: Name of the binary label column.
        text_column: Name of the text column; missing values become ``''``.

    Returns:
        ``(voting, tfidf)``: the fitted ``VotingClassifier`` and the
        ``TfidfVectorizer`` fitted on the training split.  On any error the
        traceback is printed and ``(None, None)`` is returned.
    """
    try:
        print("üéØ Preparando datos para VotingClassifier...")

        X = df[text_column].fillna('').astype(str)
        y = df[label_column]

        print("üìä Aplicando vectorizaci√≥n TF-IDF...")

        # Split the raw texts first so the vectorizer never sees the test set.
        X_train_text, X_test_text, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

        tfidf = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            stop_words='english',
            min_df=2,
            max_df=0.95
        )
        X_train = tfidf.fit_transform(X_train_text)
        X_test = tfidf.transform(X_test_text)

        print(f"üìà Datos de entrenamiento: {X_train.shape[0]} muestras")
        print(f"üìà Datos de prueba: {X_test.shape[0]} muestras")

        # Base classifiers; probability=True lets the SVM take part in soft voting.
        clf1 = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
        clf2 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
        clf3 = SVC(kernel='linear', probability=True, class_weight='balanced', random_state=42)

        voting = VotingClassifier(
            estimators=[
                ('lr', clf1),
                ('rf', clf2),
                ('svm', clf3)
            ],
            voting='soft'
        )

        print("üöÄ Entrenando VotingClassifier...")
        voting.fit(X_train, y_train)

        # Hard predictions for the report, class-1 probabilities for AUC / AP.
        y_pred = voting.predict(X_test)
        y_prob = voting.predict_proba(X_test)[:, 1]

        print("\nüìä RESULTADOS VotingClassifier:")
        print("="*50)
        print(classification_report(y_test, y_pred))
        print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")
        print(f"Average Precision: {average_precision_score(y_test, y_prob):.4f}")

        return voting, tfidf

    except Exception as e:
        print(f"‚ùå Error en VotingClassifier: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None

In [None]:
# ===========================================
# FUNCIONES DE FINE-TUNING
# ===========================================

def _epoch_loss_table(log_history):
    """Return ``[(epoch, train_loss, eval_loss), ...]`` from a Trainer log history.

    Relies on the Hugging Face ``Trainer`` logging convention: periodic
    training logs carry a ``'loss'`` key, evaluation logs carry
    ``'eval_loss'``, and both carry a fractional ``'epoch'``.  Training losses
    are averaged per epoch; ``train_loss`` is ``None`` for an epoch in which
    no training loss was logged.  Only the first evaluation per epoch is kept,
    so an extra evaluation run after training does not overwrite the last
    epoch's value.
    """
    import math

    train_by_epoch = {}
    eval_by_epoch = {}
    for entry in log_history:
        if 'epoch' not in entry:
            continue
        # Map fractional epochs such as 0.26 or 1.0 to their 1-based epoch.
        epoch = max(1, math.ceil(entry['epoch'] - 1e-9))
        if 'loss' in entry:
            train_by_epoch.setdefault(epoch, []).append(entry['loss'])
        elif 'eval_loss' in entry:
            eval_by_epoch.setdefault(epoch, entry['eval_loss'])

    table = []
    for epoch in sorted(eval_by_epoch):
        losses = train_by_epoch.get(epoch)
        train_loss = sum(losses) / len(losses) if losses else None
        table.append((epoch, train_loss, eval_by_epoch[epoch]))
    return table


def fine_tune_distilbert(df, label_column='IsToxic', text_column='text_cleaned'):
    """Fine-tune ``distilbert-base-uncased`` for binary classification.

    Rows whose text is missing or blank are dropped (and reported) instead of
    being trained on as empty strings.  The remaining data is split 80/20
    (stratified, ``random_state=42``), tokenized to at most 256 tokens and
    trained for 2 epochs with evaluation and checkpointing at every epoch; the
    checkpoint with the lowest ``eval_loss`` is reloaded at the end.  Final
    metrics and a simple train/eval loss-gap report are printed.

    Args:
        df: DataFrame holding the text and label columns; it is not modified.
        label_column: Name of the label column (cast to ``int``).
        text_column: Name of the text column fed to the tokenizer.

    Returns:
        ``(trainer, eval_output)``: the ``Trainer`` and the metrics dict from
        the final evaluation.  On any error the traceback is printed and
        ``(None, None)`` is returned.
    """
    try:
        print("ü§ñ Iniciando fine-tuning de DistilBERT...")

        # Build texts and labels without adding columns to the caller's frame.
        text_series = df[text_column].fillna('').astype(str)
        has_text = (text_series.str.strip() != '').to_numpy()
        dropped = int((~has_text).sum())
        if dropped:
            print(f"Se descartaron {dropped} filas sin texto en '{text_column}'")
        texts = text_series[has_text].tolist()
        labels = df[label_column][has_text].astype(int).tolist()

        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

        train_texts, val_texts, train_labels, val_labels = train_test_split(
            texts, labels, test_size=0.2, stratify=labels, random_state=42
        )

        print(f"üìä Datos de entrenamiento: {len(train_texts)} muestras")
        print(f"üìä Datos de validaci√≥n: {len(val_texts)} muestras")

        print("üî§ Tokenizando textos...")
        train_encodings = tokenizer(
            train_texts,
            truncation=True,
            padding=True,
            max_length=256  # Kept short to reduce memory use
        )
        val_encodings = tokenizer(
            val_texts,
            truncation=True,
            padding=True,
            max_length=256
        )

        train_dataset = Dataset.from_dict({**train_encodings, 'label': train_labels})
        val_dataset = Dataset.from_dict({**val_encodings, 'label': val_labels})

        model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=2
        )

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=2,  # Kept low for CPU runs
            per_device_train_batch_size=8,  # Kept low for CPU runs
            per_device_eval_batch_size=16,
            learning_rate=2e-5,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir='./logs',
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            save_total_limit=1,
            weight_decay=0.01,
            logging_steps=50,
            warmup_steps=100,
            remove_unused_columns=False,
            report_to=[],  # No external experiment trackers (e.g. wandb)
            dataloader_pin_memory=False,
            run_name="toxicity_detection_run"
        )

        def compute_metrics(eval_pred):
            """Compute weighted F1, ROC-AUC and average precision from logits."""
            logits, labels = eval_pred
            preds = np.argmax(logits, axis=-1)

            f1 = f1_score(labels, preds, average='weighted')

            # ROC-AUC and average precision need the class-1 probability.
            probs = torch.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()
            auc = roc_auc_score(labels, probs)
            avgp = average_precision_score(labels, probs)

            return {
                "f1": f1,
                "roc_auc": auc,
                "avg_precision": avgp
            }

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        print("üöÄ Entrenando modelo...")
        train_output = trainer.train()

        print("üìä Evaluando modelo...")
        eval_output = trainer.evaluate()

        print("\nüìà RESULTADOS FINALES DistilBERT:")
        print("="*50)
        print(f"F1-score: {eval_output['eval_f1']:.4f}")
        print(f"ROC-AUC: {eval_output['eval_roc_auc']:.4f}")
        print(f"Avg Precision: {eval_output['eval_avg_precision']:.4f}")
        print(f"Loss: {eval_output['eval_loss']:.4f}")

        print("\nüîç AN√ÅLISIS DE OVERFITTING:")
        print("="*50)

        # Headline gap: average training loss over the whole run versus the
        # loss from the final evaluation.
        final_train_loss = train_output.training_loss
        final_eval_loss = eval_output['eval_loss']
        loss_gap = final_eval_loss - final_train_loss

        print(f"üìä Train Loss: {final_train_loss:.4f}")
        print(f"üìä Eval Loss: {final_eval_loss:.4f}")
        print(f"üìä Gap (Eval - Train): {loss_gap:.4f}")

        if loss_gap > 0.3:
            print("üö® OVERFITTING ALTO - Gap > 0.3")
            print("   Recomendaciones:")
            print("   - Reducir epochs o learning rate")
            print("   - Aumentar regularizaci√≥n (weight_decay)")
            print("   - Usar m√°s datos de entrenamiento")
        elif loss_gap > 0.1:
            print("‚ö†Ô∏è OVERFITTING MODERADO - Gap > 0.1")
            print("   Recomendaciones:")
            print("   - Monitorear m√°s de cerca")
            print("   - Considerar early stopping")
        else:
            print("‚úÖ OVERFITTING BAJO - Gap <= 0.1")
            print("   El modelo generaliza bien")

        # Per-epoch evolution uses the periodic 'loss' logs; the single
        # 'train_loss' summary entry is a run average, not a per-epoch value.
        print(f"\nüìà EVOLUCI√ìN DEL LOSS:")
        for epoch, train_loss, eval_loss in _epoch_loss_table(trainer.state.log_history):
            if train_loss is None:
                print(f"   Epoch {epoch}: Train=n/a, Eval={eval_loss:.4f}")
            else:
                gap = eval_loss - train_loss
                print(f"   Epoch {epoch}: Train={train_loss:.4f}, Eval={eval_loss:.4f}, Gap={gap:.4f}")

        return trainer, eval_output

    except Exception as e:
        print(f"‚ùå Error en fine-tuning: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None

In [None]:
# ===========================================
# FUNCIONES DE UTILIDAD
# ===========================================

def check_gpu_memory():
    """Print the name, total memory and free memory of CUDA device 0.

    The free figure comes from ``torch.cuda.mem_get_info``, which reports the
    device's free memory.  ``torch.cuda.memory_reserved`` is the memory held
    by PyTorch's caching allocator and is not a measure of free memory.
    Prints a notice instead when CUDA is not available.
    """
    if not torch.cuda.is_available():
        print("‚ö†Ô∏è GPU no disponible, usando CPU")
        return

    free_bytes, _total_bytes = torch.cuda.mem_get_info(0)
    print(f"üî• GPU disponible: {torch.cuda.get_device_name(0)}")
    print(f"Memoria total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    print(f"Memoria libre: {free_bytes / 1024**3:.1f} GB")

def clear_memory():
    """Run the garbage collector, then release cached CUDA memory.

    Collection happens first so that tensors kept alive only by reference
    cycles are freed before ``torch.cuda.empty_cache()`` runs; emptying the
    cache first would leave their blocks cached.
    """
    import gc
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("üßπ Memoria limpiada")

In [None]:
# ===========================================
# PIPELINE PRINCIPAL
# ===========================================

def run_complete_pipeline(df_path="dataset_processed_complete.csv"):
    """Run data loading, augmentation, the voting ensemble and DistilBERT.

    Steps: report GPU status, load the CSV, make sure the required columns
    exist (falling back to a ``text`` column for the two text columns),
    augment the positive class, train the TF-IDF voting ensemble, and, when
    more than 100 rows are available, fine-tune DistilBERT.

    The two models read different text columns, and augmented rows only carry
    the column they were generated from.  Each model therefore gets its own
    augmented frame; reusing the ``text_processed`` augmentation for
    DistilBERT would feed it rows with no ``text_cleaned``.

    NOTE(review): augmentation happens before the train/test splits performed
    inside the training functions, so variants of one comment can land on
    both sides of a split; reported metrics may be optimistic.

    Args:
        df_path: Path of the CSV file to load.

    Returns:
        ``(voting_model, tfidf_vectorizer, trainer, bert_results)``.  The last
        two are ``None`` when fine-tuning is skipped; all four are ``None``
        when the pipeline fails (the traceback is printed).
    """
    try:
        check_gpu_memory()

        print("üìÅ Cargando dataset...")
        df = pd.read_csv(df_path)
        print(f"Dataset cargado: {len(df)} filas")

        print(f"üìä Distribuci√≥n de clases:")
        print(df['IsToxic'].value_counts())

        required_columns = ['IsToxic', 'text_processed', 'text_cleaned']
        missing_columns = [col for col in required_columns if col not in df.columns]

        if missing_columns:
            print(f"‚ö†Ô∏è Columnas faltantes: {missing_columns}")
            print("Columnas disponibles:", df.columns.tolist())

            # Fall back to the raw 'text' column where possible.
            if 'text_processed' not in df.columns and 'text' in df.columns:
                df['text_processed'] = df['text']
                print("‚úÖ Usando 'text' como 'text_processed'")

            if 'text_cleaned' not in df.columns and 'text' in df.columns:
                df['text_cleaned'] = df['text']
                print("‚úÖ Usando 'text' como 'text_cleaned'")

            # Fail fast with a clear message instead of a late KeyError.
            still_missing = [col for col in required_columns if col not in df.columns]
            if still_missing:
                raise KeyError(f"Columnas requeridas ausentes: {still_missing}")

        print("\nüîÑ Aplicando Data Augmentation...")
        df_aug = apply_eda_safe(df, text_column='text_processed', num_aug=2)

        print("\nüéØ Entrenando VotingClassifier...")
        voting_model, tfidf_vectorizer = train_voting_ensemble(df_aug, text_column='text_processed')

        # Only fine-tune when there is enough data.
        if len(df_aug) > 100:
            print("\nü§ñ Iniciando fine-tuning de DistilBERT...")
            # Augment the column DistilBERT actually reads.
            df_aug_bert = apply_eda_safe(df, text_column='text_cleaned', num_aug=2)
            trainer, bert_results = fine_tune_distilbert(df_aug_bert, text_column='text_cleaned')
            return voting_model, tfidf_vectorizer, trainer, bert_results
        else:
            print("‚ö†Ô∏è Dataset muy peque√±o, saltando fine-tuning de BERT")
            return voting_model, tfidf_vectorizer, None, None

    except Exception as e:
        print(f"‚ùå Error en pipeline: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None, None, None

In [None]:
# Run the complete pipeline and report the outcome.

# Disable Weights & Biases logging through its environment switch.
import os
os.environ.update({"WANDB_DISABLED": "true"})

print("üöÄ Iniciando pipeline de detecci√≥n de toxicidad...")
print("=" * 60)

voting_model, tfidf_vectorizer, trainer, bert_results = run_complete_pipeline()

# A missing voting model means the pipeline failed.
if voting_model is None:
    print("\n‚ùå Pipeline fall√≥. Revisa los errores arriba.")
else:
    print("\nüéâ Pipeline completado exitosamente!")
    print("‚úÖ VotingClassifier entrenado")
    if trainer is not None:
        print("‚úÖ DistilBERT fine-tuneado")