# 🧼 Preprocesamiento y limpieza de texto

Este notebook implementa un pipeline completo de preprocesamiento de texto,
diseñado específicamente para comentarios de YouTube y basado en el EDA realizado.

Basado en el análisis previo:
- 1,000 comentarios con longitud promedio de 186 caracteres
- 33.8 palabras promedio por comentario
- 46.2% de comentarios tóxicos
- Necesidad de limpieza específica para comentarios de redes sociales

## Funciones principales del preprocesamiento

### Librerías

In [None]:
import pandas as pd
import numpy as np
import re
import string
import warnings
warnings.filterwarnings('ignore')

# Librerías para procesamiento de texto
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag

# Para manejo de URLs y HTML
from urllib.parse import urlparse
import html

# Para visualización del progreso
from tqdm import tqdm
tqdm.pandas()

# Configuración de pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

### Descargas necesarias de NLTK

In [None]:
def download_nltk_resources():
    """Download the NLTK resources used by this notebook.

    By default ``nltk.download`` signals a failed download through its
    boolean return value rather than by raising, and with ``quiet=True`` it
    prints nothing. The return value is therefore checked explicitly so a
    failed download is not reported as a success. Exceptions are still
    caught so one bad resource never stops the remaining ones from being
    attempted.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    nltk_downloads = [
        'punkt', 'punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger',
        'maxent_ne_chunker', 'words', 'omw-1.4', 'averaged_perceptron_tagger_eng'
    ]

    print("📥 Descargando recursos de NLTK...")
    failed = []
    for resource in nltk_downloads:
        try:
            succeeded = nltk.download(resource, quiet=True)
        except Exception as e:
            print(f"⚠️ No se pudo descargar {resource}: {str(e)}")
            failed.append(resource)
            continue

        if succeeded:
            print(f"✅ {resource} descargado")
        else:
            # The downloader swallowed the error and returned a falsy value.
            print(f"⚠️ No se pudo descargar {resource}")
            failed.append(resource)

    if failed:
        print(f"⚠️ Recursos de NLTK no disponibles: {', '.join(failed)}")
    else:
        print("✅ Recursos de NLTK listos")


# Run the downloads when the cell is executed.
download_nltk_resources()

### Configuración inicial

Antes de ejecutar la siguiente celda, es importante haber instalado spaCy:
```bash
pip install spacy
```

Una vez instalado, es necesario descargar el modelo de idioma en inglés que se usa en este pipeline. Para ello, ejecutar:

```bash
python -m spacy download en_core_web_sm
```

In [None]:
# Load the spaCy English model. If loading fails for any ordinary reason the
# notebook falls back to NLTK-only processing. `except Exception` (instead of
# a bare `except`) lets KeyboardInterrupt / SystemExit propagate.
try:
    nlp = spacy.load("en_core_web_sm")
    SPACY_AVAILABLE = True
    print("✅ SpaCy modelo cargado correctamente")
except Exception:
    # Keep `nlp` defined so later references never raise NameError.
    nlp = None
    SPACY_AVAILABLE = False
    print("⚠️ SpaCy no disponible, usando solo NLTK")

# Initialise the NLTK tools
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Load the stopword list, falling back to a small built-in set on failure
try:
    stop_words = set(stopwords.words('english'))
    print("✅ Stopwords cargadas correctamente")
except Exception:
    # Basic stopwords used when the NLTK corpus cannot be loaded
    stop_words = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", 
                 "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 
                 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 
                 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
                 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 
                 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
                 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 
                 'while', 'of', 'at', 'by', 'for', 'with', 'through', 'during', 'before', 'after', 
                 'above', 'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 
                 'further', 'then', 'once'}
    print("⚠️ Usando stopwords básicas")

### Funciones de limpieza

In [None]:
def clean_html_entities(text):
    """Decode HTML entities found in *text*.

    Missing values (``None``/``NaN``) produce an empty string. The text is
    first decoded with ``html.unescape``; afterwards a handful of common
    entities are replaced literally, in the order listed below, which also
    resolves entities that were still present after the first decode.
    """
    if pd.isna(text):
        return ""

    decoded = html.unescape(text)

    # Literal second pass; the order of the pairs is significant.
    leftover_entities = (
        ('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>'), ('&quot;', '"'),
        ('&#39;', "'"), ('&nbsp;', ' '), ('&copy;', ''), ('&reg;', ''),
    )
    for entity, replacement in leftover_entities:
        decoded = decoded.replace(entity, replacement)

    return decoded

In [None]:
def remove_urls(text):
    """Replace URLs and bare domain names in *text* with `` [URL] ``.

    Missing values (``None``/``NaN``) produce an empty string. Three
    case-insensitive patterns are applied one after another: ``http(s)://``
    links, ``www.`` links, and dotted domain-like names.
    """
    if pd.isna(text):
        return ""

    url_regexes = tuple(
        re.compile(expression, re.IGNORECASE)
        for expression in (
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}',
        )
    )

    result = text
    for regex in url_regexes:
        result = regex.sub(' [URL] ', result)
    return result

In [None]:
def clean_social_media_artifacts(text):
    """Normalise social-media specific artefacts in *text*.

    Steps, in order:
      1. ``@user`` mentions become `` [USER] ``.
      2. Hashtags keep their word but lose the ``#``.
      3. Runs of ``!`` / ``?`` collapse to one character and runs of three or
         more dots collapse to ``...``.
      4. Any character repeated four or more times is reduced to two
         occurrences (e.g. "sooooo" -> "soo").

    Punctuation is normalised *before* the generic repeated-character rule;
    otherwise a run of four or more dots would be cut to ``..`` while three
    dots stayed ``...``, giving two different spellings of the same ellipsis.

    Args:
        text: Raw comment text, or a missing value (``None``/``NaN``).

    Returns:
        The normalised string, or ``""`` for a missing value.
    """
    if pd.isna(text):
        return ""

    # User mentions
    text = re.sub(r'@[A-Za-z0-9_]+', ' [USER] ', text)

    # Hashtags: keep the content, drop the '#'
    text = re.sub(r'#([A-Za-z0-9_]+)', r' \1 ', text)

    # Repeated punctuation marks
    text = re.sub(r'[!]{2,}', '!', text)
    text = re.sub(r'[?]{2,}', '?', text)
    text = re.sub(r'[.]{3,}', '...', text)

    # Excessive character repetition (4+ in a row -> 2), e.g. "sooooo" -> "soo".
    # A normalised "..." has only three dots, so it is left untouched here.
    text = re.sub(r'(.)\1{3,}', r'\1\1', text)

    return text

In [None]:
def remove_emojis_and_special_chars(text):
    """Strip emojis, non-ASCII characters and unusual symbols from *text*.

    Missing values (``None``/``NaN``) produce an empty string. Emoji runs are
    replaced by `` [EMOJI] ``, every remaining non-ASCII run by one space,
    and finally any character that is not a word character, whitespace or
    one of ``. ! ? , ; : ' " - ( )`` by a space.
    """
    if pd.isna(text):
        return ""

    # Unicode ranges treated as emoji; joined into a single character class.
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",
    )
    emoji_pattern = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    without_emojis = emoji_pattern.sub(' [EMOJI] ', text)

    # Drop everything outside the ASCII range
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', without_emojis)

    # Remove special characters but keep basic punctuation
    return re.sub(r'[^\w\s\.\!\?\,\;\:\'\"\-\(\)]', ' ', ascii_only)

In [None]:
def standardize_contractions(text):
    """Expand common English contractions in *text*.

    Missing values (``None``/``NaN``) produce an empty string. Each
    contraction is matched case-insensitively, without word boundaries, and
    replaced by its lowercase expansion; the replacements run sequentially
    in the order of the table below.
    """
    if pd.isna(text):
        return ""

    expansions = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot",
        "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
        "don't": "do not", "hadn't": "had not", "hasn't": "has not",
        "haven't": "have not", "he'd": "he would", "he'll": "he will",
        "he's": "he is", "i'd": "i would", "i'll": "i will",
        "i'm": "i am", "i've": "i have", "isn't": "is not",
        "it'd": "it would", "it'll": "it will", "it's": "it is",
        "let's": "let us", "shouldn't": "should not", "that's": "that is",
        "there's": "there is", "they'd": "they would", "they'll": "they will",
        "they're": "they are", "they've": "they have", "we'd": "we would",
        "we're": "we are", "we've": "we have", "weren't": "were not",
        "what's": "what is", "where's": "where is", "who's": "who is",
        "won't": "will not", "wouldn't": "would not", "you'd": "you would",
        "you'll": "you will", "you're": "you are", "you've": "you have"
    }

    expanded = text
    for short_form in expansions:
        matcher = re.compile(re.escape(short_form), re.IGNORECASE)
        expanded = matcher.sub(expansions[short_form], expanded)

    return expanded

In [None]:
def clean_whitespace(text):
    """Collapse whitespace runs in *text* and trim both ends.

    Missing values (``None``/``NaN``) produce an empty string.
    """
    if pd.isna(text):
        return ""

    # Every whitespace run becomes a single space, then the edges are trimmed.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
def apply_basic_cleaning(text):
    """Run the basic cleaning pipeline on *text*.

    Missing values (``None``/``NaN``) produce an empty string. Otherwise the
    text is lowercased and passed through each cleaning function in the
    order listed below.
    """
    if pd.isna(text):
        return ""

    cleaned = text.lower()

    # Cleaning steps, applied in this exact order.
    cleaning_steps = (
        clean_html_entities,
        remove_urls,
        clean_social_media_artifacts,
        remove_emojis_and_special_chars,
        standardize_contractions,
        clean_whitespace,
    )
    for step in cleaning_steps:
        cleaned = step(cleaned)

    return cleaned

### Tokenización y filtrado

In [None]:
def advanced_tokenize(text, remove_stopwords=True, min_word_length=2):
    """Tokenize *text* and drop uninformative tokens.

    Args:
        text: Text to tokenize; a missing value or ``""`` yields ``[]``.
        remove_stopwords: When true, tokens found in the module-level
            ``stop_words`` set are discarded.
        min_word_length: Tokens shorter than this are discarded.

    Returns:
        A list of lowercase tokens, excluding stopwords (optional), short
        tokens, tokens made up only of punctuation, and all-digit tokens.
    """
    if pd.isna(text) or text == "":
        return []

    punctuation_chars = set(string.punctuation)

    filtered_tokens = []
    for token in word_tokenize(text.lower()):
        # Skip stopwords (optional)
        if remove_stopwords and token in stop_words:
            continue

        # Skip tokens that are too short
        if len(token) < min_word_length:
            continue

        # Skip punctuation-only tokens. `token in string.punctuation` would be
        # a substring test and so would miss multi-character tokens such as
        # "...", "--", "``" or "''"; check every character instead.
        if all(char in punctuation_chars for char in token):
            continue

        # Skip purely numeric tokens
        if token.isdigit():
            continue

        filtered_tokens.append(token)

    return filtered_tokens

In [None]:
def lemmatize_tokens(tokens):
    """Lemmatize each token with the module-level ``lemmatizer``.

    Args:
        tokens: Sequence of token strings; an empty or ``None`` value
            yields ``[]``.

    Returns:
        A list of the same length as *tokens*. A token whose lemmatization
        raises is kept unchanged (best effort).
    """
    if not tokens:
        return []

    lemmatized = []
    for token in tokens:
        try:
            lemmatized.append(lemmatizer.lemmatize(token))
        except Exception:
            # Not a bare `except`: KeyboardInterrupt / SystemExit must still
            # be able to stop a long-running batch.
            lemmatized.append(token)

    return lemmatized

In [None]:
def stem_tokens(tokens):
    """Stem each token with the module-level ``stemmer``.

    Args:
        tokens: Sequence of token strings; an empty or ``None`` value
            yields ``[]``.

    Returns:
        A list of the same length as *tokens*. A token whose stemming raises
        is kept unchanged (best effort).
    """
    if not tokens:
        return []

    stemmed = []
    for token in tokens:
        try:
            stemmed.append(stemmer.stem(token))
        except Exception:
            # Not a bare `except`: KeyboardInterrupt / SystemExit must still
            # be able to stop a long-running batch.
            stemmed.append(token)

    return stemmed

In [None]:
def spacy_process(text):
    """Analyse *text* with the module-level spaCy pipeline ``nlp``.

    Args:
        text: Text to analyse, or a missing value (``None``/``NaN``).

    Returns:
        A dict with the keys ``tokens``, ``lemmas``, ``pos_tags`` (pairs of
        token text and coarse POS tag) and ``entities`` (pairs of entity text
        and label). All four lists are empty when spaCy is unavailable, the
        text is missing, or processing raises.
    """
    if not SPACY_AVAILABLE or pd.isna(text):
        return {'tokens': [], 'lemmas': [], 'pos_tags': [], 'entities': []}

    try:
        doc = nlp(text)

        # Filter out whitespace tokens once and reuse the result.
        words = [token for token in doc if not token.is_space]

        return {
            'tokens': [token.text for token in words],
            'lemmas': [token.lemma_ for token in words],
            'pos_tags': [(token.text, token.pos_) for token in words],
            'entities': [(ent.text, ent.label_) for ent in doc.ents]
        }
    except Exception:
        # Best effort: ordinary failures give an empty analysis, while
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return {'tokens': [], 'lemmas': [], 'pos_tags': [], 'entities': []}

### Detección de patrones

In [None]:
def detect_toxic_patterns(text):
    """Compute simple keyword and punctuation based toxicity indicators.

    Missing values (``None``/``NaN``) produce an empty dict. Keyword counts
    are computed on the lowercased text; the capitalisation ratio and the
    punctuation counts use the text as given.
    """
    if pd.isna(text):
        return {}

    lowered = text.lower()

    def count_in_lowered(pattern):
        """Number of matches of *pattern* in the lowercased text."""
        return len(re.findall(pattern, lowered))

    uppercase_chars = len([char for char in text if char.isupper()])

    return {
        'profanity_count': count_in_lowered(r'\b(fuck|shit|damn|bitch|ass|crap)\b'),
        # max(..., 1) guards against division by zero on empty text
        'caps_ratio': uppercase_chars / max(len(text), 1),
        'exclamation_count': text.count('!'),
        'question_count': text.count('?'),
        'racial_keywords': count_in_lowered(r'\b(black|white|race|racist|racial)\b'),
        'threat_keywords': count_in_lowered(r'\b(kill|die|death|hurt|violence|attack)\b'),
        'has_slurs': bool(re.search(r'\b(nigger|faggot|retard|whore|slut)\b', lowered)),
        'repeated_chars': count_in_lowered(r'(.)\1{2,}'),
        'multiple_punctuation': len(re.findall(r'[!?]{2,}', text))
    }

In [None]:
def extract_text_features(text):
    """Extract simple numeric descriptors from *text*.

    Missing values (``None``/``NaN``) produce an empty dict. Words are the
    whitespace-separated pieces of the text; sentences are counted with
    NLTK's ``sent_tokenize``.
    """
    if pd.isna(text):
        return {}

    words = text.split()
    distinct_lowercase_words = set(text.lower().split())

    if words:
        mean_word_length = np.mean([len(word) for word in words])
    else:
        mean_word_length = 0

    return {
        'char_count': len(text),
        'word_count': len(words),
        'sentence_count': len(sent_tokenize(text)),
        'avg_word_length': mean_word_length,
        'unique_words': len(distinct_lowercase_words),
        # max(..., 1) guards against division by zero on empty text
        'lexical_diversity': len(distinct_lowercase_words) / max(len(words), 1)
    }

## Pipeline principal

In [None]:
class TextPreprocessor:
    """Configurable text preprocessing pipeline.

    Combines basic cleaning, tokenization, optional lemmatization and
    stemming, and optional feature extraction, for single texts or for a
    whole DataFrame column.
    """

    def __init__(self, 
                 apply_stemming=False,
                 apply_lemmatization=True,
                 remove_stopwords=True,
                 min_word_length=2,
                 extract_features=True):
        """Store the pipeline options.

        Args:
            apply_stemming: Stem the tokens (after lemmatization, if both
                are enabled).
            apply_lemmatization: Lemmatize the tokens.
            remove_stopwords: Drop stopwords during tokenization.
            min_word_length: Minimum token length kept by the tokenizer.
            extract_features: Also compute toxicity patterns, text features
                and the spaCy analysis.
        """
        self.apply_stemming = apply_stemming
        self.apply_lemmatization = apply_lemmatization
        self.remove_stopwords = remove_stopwords
        self.min_word_length = min_word_length
        self.extract_features = extract_features

    def preprocess_text(self, text):
        """Run the full pipeline on a single text.

        Args:
            text: Raw text, or a missing value.

        Returns:
            A dict with ``original_text``, ``cleaned_text``, ``tokens`` and
            ``processed_text`` (tokens joined by spaces). When
            ``extract_features`` is enabled it also contains
            ``toxic_patterns`` and ``text_features`` (both computed on the
            original text) and ``spacy_analysis`` (computed on the cleaned
            text, or ``None`` when spaCy is unavailable).
        """
        # Basic cleaning
        cleaned_text = apply_basic_cleaning(text)

        # Tokenization
        tokens = advanced_tokenize(
            cleaned_text, 
            remove_stopwords=self.remove_stopwords,
            min_word_length=self.min_word_length
        )

        # Lemmatization
        if self.apply_lemmatization and tokens:
            tokens = lemmatize_tokens(tokens)

        # Stemming
        if self.apply_stemming and tokens:
            tokens = stem_tokens(tokens)

        result = {
            'original_text': text,
            'cleaned_text': cleaned_text,
            'tokens': tokens,
            'processed_text': ' '.join(tokens) if tokens else ''
        }

        # Additional features
        if self.extract_features:
            result.update({
                'toxic_patterns': detect_toxic_patterns(text),
                'text_features': extract_text_features(text),
                'spacy_analysis': spacy_process(cleaned_text) if SPACY_AVAILABLE else None
            })

        return result

    def process_dataframe(self, df, text_column='Text'):
        """Preprocess every text in ``df[text_column]``.

        Args:
            df: Input DataFrame; it is not modified.
            text_column: Name of the column holding the raw text.

        Returns:
            A copy of *df* with ``text_cleaned``, ``text_tokens`` and
            ``text_processed`` columns added, plus ``toxic_*`` and
            ``feature_*`` columns when ``extract_features`` is enabled.
        """
        print(f"🔄 Procesando {len(df)} comentarios...")

        texts = df[text_column]
        # Use tqdm's progress bar when `tqdm.pandas()` has been registered,
        # otherwise fall back to a plain `apply`.
        apply_with_progress = getattr(texts, 'progress_apply', texts.apply)
        processed_results = list(apply_with_progress(self.preprocess_text))

        # Spread the results over separate columns
        df_processed = df.copy()
        df_processed['text_cleaned'] = [result['cleaned_text'] for result in processed_results]
        df_processed['text_tokens'] = [result['tokens'] for result in processed_results]
        df_processed['text_processed'] = [result['processed_text'] for result in processed_results]

        if self.extract_features:
            # Build the feature frames on df's own index. With the default
            # RangeIndex, `pd.concat(axis=1)` would align on labels and
            # misplace rows whenever df has a non-default index (for example
            # after filtering or sampling).
            toxic_features = pd.DataFrame(
                [result['toxic_patterns'] for result in processed_results],
                index=df.index
            )
            text_features = pd.DataFrame(
                [result['text_features'] for result in processed_results],
                index=df.index
            )

            # Prefixes avoid column-name clashes
            toxic_features.columns = ['toxic_' + col for col in toxic_features.columns]
            text_features.columns = ['feature_' + col for col in text_features.columns]

            df_processed = pd.concat([df_processed, toxic_features, text_features], axis=1)

        print("✅ Preprocesamiento completado!")
        return df_processed

## Aplicación del preprocesamiento

### Carga de datos

In [None]:
# Load the raw dataset
df = pd.read_csv('../data/youtoxic_english_1000.csv')

n_comments, n_columns = df.shape
print(f"✅ Dataset cargado: {n_comments} comentarios, {n_columns} columnas")

# Labels kept for modelling (based on the EDA)
valid_labels = [
    'IsToxic',
    'IsAbusive',
    'IsThreat',
    'IsProvocative',
    'IsObscene',
    'IsHatespeech',
    'IsRacist',
    'IsNationalist',
    'IsReligiousHate',
]

# Labels left out of modelling
excluded_labels = [
    'IsSexist',
    'IsHomophobic',
    'IsRadicalism',
]

print(f"🎯 Etiquetas válidas para modelado: {len(valid_labels)}")
print(f"⚠️ Etiquetas excluidas: {len(excluded_labels)}")

### Configuración del preprocesador

In [None]:
# Named keyword-argument sets for TextPreprocessor.
preprocessing_configs = {
    # Lemmatization only, stopwords removed
    'basic': dict(
        apply_stemming=False,
        apply_lemmatization=True,
        remove_stopwords=True,
        min_word_length=2,
        extract_features=True,
    ),
    # Lemmatization + stemming, longer minimum token length
    'aggressive': dict(
        apply_stemming=True,
        apply_lemmatization=True,
        remove_stopwords=True,
        min_word_length=3,
        extract_features=True,
    ),
    # No normalisation, stopwords kept
    'conservative': dict(
        apply_stemming=False,
        apply_lemmatization=False,
        remove_stopwords=False,
        min_word_length=1,
        extract_features=True,
    ),
}

### Aplicar preprocesamiento

In [None]:
def apply_preprocessing_config(df, config_name, config_params):
    """Preprocess *df* with one named configuration and print a summary.

    ``config_params`` is passed as keyword arguments to ``TextPreprocessor``;
    ``config_name`` is only used in the printed header. The returned frame
    is the processed DataFrame with an extra ``processed_word_count`` column.
    """
    print(f"\n🔄 Aplicando configuración: {config_name.upper()}")
    print("-" * 40)

    # Build the preprocessor and run it over the 'Text' column
    df_processed = TextPreprocessor(**config_params).process_dataframe(df, text_column='Text')

    # Column statistics (reported before the word-count column is added)
    added_columns = df_processed.shape[1] - df.shape[1]
    print("✅ Procesamiento completado")
    print(f"📊 Columnas agregadas: {added_columns}")

    # Word counts of the processed text
    word_counts = df_processed['text_processed'].str.split().str.len()
    df_processed['processed_word_count'] = word_counts

    print("📈 Estadísticas del texto procesado:")
    print(f"   • Palabras promedio: {word_counts.mean():.1f}")
    print(f"   • Mediana de palabras: {word_counts.median():.1f}")
    print(f"   • Comentarios vacíos después del procesamiento: {(word_counts == 0).sum()}")

    return df_processed

# Apply the basic configuration (the one recommended for the project)
df_processed = apply_preprocessing_config(df, 'basic', preprocessing_configs['basic'])

## Análisis Post-Procesamiento

### Librerías

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import json

### Visualizaciones

In [None]:
# Comparación de longitudes antes y después
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Análisis del Impacto del Preprocesamiento', fontsize=16, fontweight='bold')

# Longitud de caracteres
axes[0,0].hist(df['Text'].str.len(), bins=50, alpha=0.7, label='Original', color='skyblue')
axes[0,0].hist(df_processed['text_cleaned'].str.len(), bins=50, alpha=0.7, label='Limpio', color='orange')
axes[0,0].set_title('Distribución de Longitud (Caracteres)')
axes[0,0].set_xlabel('Número de Caracteres')
axes[0,0].set_ylabel('Frecuencia')
axes[0,0].legend()

# Longitud de palabras
df['original_word_count'] = df['Text'].str.split().str.len()
axes[0,1].hist(df['original_word_count'], bins=30, alpha=0.7, label='Original', color='skyblue')
axes[0,1].hist(df_processed['processed_word_count'], bins=30, alpha=0.7, label='Procesado', color='lightgreen')
axes[0,1].set_title('Distribución de Longitud (Palabras)')
axes[0,1].set_xlabel('Número de Palabras')
axes[0,1].set_ylabel('Frecuencia')
axes[0,1].legend()

# Características de toxicidad por clase
toxic_features = df_processed[df_processed.columns[df_processed.columns.str.startswith('toxic_')]]
mean_features_toxic = toxic_features[df_processed['IsToxic'] == 1].mean()
mean_features_non_toxic = toxic_features[df_processed['IsToxic'] == 0].mean()

x = np.arange(len(mean_features_toxic))
width = 0.35

axes[1,0].bar(x - width/2, mean_features_toxic, width, label='Tóxicos', color='red', alpha=0.7)
axes[1,0].bar(x + width/2, mean_features_non_toxic, width, label='No Tóxicos', color='green', alpha=0.7)
axes[1,0].set_title('Características de Toxicidad por Clase')
axes[1,0].set_xlabel('Características')
axes[1,0].set_ylabel('Valor Promedio')
axes[1,0].set_xticks(x)
axes[1,0].set_xticklabels([col.replace('toxic_', '') for col in toxic_features.columns], rotation=45)
axes[1,0].legend()

# Distribución de características textuales
text_features = df_processed[df_processed.columns[df_processed.columns.str.startswith('feature_')]]
correlation_matrix = text_features.corr()

im = axes[1,1].imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
axes[1,1].set_title('Correlación entre Características Textuales')
axes[1,1].set_xticks(range(len(correlation_matrix.columns)))
axes[1,1].set_yticks(range(len(correlation_matrix.columns)))
axes[1,1].set_xticklabels([col.replace('feature_', '') for col in correlation_matrix.columns], rotation=45)
axes[1,1].set_yticklabels([col.replace('feature_', '') for col in correlation_matrix.columns])

# Agregar colorbar
plt.colorbar(im, ax=axes[1,1])

plt.tight_layout()
plt.show()

### Análisis del vocabulario

In [None]:
# Crear vocabulario completo
all_tokens = []
for tokens in df_processed['text_tokens']:
    if isinstance(tokens, list):
        all_tokens.extend(tokens)

vocab_counter = Counter(all_tokens)
vocab_size = len(vocab_counter)

print(f"✅ Tamaño del vocabulario: {vocab_size:,} palabras únicas")
print(f"✅ Total de tokens: {len(all_tokens):,}")

# Top palabras más comunes
print(f"\n🔝 Top 20 palabras más frecuentes:")
for word, count in vocab_counter.most_common(20):
    print(f"   {word}: {count}")

# Análisis por toxicidad
toxic_tokens = []
non_toxic_tokens = []

for idx, tokens in enumerate(df_processed['text_tokens']):
    if isinstance(tokens, list):
        if df_processed.iloc[idx]['IsToxic'] == 1:
            toxic_tokens.extend(tokens)
        else:
            non_toxic_tokens.extend(tokens)

toxic_vocab = Counter(toxic_tokens)
non_toxic_vocab = Counter(non_toxic_tokens)

print(f"\n🔥 Palabras más comunes en comentarios TÓXICOS:")
for word, count in toxic_vocab.most_common(15):
    print(f"   {word}: {count}")

print(f"\n✅ Palabras más comunes en comentarios NO TÓXICOS:")
for word, count in non_toxic_vocab.most_common(15):
    print(f"   {word}: {count}")

## Preparación para modelado

In [None]:
# Seleccionar características para el modelado
feature_columns = [col for col in df_processed.columns if col.startswith(('toxic_', 'feature_'))]
text_columns = ['text_cleaned', 'text_processed', 'text_tokens']
label_columns = [col for col in valid_labels if col in df_processed.columns]

print(f"✅ Características numéricas: {len(feature_columns)}")
print(f"✅ Columnas de texto: {len(text_columns)}")
print(f"✅ Etiquetas disponibles: {len(label_columns)}")

In [None]:
# Crear diferentes versiones del dataset para experimentación
datasets = {}

# 1. Dataset con características numéricas solamente
datasets['numeric_features'] = {
    'X': df_processed[feature_columns],
    'y': df_processed[label_columns],
    'description': 'Solo características numéricas extraídas'
}

# 2. Dataset con texto procesado para vectorización
datasets['text_processed'] = {
    'X': df_processed['text_processed'],
    'y': df_processed[label_columns],
    'description': 'Texto procesado para TF-IDF/Count Vectorizer'
}

# 3. Dataset con texto limpio para embeddings
datasets['text_cleaned'] = {
    'X': df_processed['text_cleaned'],
    'y': df_processed[label_columns],
    'description': 'Texto limpio para word embeddings'
}

# 4. Dataset combinado (características + texto)
datasets['combined'] = {
    'X_numeric': df_processed[feature_columns],
    'X_text': df_processed['text_processed'],
    'y': df_processed[label_columns],
    'description': 'Características numéricas + texto procesado'
}

In [None]:
# Dividir en train/test para cada configuración
print(f"\n🔄 Dividiendo datasets en train/test (80/20)...")

for name, dataset in datasets.items():
    if name != 'combined':
        X_train, X_test, y_train, y_test = train_test_split(
            dataset['X'], dataset['y'], 
            test_size=0.2, 
            random_state=42, 
            stratify=dataset['y']['IsToxic']
        )
        
        datasets[name].update({
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test
        })
    else:
        # Para el dataset combinado, dividir por separado
        X_num_train, X_num_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
            dataset['X_numeric'], dataset['X_text'], dataset['y'],
            test_size=0.2,
            random_state=42,
            stratify=dataset['y']['IsToxic']
        )
        
        datasets[name].update({
            'X_numeric_train': X_num_train,
            'X_numeric_test': X_num_test,
            'X_text_train': X_text_train,
            'X_text_test': X_text_test,
            'y_train': y_train,
            'y_test': y_test
        })

print("✅ División completada para todos los datasets")


### Escalado de características numéricas

In [None]:
# Escalar características numéricas
scaler = StandardScaler()

# Para datasets con características numéricas
for name in ['numeric_features', 'combined']:
    if name in datasets:
        if name == 'numeric_features':
            X_train_scaled = scaler.fit_transform(datasets[name]['X_train'])
            X_test_scaled = scaler.transform(datasets[name]['X_test'])
            
            datasets[name]['X_train_scaled'] = pd.DataFrame(
                X_train_scaled, 
                columns=datasets[name]['X_train'].columns,
                index=datasets[name]['X_train'].index
            )
            datasets[name]['X_test_scaled'] = pd.DataFrame(
                X_test_scaled, 
                columns=datasets[name]['X_test'].columns,
                index=datasets[name]['X_test'].index
            )
        
        elif name == 'combined':
            X_num_train_scaled = scaler.fit_transform(datasets[name]['X_numeric_train'])
            X_num_test_scaled = scaler.transform(datasets[name]['X_numeric_test'])
            
            datasets[name]['X_numeric_train_scaled'] = pd.DataFrame(
                X_num_train_scaled,
                columns=datasets[name]['X_numeric_train'].columns,
                index=datasets[name]['X_numeric_train'].index
            )
            datasets[name]['X_numeric_test_scaled'] = pd.DataFrame(
                X_num_test_scaled,
                columns=datasets[name]['X_numeric_test'].columns,
                index=datasets[name]['X_numeric_test'].index
            )

print("✅ Escalado completado")


### Guardado de resultados

In [None]:
# Crear directorio de salida
output_dir = '../processed_data'
os.makedirs(output_dir, exist_ok=True)

# Guardar dataset completo procesado
df_processed.to_csv(f'{output_dir}/dataset_processed_complete.csv', index=False)
print(f"✅ Dataset completo guardado: {output_dir}/dataset_processed_complete.csv")

# Guardar datasets divididos
for name, dataset in datasets.items():
    dataset_dir = f'{output_dir}/{name}'
    os.makedirs(dataset_dir, exist_ok=True)
    
    # Guardar según el tipo de dataset
    if name != 'combined':
        dataset['X_train'].to_csv(f'{dataset_dir}/X_train.csv', index=False)
        dataset['X_test'].to_csv(f'{dataset_dir}/X_test.csv', index=False)
        dataset['y_train'].to_csv(f'{dataset_dir}/y_train.csv', index=False)
        dataset['y_test'].to_csv(f'{dataset_dir}/y_test.csv', index=False)
        
        # Guardar versiones escaladas si existen
        if 'X_train_scaled' in dataset:
            dataset['X_train_scaled'].to_csv(f'{dataset_dir}/X_train_scaled.csv', index=False)
            dataset['X_test_scaled'].to_csv(f'{dataset_dir}/X_test_scaled.csv', index=False)
    
    else:
        # Dataset combinado
        dataset['X_numeric_train'].to_csv(f'{dataset_dir}/X_numeric_train.csv', index=False)
        dataset['X_numeric_test'].to_csv(f'{dataset_dir}/X_numeric_test.csv', index=False)
        dataset['X_text_train'].to_csv(f'{dataset_dir}/X_text_train.csv', index=False)
        dataset['X_text_test'].to_csv(f'{dataset_dir}/X_text_test.csv', index=False)
        dataset['y_train'].to_csv(f'{dataset_dir}/y_train.csv', index=False)
        dataset['y_test'].to_csv(f'{dataset_dir}/y_test.csv', index=False)
        
        # Versiones escaladas
        dataset['X_numeric_train_scaled'].to_csv(f'{dataset_dir}/X_numeric_train_scaled.csv', index=False)
        dataset['X_numeric_test_scaled'].to_csv(f'{dataset_dir}/X_numeric_test_scaled.csv', index=False)

print(f"✅ Todos los datasets guardados en: {output_dir}/")

# Guardar metadatos
metadata = {
    'original_shape': df.shape,
    'processed_shape': df_processed.shape,
    'vocab_size': vocab_size,
    'total_tokens': len(all_tokens),
    'feature_columns': feature_columns,
    'text_columns': text_columns,
    'label_columns': label_columns,
    'valid_labels': valid_labels,
    'excluded_labels': excluded_labels,
    'datasets_info': {name: dataset['description'] for name, dataset in datasets.items()},
    'preprocessing_config': preprocessing_configs['basic']
}

with open(f'{output_dir}/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"✅ Metadatos guardados: {output_dir}/metadata.json")

# Guardar el scaler
with open(f'{output_dir}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print(f"✅ Scaler guardado: {output_dir}/scaler.pkl")

# Guardar vocabulario
vocab_info = {
    'vocab_counter': dict(vocab_counter.most_common(1000)),  # Top 1000 palabras
    'toxic_vocab': dict(toxic_vocab.most_common(500)),      # Top 500 tóxicas
    'non_toxic_vocab': dict(non_toxic_vocab.most_common(500))  # Top 500 no tóxicas
}

with open(f'{output_dir}/vocabulary.json', 'w') as f:
    json.dump(vocab_info, f, indent=2)

print(f"✅ Información de vocabulario guardada: {output_dir}/vocabulary.json")