1. Carga de datos y librerías

In [43]:
import re
from collections import Counter

import emoji
import pandas as pd
import spacy
from langdetect import DetectorFactory, detect
from spacy.lang.en.stop_words import STOP_WORDS
from langdetect import detect

In [44]:
# Load the dataset CSV, decoding it as UTF-8
df = pd.read_csv(
    filepath_or_buffer="../../data/dataset_eda.csv",
    encoding="utf-8",
)

2. Limpieza de texto

In [45]:
# Section banner for the text-cleaning stage
print("\n" + "=" * 60)
print("🧼 LIMPIEZA DE TEXTO")
print("=" * 60)

def clean_text(text):
    """
    Clean one raw comment for hate-speech detection.

    Steps, in order:
    1. Convert to ``str`` and lowercase.
    2. Drop code points that cannot be encoded as UTF-8.
    3. Replace emojis with their English names
       (😂 -> :face_with_tears_of_joy:).
    4. Remove URLs and @mentions.
    5. Keep hashtags as plain words (#libertad -> libertad).
    6. Keep only letters (a-z plus Spanish áéíóúñü) and whitespace; every
       other character becomes a space, which also turns
       ``:face_with_tears_of_joy:`` into ``face with tears of joy``.
    7. Collapse any character repeated 3+ times into a single one
       (hooola -> hola, soooo -> so).
    8. Collapse whitespace runs and strip both ends.

    Args:
        text: Raw comment. Non-string values are stringified first, so a
            missing value (NaN) becomes the token "nan"; filter missing
            values upstream if that is not wanted.

    Returns:
        str: The cleaned text; empty when nothing but non-letters was given.
    """
    # Stringify and lowercase
    text = str(text).lower()

    # Round-trip through UTF-8, silently dropping anything that cannot be encoded
    text = text.encode("utf-8", "ignore").decode("utf-8")

    # Emojis -> English names, e.g. 😂 -> :face_with_tears_of_joy:
    text = emoji.demojize(text, language='en')

    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", " ", text)

    # Remove mentions (@user)
    text = re.sub(r"@\w+", " ", text)

    # Hashtags become plain words (#libertad -> libertad)
    text = re.sub(r"#(\w+)", r"\1", text)

    # Keep ONLY letters and whitespace. The class must list the real Spanish
    # letters: with mis-encoded characters here, accented letters and ñ would
    # be replaced by spaces and Spanish words would be split apart.
    text = re.sub(r"[^a-záéíóúñü\s]", " ", text)

    # Collapse runs of 3+ identical characters (hooola -> hola, soooo -> so)
    text = re.sub(r"(.)\1{2,}", r"\1", text)

    # Collapse whitespace runs and trim both ends
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Apply the cleaning to every raw comment
print("🔄 Limpiando textos...")
df["text_clean"] = df["Text"].apply(clean_text)

# Report how many rows went through the cleaning step
print(f"✅ {len(df)} textos procesados correctamente")

# ==========================================
# SHOW TRANSFORMATION EXAMPLES
# ==========================================
print("\n" + "=" * 60)
print("📋 EJEMPLOS DE TRANSFORMACIÓN")
print("=" * 60)

# head(5) yields at most 5 rows, so a dataset with fewer rows does not raise
# the IndexError that a fixed range(5) with .iloc would.
for i, (original, limpio) in enumerate(
    zip(df["Text"].head(5), df["text_clean"].head(5)), 1
):
    # str() guards against non-string raw values (e.g. NaN), on which len()
    # and slicing would fail.
    original = str(original)

    print(f"\n[Ejemplo {i}]")

    # Show at most 120 characters of each version
    print(f"Original ({len(original)} chars): {original[:120]}")
    print(f"Limpio   ({len(limpio)} chars): {limpio[:120]}")

    # Word-count difference between the raw and the cleaned text
    original_words = len(original.split())
    clean_words = len(limpio.split())
    print(f"Palabras: {original_words} -> {clean_words} ({clean_words-original_words:+d})")

# ==========================================
# DETECT AND DROP EMPTY TEXTS
# ==========================================
print("\n" + "=" * 60)
print("🗑️ DETECCIÓN DE TEXTOS VACÍOS")
print("=" * 60)

# Build the "empty after cleaning" mask once and reuse it for both the report
# and the filtering below.
is_empty = df["text_clean"].str.strip() == ""
empty_after_clean = df[is_empty]
print(f"⚠️ Textos vacíos tras limpieza: {len(empty_after_clean)}")

if len(empty_after_clean) > 0:
    print("\nEjemplos de textos que quedaron vacíos:")
    for original in empty_after_clean["Text"].head(5):
        # str() so that a non-string raw value cannot break the slice
        print(f"  - '{str(original)[:80]}'")

    print(f"\n🗑️ Eliminando {len(empty_after_clean)} textos vacíos...")
    # Rebinds `df` without the empty rows and renumbers the index from 0
    df = df[~is_empty].reset_index(drop=True)
    print(f"✅ Dataset final: {len(df)} filas")
else:
    print("✅ No hay textos vacíos")

# ==========================================
# RECOMPUTE THE LENGTH OF THE CLEANED TEXTS
# ==========================================
# Length is measured in whitespace-separated words, not characters
df['length_clean'] = df['text_clean'].str.split().str.len()

print("\n" + "=" * 60)
print("📊 ESTADÍSTICAS DE TEXTOS LIMPIOS")
print("=" * 60)
print(df['length_clean'].describe())

# ==========================================
# ANALYSIS OF VERY SHORT TEXTS
# ==========================================
print("\n" + "=" * 60)
print("⚠️ ANÁLISIS DE TEXTOS CORTOS")
print("=" * 60)

# "Very short" means at most 2 words after cleaning
very_short = df[df['length_clean'] <= 2]

# Guard the percentage: an empty DataFrame would otherwise raise ZeroDivisionError
short_pct = len(very_short) / len(df) * 100 if len(df) else 0.0
print(f"Textos con ≤2 palabras: {len(very_short)} ({short_pct:.1f}%)")

if len(very_short) > 0:
    print("\nEjemplos de textos muy cortos:")
    for i, (original, clean) in enumerate(zip(very_short['Text'].head(5),
                                               very_short['text_clean'].head(5)), 1):
        # str() so that a non-string raw value cannot break the slice
        print(f"{i}. Original: '{str(original)[:60]}'")
        print(f"   Limpio  : '{clean}'")

# ==========================================
# COMPARISON: ORIGINAL VS CLEANED
# ==========================================
print("\n" + "=" * 60)
print("📏 COMPARACIÓN: ORIGINAL VS LIMPIO")
print("=" * 60)

# Word count of the raw text, measured the same way as `length_clean`
df['length_original'] = df['Text'].str.split().str.len()

# Compute each mean once and reuse it for the report and the reduction ratio
mean_original = df['length_original'].mean()
mean_clean = df['length_clean'].mean()

print(f"Longitud promedio ORIGINAL: {mean_original:.2f} palabras")
print(f"Longitud promedio LIMPIO  : {mean_clean:.2f} palabras")

# Relative reduction in average length; a zero original mean yields 0.0
# instead of a 0/0 division.
reduccion = ((mean_original - mean_clean) / mean_original) * 100 if mean_original else 0.0
print(f"Reducción promedio        : {reduccion:.1f}%")

print(f"\nLongitud máxima ORIGINAL  : {df['length_original'].max()} palabras")
print(f"Longitud máxima LIMPIA    : {df['length_clean'].max()} palabras")

print(f"\nLongitud mínima ORIGINAL  : {df['length_original'].min()} palabras")
print(f"Longitud mínima LIMPIA    : {df['length_clean'].min()} palabras")

# ==========================================
# CHECK THE EMOJI CONVERSION
# ==========================================
print("\n" + "=" * 60)
print("😀 VERIFICACIÓN DE CONVERSIÓN DE EMOJIS")
print("=" * 60)

# Select texts that really contain emojis. A "not word/space/comma/period"
# regex also matches apostrophes, quotes, "!" or "?", so it flags almost every
# text and the examples below would rarely show an actual emoji conversion.
has_emoji = df['Text'].map(lambda raw: emoji.emoji_count(str(raw)) > 0).astype(bool)
textos_con_emoji = df[has_emoji]
print(f"Textos con posibles emojis: {len(textos_con_emoji)}")

if len(textos_con_emoji) > 0:
    print("\nEjemplos de conversión de emojis:")
    for i, (original, clean) in enumerate(zip(textos_con_emoji['Text'].head(3),
                                               textos_con_emoji['text_clean'].head(3)), 1):
        print(f"\n{i}. Original: {str(original)[:100]}")
        print(f"   Limpio  : {clean[:100]}")

# ==========================================
# VOCABULARY ANALYSIS
# ==========================================
print("\n" + "=" * 60)
print("📚 ANÁLISIS DE VOCABULARIO")
print("=" * 60)

# Every word of the cleaned corpus (with repetitions) and its frequency table.
# `Counter` comes from the notebook's import cell.
all_words = ' '.join(df['text_clean']).split()
word_freq = Counter(all_words)
unique_words = set(word_freq)

# Guard the ratio: an empty corpus would otherwise raise ZeroDivisionError
total_words = len(all_words)
diversity = len(unique_words) / total_words if total_words else 0.0

print(f"Total de palabras (con repeticiones): {total_words:,}")
print(f"Palabras únicas (vocabulario)       : {len(unique_words):,}")
print(f"Ratio de diversidad                 : {diversity:.4f}")

# Most frequent words (the loop body only runs when total_words > 0)
print("\n🔝 Top 20 palabras más frecuentes:")
for i, (word, count) in enumerate(word_freq.most_common(20), 1):
    print(f"{i:2}. {word:15} -> {count:6,} veces ({count/total_words*100:.2f}%)")

# ==========================================
# FINAL SUMMARY
# ==========================================
# Relies on `unique_words` and `reduccion` computed earlier in this cell
print("\n" + "=" * 60)
print("✅ LIMPIEZA COMPLETADA")
print("=" * 60)
print(f"📊 Dataset final        : {len(df):,} comentarios")
print(f"📝 Vocabulario          : {len(unique_words):,} palabras únicas")
print(f"📏 Longitud promedio    : {df['length_clean'].mean():.2f} palabras")
print(f"📉 Reducción de tamaño  : {reduccion:.1f}%")
print("\n🚀 Datos listos para vectorización y modelado NLP")


🧼 LIMPIEZA DE TEXTO
🔄 Limpiando textos...
✅ 997 textos procesados correctamente

📋 EJEMPLOS DE TRANSFORMACIÓN

[Ejemplo 1]
Original (1558 chars): If only people would just take a step back and not make this case about them, because it wasn't about anyone except the 
Limpio   (1518 chars): if only people would just take a step back and not make this case about them because it wasn t about anyone except the t
Palabras: 287 -> 288 (+1)

[Ejemplo 2]
Original (138 chars): Law enforcement is not trained to shoot to apprehend. ¬†They are trained to shoot to kill. ¬†And I thank Wilson for killin
Limpio   (133 chars): law enforcement is not trained to shoot to apprehend they are trained to shoot to kill and i thank wilson for killing th
Palabras: 25 -> 25 (+0)

[Ejemplo 3]
Original (420 chars): 
Dont you reckon them 'black lives matter' banners being held by white cunts is ¬†kinda patronizing and ironically racist
Limpio   (406 chars): dont you reckon them black lives matter banner

3. Normalización bilingüe con spaCy


In [46]:
# Make language detection reproducible: langdetect's algorithm is
# non-deterministic, so without a fixed seed the same short or ambiguous text
# may be routed to a different spaCy pipeline on different runs.
DetectorFactory.seed = 0

# Load the spaCy pipelines (English and Spanish small models)
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")

def normalize_text_spacy(text):
    """
    Normalize an already-cleaned text with the spaCy pipeline of its language.

    Steps:
    1. Detect the language with langdetect (falls back to English when
       detection fails).
    2. Run the Spanish pipeline when "es" is detected, the English one for
       every other language code.
    3. Keep only alphabetic tokens that are not stopwords.
    4. Replace each kept token with its lemma.

    Args:
        text: Cleaned text to normalize.

    Returns:
        str: The kept lemmas joined by single spaces; empty when no token
        survives the filter.
    """
    try:
        lang = detect(text)
    except Exception:
        # Detection can fail on empty or feature-less text; default to English.
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running `.apply`.
        lang = "en"

    # Any language other than Spanish is processed with the English pipeline
    nlp = nlp_es if lang == "es" else nlp_en
    doc = nlp(text)

    return " ".join(
        token.lemma_ for token in doc
        if token.is_alpha and not token.is_stop
    )

# Apply the normalization to the already-cleaned column
print("🔄 Normalizando textos bilingües con spaCy...")
df["text_norm"] = df["text_clean"].apply(normalize_text_spacy)

# Show examples. head(5) yields at most 5 rows, so a dataset with fewer rows
# does not raise the IndexError that a fixed range(5) with .iloc would.
for i, (limpio, normalizado) in enumerate(
    zip(df["text_clean"].head(5), df["text_norm"].head(5)), 1
):
    print(f"\n[Ejemplo {i}]")
    print("Limpio     :", limpio)
    print("Normalizado:", normalizado)



🔄 Normalizando textos bilingües con spaCy...

[Ejemplo 1]
Limpio     : if only people would just take a step back and not make this case about them because it wasn t about anyone except the two people in that situation to lump yourself into this mess and take matters into your own hands makes these kinds of protests selfish and without rational thought and investigation the guy in this video is heavily emotional and hyped up and wants to be heard and when he gets heard he just presses more and more he was never out to have a reasonable discussion kudos to the smerconish for keeping level the whole time and letting masri make himself out to be a fool how dare he and those that tore that city down in protest make this about themselves and to dishonor the entire incident with their own hate by the way since when did police brutality become an epidemic i wish everyone would just stop pretending like they were there and they knew exactly what was going on because there s no measurable a

4. Guardado del preprocesamiento

In [52]:
# Save the normalized dataset
# NOTE(review): `preprocessing_file` is not used by any code visible in this
# notebook and points at a .py path inside the data directory; confirm
# whether it is still needed.
preprocessing_file = "../../data/preprocessing.py"
output_file = "../../data/comentarios_preprocesados.pkl"

# Pickle output: only load this file back from a trusted location, since
# unpickling can execute arbitrary code.
df.to_pickle(output_file)

print(f"✅ Dataset guardado después del Preprocesamiento en: {output_file}")

✅ Dataset guardado después del Preprocesamiento en: ../../data/comentarios_preprocesados.pkl
