# spaCy Text Classification - Reducci√≥n de Overfitting

## Objetivo
Probar spaCy TextCategorizer como alternativa a modelos cl√°sicos para reducir overfitting manteniendo F1-score > 0.55.

## Ventajas de spaCy Text Classification
- ‚úÖ‚úÖ‚úÖ Word embeddings pre-entrenados (mejor sem√°ntica que TF-IDF)
- ‚úÖ‚úÖ‚úÖ Modelo espec√≠fico para clasificaci√≥n de texto
- ‚úÖ‚úÖ‚úÖ Fine-tuning con tu dataset
- ‚úÖ‚úÖ‚úÖ Mejor control de overfitting que modelos cl√°sicos
- ‚úÖ‚úÖ‚úÖ M√°s r√°pido que Transformers
- ‚úÖ Ya tienes spaCy instalado


## 1. Importaci√≥n de librer√≠as


In [15]:
import pandas as pd
import numpy as np
import pickle
import random
from pathlib import Path

import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
random.seed(42)

print("‚úÖ Librer√≠as importadas")


‚úÖ Librer√≠as importadas


## 2. Cargar modelo de spaCy


In [16]:
# Intentar cargar modelo con word vectors (md o lg)
# Si no est√° disponible, usar sm y descargar md
try:
    nlp = spacy.load('en_core_web_md')
    print("‚úÖ Modelo en_core_web_md cargado (con word vectors)")
except OSError:
    try:
        nlp = spacy.load('en_core_web_lg')
        print("‚úÖ Modelo en_core_web_lg cargado (con word vectors)")
    except OSError:
        print("‚ö†Ô∏è  Modelos con word vectors no encontrados")
        print("   Usando en_core_web_sm (sin word vectors)")
        print("   Para mejor rendimiento, instala: python -m spacy download en_core_web_md")
        nlp = spacy.load('en_core_web_sm')


‚ö†Ô∏è  Modelos con word vectors no encontrados
   Usando en_core_web_sm (sin word vectors)
   Para mejor rendimiento, instala: python -m spacy download en_core_web_md


## 3. Carga de datos


In [17]:
# Cargar datos
df = pd.read_csv('../data/processed/youtoxic_english_1000_processed.csv')
with open('../data/processed/y_train.pkl', 'rb') as f:
    y_train = pickle.load(f)
with open('../data/processed/y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)

X_train_text = df[df.index.isin(range(len(y_train)))]['Text_processed'].values
X_test_text = df[df.index.isin(range(len(y_train), len(y_train) + len(y_test)))]['Text_processed'].values

print(f"‚úÖ Datos cargados: {len(X_train_text)} train, {len(X_test_text)} test")
print(f"Distribuci√≥n train: {np.bincount(y_train)}")
print(f"Distribuci√≥n test: {np.bincount(y_test)}")


‚úÖ Datos cargados: 800 train, 200 test
Distribuci√≥n train: [430 370]
Distribuci√≥n test: [108  92]


## 4. Preparar datos en formato spaCy


In [18]:
# Preparar datos en formato spaCy
# spaCy espera: (texto, {"cats": {"TOXIC": 1.0, "NOT_TOXIC": 0.0}})

def prepare_spacy_data(texts, labels):
    """Prepara datos en formato spaCy para TextCategorizer."""
    data = []
    for text, label in zip(texts, labels):
        # Convertir label a categor√≠as
        cats = {
            "TOXIC": 1.0 if label == 1 else 0.0,
            "NOT_TOXIC": 1.0 if label == 0 else 0.0
        }
        data.append((text, {"cats": cats}))
    return data

# Preparar datos de entrenamiento y prueba
train_data = prepare_spacy_data(X_train_text, y_train)
test_data = prepare_spacy_data(X_test_text, y_test)

print(f"‚úÖ Datos preparados: {len(train_data)} train, {len(test_data)} test")
print(f"Ejemplo de formato:")
print(f"  Texto: {train_data[0][0][:50]}...")
print(f"  Categor√≠as: {train_data[0][1]}")


‚úÖ Datos preparados: 800 train, 200 test
Ejemplo de formato:
  Texto: people would take step back make case be not anyon...
  Categor√≠as: {'cats': {'TOXIC': 1.0, 'NOT_TOXIC': 0.0}}


## 5. Crear y configurar TextCategorizer


In [19]:
# Crear pipeline de spaCy con TextCategorizer
if "textcat" not in nlp.pipe_names:
    # A√±adir TextCategorizer al pipeline
    textcat = nlp.add_pipe("textcat", last=True)
else:
    textcat = nlp.get_pipe("textcat")

# A√±adir etiquetas
textcat.add_label("TOXIC")
textcat.add_label("NOT_TOXIC")

print("‚úÖ TextCategorizer configurado")
print(f"Etiquetas: {textcat.labels}")


‚úÖ TextCategorizer configurado
Etiquetas: ('TOXIC', 'NOT_TOXIC')


## 6. Funci√≥n de evaluaci√≥n


In [20]:
def evaluate_spacy_model(nlp_model, texts, true_labels):
    """Eval√∫a modelo de spaCy y retorna m√©tricas."""
    predictions = []
    
    # Procesar textos y obtener predicciones
    for text in texts:
        doc = nlp_model(text)
        # Obtener probabilidad de clase TOXIC
        toxic_score = doc.cats.get("TOXIC", 0.0)
        # Predicci√≥n: 1 si TOXIC > 0.5, 0 en caso contrario
        pred = 1 if toxic_score > 0.5 else 0
        predictions.append(pred)
    
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    
    # Calcular m√©tricas
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, zero_division=0)
    recall = recall_score(true_labels, predictions, zero_division=0)
    f1 = f1_score(true_labels, predictions, zero_division=0)
    cm = confusion_matrix(true_labels, predictions)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'predictions': predictions
    }

print("‚úÖ Funci√≥n de evaluaci√≥n definida")


‚úÖ Funci√≥n de evaluaci√≥n definida


## 7. Entrenar modelo


In [None]:
# Convertir datos a formato Example de spaCy
train_examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data]

# Deshabilitar otros componentes del pipeline durante entrenamiento
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]

print("="*80)
print("ENTRENAMIENTO SPACY TEXTCATEGORIZER")
print("="*80)
print(f"Componentes deshabilitados: {other_pipes}")
print(f"√âpocas: 10")
print(f"Batch size: compuesto (4-32)")
print("-"*80)

# Entrenar modelo
n_iter = 10
with nlp.disable_pipes(*other_pipes):
    # Inicializar el componente
    # Si hay problemas con lookups, intentar sin ellos
    try:
        nlp.initialize(lambda: train_examples)
    except ValueError as e:
        if "lookups" in str(e).lower() or "lookup" in str(e).lower():
            print("‚ö†Ô∏è  Advertencia: Problemas con lookups, inicializando sin ellos...")
            try:
                nlp.initialize(lambda: train_examples, exclude=["lookups"])
            except:
                # Si a√∫n falla, usar m√©todo alternativo
                print("‚ö†Ô∏è  Usando m√©todo alternativo de inicializaci√≥n...")
                nlp.initialize(get_examples=lambda: train_examples)
        else:
            raise
    
    # Entrenar
    for epoch in range(n_iter):
        # Mezclar datos
        random.shuffle(train_examples)
        
        # Crear batches de tama√±o variable
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        
        losses = {}
        for batch in batches:
            nlp.update(batch, losses=losses, drop=0.2)  # drop=0.2 para regularizaci√≥n
        
        if epoch % 2 == 0:
            print(f"√âpoca {epoch+1}/{n_iter} - Loss: {losses.get('textcat', 0.0):.4f}")

print("\n‚úÖ Entrenamiento completado")


ENTRENAMIENTO SPACY TEXTCATEGORIZER
Componentes deshabilitados: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
√âpocas: 10
Batch size: compuesto (4-32)
--------------------------------------------------------------------------------
‚ö†Ô∏è  Advertencia: Problemas con lookups, inicializando sin ellos...
‚ö†Ô∏è  Usando m√©todo alternativo de inicializaci√≥n...


ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.

## 8. Evaluaci√≥n del modelo


In [None]:
# Evaluar en train y test
print("Evaluando en conjunto de entrenamiento...")
train_results = evaluate_spacy_model(nlp, X_train_text, y_train)

print("\nEvaluando en conjunto de prueba...")
test_results = evaluate_spacy_model(nlp, X_test_text, y_test)

# Calcular diferencia de F1 (overfitting)
diff_f1 = abs(train_results['f1'] - test_results['f1']) * 100

print("\n" + "="*80)
print("RESULTADOS FINALES - SPACY TEXTCATEGORIZER")
print("="*80)
print(f"F1-score (train): {train_results['f1']:.4f}")
print(f"F1-score (test): {test_results['f1']:.4f}")
print(f"Accuracy (test): {test_results['accuracy']:.4f}")
print(f"Precision (test): {test_results['precision']:.4f}")
print(f"Recall (test): {test_results['recall']:.4f}")
print(f"Diferencia F1: {diff_f1:.2f}%")
print(f"\nMatriz de confusi√≥n (test):")
print(test_results['confusion_matrix'])

if diff_f1 < 5.0 and test_results['f1'] > 0.55:
    print("\n‚úÖ‚úÖ‚úÖ OBJETIVO CUMPLIDO: Overfitting < 5% Y F1 > 0.55")
elif diff_f1 < 6.0:
    print("\nüéØ MUY CERCA: Overfitting < 6%")
else:
    print("\n‚ö†Ô∏è  Overfitting a√∫n alto")

print("="*80)


## 9. Validaci√≥n Cruzada (opcional)


In [None]:
# Validaci√≥n cruzada (toma tiempo, opcional)
print("Validaci√≥n cruzada (5-fold)...")
print("‚ö†Ô∏è  Esto puede tardar varios minutos...")

X_all_text = np.concatenate([X_train_text, X_test_text])
y_all = np.concatenate([y_train, y_test])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_all_text, y_all), 1):
    print(f"Fold {fold}/5...")
    
    # Preparar datos del fold
    fold_train_data = prepare_spacy_data(X_all_text[train_idx], y_all[train_idx])
    fold_val_data = prepare_spacy_data(X_all_text[val_idx], y_all[val_idx])
    
    # Crear nuevo modelo para este fold
    fold_nlp = spacy.load('en_core_web_md' if 'md' in str(nlp.meta.get('name', '')) else 'en_core_web_sm')
    if "textcat" not in fold_nlp.pipe_names:
        fold_textcat = fold_nlp.add_pipe("textcat", last=True)
    else:
        fold_textcat = fold_nlp.get_pipe("textcat")
    fold_textcat.add_label("TOXIC")
    fold_textcat.add_label("NOT_TOXIC")
    
    # Entrenar
    fold_examples = [Example.from_dict(fold_nlp.make_doc(text), annots) for text, annots in fold_train_data]
    fold_other_pipes = [pipe for pipe in fold_nlp.pipe_names if pipe != "textcat"]
    
    with fold_nlp.disable_pipes(*fold_other_pipes):
        fold_nlp.initialize(lambda: fold_examples)
        for epoch in range(5):  # Menos √©pocas para CV
            random.shuffle(fold_examples)
            batches = minibatch(fold_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                fold_nlp.update(batch, losses={}, drop=0.2)
    
    # Evaluar
    fold_results = evaluate_spacy_model(fold_nlp, X_all_text[val_idx], y_all[val_idx])
    cv_scores.append(fold_results['f1'])
    print(f"  F1-score: {fold_results['f1']:.4f}")

print(f"\nF1-score (CV): {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")
print(f"Scores: {cv_scores}")


## 10. Guardar modelo (si cumple objetivos)


In [None]:
if diff_f1 < 6.0 and test_results['f1'] > 0.55:
    # Guardar modelo
    output_dir = Path('../models/spacy_textcat_model')
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    
    # Guardar informaci√≥n del modelo
    model_info = {
        'model_type': 'spaCy TextCategorizer',
        'spacy_model': str(nlp.meta.get('name', 'unknown')),
        'test_f1': test_results['f1'],
        'diff_f1': diff_f1,
        'n_iter': n_iter,
        'has_word_vectors': hasattr(nlp.vocab.vectors, 'shape')
    }
    
    with open('../models/spacy_textcat_info.pkl', 'wb') as f:
        pickle.dump(model_info, f)
    
    print(f"‚úÖ Modelo guardado en: {output_dir}")
    print(f"‚úÖ Informaci√≥n guardada en: ../models/spacy_textcat_info.pkl")
else:
    print("‚ö†Ô∏è  Modelo no guardado (no cumple objetivos)")
