# Modelos Avanzados y Ajuste de Hiperparámetros

Este notebook implementa modelos avanzados con técnicas de regularización,
embeddings y deep learning para mejorar la robustez de las predicciones
y controlar el overfitting.

Objetivos:
- Ajuste de hiperparámetros con validación cruzada
- Implementación de embeddings pre-entrenados
- Modelos de deep learning con regularización
- Control de overfitting (diferencia train-test < 5%)
- Ensemble methods para mayor robustez
- Validación exhaustiva y early stopping


## Librerías

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Librerías adicionales para modelos avanzados
from sklearn.model_selection import (
    GridSearchCV, RandomizedSearchCV, cross_val_score,
    StratifiedKFold, validation_curve, learning_curve
)
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, precision_recall_curve, f1_score, accuracy_score
)
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    VotingClassifier, BaggingClassifier
)
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Deep Learning y Embeddings
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
try:
    import gensim.downloader as api
except ImportError:
    import sys
    !{sys.executable} -m pip install --upgrade gensim
    import gensim.downloader as api

# Utilidades
import pickle
import json
import time
from datetime import datetime
from collections import defaultdict
import joblib

print("📚 Librerías avanzadas cargadas exitosamente")


## Pipeline principal para un modelado robusto

In [None]:
class AdvancedModelPipeline:
    """Cross-validated hyperparameter tuning with a train/validation gap check.

    Attributes:
        overfitting_threshold: Maximum accepted difference between the mean
            train score and the mean validation score of the best candidate.
        random_state: Seed forwarded to estimators, CV splitter and search.
        models: Best fitted estimator per tuned model name.
        best_params: Best hyperparameter combination per tuned model name.
        results: Full result dictionary per tuned model name.
        embeddings_cache: Free-form cache; not used by the methods below.
    """

    def __init__(self, overfitting_threshold=0.05, random_state=42):
        self.overfitting_threshold = overfitting_threshold
        self.random_state = random_state
        self.models = {}
        self.best_params = {}
        self.results = {}
        self.embeddings_cache = {}

    def setup_hyperparameter_grids(self):
        """Return the hyperparameter search space for every supported model.

        Returns:
            dict mapping a model name to a ``{parameter: candidate values}``
            dictionary usable by ``GridSearchCV`` / ``RandomizedSearchCV``.
        """
        return {
            'logistic_regression': {
                'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2', 'elasticnet'],
                'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                'max_iter': [1000, 2000],
                'class_weight': ['balanced', None]
            },
            'random_forest': {
                'n_estimators': [50, 100, 200, 300],
                'max_depth': [3, 5, 7, 10, None],
                'min_samples_split': [2, 5, 10, 20],
                'min_samples_leaf': [1, 2, 4, 8],
                'max_features': ['sqrt', 'log2', None],
                'class_weight': ['balanced', None]
            },
            'gradient_boosting': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'subsample': [0.8, 0.9, 1.0]
            },
            'svm': {
                'C': [0.1, 1, 10, 100],
                'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
                'class_weight': ['balanced', None]
            },
            'mlp': {
                'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
                'activation': ['relu', 'tanh'],
                'alpha': [0.0001, 0.001, 0.01, 0.1],
                'learning_rate': ['constant', 'adaptive'],
                'max_iter': [500, 1000],
                'early_stopping': [True],
                'validation_fraction': [0.1, 0.2]
            }
        }

    def _build_base_model(self, model_name):
        """Instantiate only the estimator requested by ``model_name``."""
        factories = {
            # 'saga' is used because the grid includes l1 and elasticnet penalties.
            'logistic_regression': lambda: LogisticRegression(
                random_state=self.random_state, solver='saga'),
            'random_forest': lambda: RandomForestClassifier(
                random_state=self.random_state),
            'gradient_boosting': lambda: GradientBoostingClassifier(
                random_state=self.random_state),
            'svm': lambda: SVC(random_state=self.random_state, probability=True),
            'mlp': lambda: MLPClassifier(random_state=self.random_state)
        }
        return factories[model_name]()

    def hyperparameter_tuning(self, X_train, y_train, model_name='logistic_regression',
                            cv_folds=5, n_iter=50, use_randomized=True):
        """Tune one model with stratified cross-validation, scored by F1.

        Args:
            X_train: Training feature matrix.
            y_train: Training labels.
            model_name: Key of the model in ``setup_hyperparameter_grids``.
            cv_folds: Number of stratified folds.
            n_iter: Number of sampled candidates (randomized search only).
            use_randomized: Use ``RandomizedSearchCV`` when True, otherwise an
                exhaustive ``GridSearchCV``.

        Returns:
            dict with the best estimator, its parameters, the train/validation
            scores, their gap, the elapsed time and the raw ``cv_results_``;
            ``None`` when ``model_name`` has no grid. The same dictionary is
            stored in ``self.results[model_name]`` and the best estimator in
            ``self.models[model_name]``.
        """
        print(f"📚 Ajustando hiperparámetros para {model_name}...")

        grids = self.setup_hyperparameter_grids()
        if model_name not in grids:
            print(f"Grid no definido para {model_name}")
            return None

        model = self._build_base_model(model_name)
        param_grid = grids[model_name]

        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.random_state)

        # return_train_score=True is required to compute the overfitting gap below.
        if use_randomized:
            search = RandomizedSearchCV(
                model, param_grid, n_iter=n_iter, cv=cv,
                scoring='f1', n_jobs=-1, random_state=self.random_state,
                return_train_score=True
            )
        else:
            search = GridSearchCV(
                model, param_grid, cv=cv, scoring='f1',
                n_jobs=-1, return_train_score=True
            )

        start_time = time.time()
        search.fit(X_train, y_train)
        tuning_time = time.time() - start_time

        # Gap between mean train and mean validation score of the winning candidate.
        train_score = search.cv_results_['mean_train_score'][search.best_index_]
        val_score = search.cv_results_['mean_test_score'][search.best_index_]
        overfitting_gap = train_score - val_score

        results = {
            'best_model': search.best_estimator_,
            'best_params': search.best_params_,
            'best_score': search.best_score_,
            'train_score': train_score,
            'val_score': val_score,
            'overfitting_gap': overfitting_gap,
            'tuning_time': tuning_time,
            'cv_results': search.cv_results_
        }

        # Keep the outcome on the instance so later stages can reuse it.
        self.best_params[model_name] = search.best_params_
        self.models[model_name] = search.best_estimator_
        self.results[model_name] = results

        print(f"  ✅ Mejor score: {search.best_score_:.4f}")
        print(f"  📊 Gap overfitting: {overfitting_gap:.4f}")
        print(f"  ⏱️ Tiempo: {tuning_time:.2f}s")

        if overfitting_gap > self.overfitting_threshold:
            print(f"  ⚠️ Posible overfitting detectado (gap > {self.overfitting_threshold})")

        return results

## Extractor de embeddings

In [None]:
class EmbeddingExtractor:
    """Loads pre-trained Word2Vec / BERT models and turns texts into vectors.

    Attributes:
        word2vec_model: Object returned by ``gensim.downloader.load`` or None.
        bert_tokenizer: Hugging Face tokenizer or None.
        bert_model: Hugging Face model or None.
    """

    def __init__(self):
        self.word2vec_model = None
        self.bert_tokenizer = None
        self.bert_model = None

    def load_word2vec(self, model_name='word2vec-google-news-300'):
        """Load a pre-trained Word2Vec model through ``gensim.downloader``.

        Returns:
            True on success, False if loading raised (the error is printed).
        """
        print(f"📚 Cargando Word2Vec: {model_name}...")
        try:
            self.word2vec_model = api.load(model_name)
            print("✅ Word2Vec cargado exitosamente")
            return True
        except Exception as e:
            # Best-effort loader: report the failure and let the caller decide.
            print(f"❌ Error cargando Word2Vec: {e}")
            return False

    def load_bert(self, model_name='bert-base-uncased'):
        """Load a pre-trained BERT tokenizer and model.

        Returns:
            True on success, False if loading raised (the error is printed).
        """
        print(f"📚 Cargando BERT: {model_name}...")
        try:
            self.bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.bert_model = AutoModel.from_pretrained(model_name)
            print("✅ BERT cargado exitosamente")
            return True
        except Exception as e:
            # Best-effort loader: report the failure and let the caller decide.
            print(f"❌ Error cargando BERT: {e}")
            return False

    def get_word2vec_embeddings(self, texts, vector_size=300):
        """Embed each text as the mean of its known word vectors.

        Args:
            texts: Iterable of texts. Entries that are not strings (for
                example NaN from an empty CSV cell) are treated as empty.
            vector_size: Width of the zero vector used for texts without any
                known word. Only used when the loaded model does not expose a
                ``vector_size`` attribute of its own.

        Returns:
            ``numpy.ndarray`` with one row per text, or None when no Word2Vec
            model is loaded.
        """
        if self.word2vec_model is None:
            print("❌ Word2Vec no está cargado")
            return None

        model = self.word2vec_model
        # Prefer the model's own width so zero vectors always match the
        # width of the averaged vectors and the result stays rectangular.
        dim = getattr(model, 'vector_size', vector_size)

        embeddings = []
        for text in texts:
            tokens = text.split() if isinstance(text, str) else []
            known_vectors = [model[token] for token in tokens if token in model]

            if known_vectors:
                embeddings.append(np.mean(known_vectors, axis=0))
            else:
                # No known word: fall back to a zero vector.
                embeddings.append(np.zeros(dim))

        return np.array(embeddings)

    def get_bert_embeddings(self, texts, max_length=128):
        """Embed each text with the hidden state of BERT's first token.

        Args:
            texts: Iterable of texts. Entries that are not strings are
                embedded as the empty string.
            max_length: Maximum token length; longer inputs are truncated.

        Returns:
            ``numpy.ndarray`` with one row per text, or None when the BERT
            model or tokenizer is not loaded.
        """
        if self.bert_model is None or self.bert_tokenizer is None:
            print("❌ BERT no está cargado")
            return None

        embeddings = []
        self.bert_model.eval()

        with torch.no_grad():
            for text in texts:
                if not isinstance(text, str):
                    text = ""

                inputs = self.bert_tokenizer(
                    text, return_tensors='pt',
                    max_length=max_length, truncation=True, padding=True
                )

                outputs = self.bert_model(**inputs)
                # Position 0 of the last hidden state is the [CLS] token,
                # used here as the sentence representation.
                embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
                embeddings.append(embedding)

        return np.array(embeddings)

## Clasificador de Deep Learning con regularización

In [None]:
class DeepLearningClassifier:
    """Binary feed-forward classifier (PyTorch) with regularization.

    Each hidden block is Linear -> ReLU -> BatchNorm1d -> Dropout and the
    output is a single sigmoid unit. Training uses Adam with weight decay and
    optional early stopping on the validation loss.

    Attributes:
        input_size: Number of input features.
        hidden_sizes: Width of each hidden layer.
        dropout: Dropout probability of every hidden block.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay (L2 penalty).
        model: The ``nn.Sequential`` network, built lazily.
        history: Per-epoch ``train_loss``, ``train_acc``, ``val_loss`` and
            ``val_acc`` lists.
    """

    def __init__(self, input_size, hidden_sizes=None, dropout=0.3,
                 learning_rate=0.001, weight_decay=0.01):
        self.input_size = input_size
        # None stands for the default [128, 64]; copying avoids sharing one
        # list object between instances or with the caller.
        self.hidden_sizes = [128, 64] if hidden_sizes is None else list(hidden_sizes)
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.model = None
        self.history = {}

    @staticmethod
    def _to_float_tensor(data):
        """Copy array-like data (float, int or bool) into a float32 tensor."""
        return torch.tensor(np.asarray(data), dtype=torch.float32)

    def build_model(self):
        """Build the network and store it in ``self.model``.

        Returns:
            The created ``nn.Sequential`` model.
        """
        layers = []
        prev_size = self.input_size

        for hidden_size in self.hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Dropout(self.dropout)
            ])
            prev_size = hidden_size

        # Output layer: one sigmoid unit, paired with BCELoss in train().
        layers.append(nn.Linear(prev_size, 1))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)
        return self.model

    def train(self, X_train, y_train, X_val=None, y_val=None,
              epochs=100, batch_size=32, early_stopping_patience=10):
        """Train the network, optionally with early stopping.

        Args:
            X_train: Training features, array-like of shape (n, input_size).
            y_train: Binary training labels, array-like of length n.
            X_val: Optional validation features.
            y_val: Optional validation labels. Validation and early stopping
                run only when both ``X_val`` and ``y_val`` are given.
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size.
            early_stopping_patience: Number of consecutive epochs without a
                validation-loss improvement after which training stops.

        When validation data is given, the weights of the epoch with the
        lowest validation loss are restored at the end.
        """
        if self.model is None:
            self.build_model()

        X_train_tensor = self._to_float_tensor(X_train)
        y_train_tensor = self._to_float_tensor(y_train).unsqueeze(1)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        n_samples = len(train_dataset)
        # BatchNorm1d raises in training mode on a batch with a single sample,
        # so a trailing batch of size one is dropped.
        drop_last = n_samples > batch_size and n_samples % batch_size == 1
        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, drop_last=drop_last)

        # The validation tensors do not change between epochs: build them once.
        has_validation = X_val is not None and y_val is not None
        if has_validation:
            X_val_tensor = self._to_float_tensor(X_val)
            y_val_tensor = self._to_float_tensor(y_val).unsqueeze(1)

        optimizer = optim.Adam(self.model.parameters(),
                             lr=self.learning_rate, weight_decay=self.weight_decay)
        criterion = nn.BCELoss()

        best_val_loss = float('inf')
        patience_counter = 0
        best_model_state = None

        self.history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

        print(f"📚 Iniciando entrenamiento por {epochs} épocas...")

        for epoch in range(epochs):
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = self.model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                predicted = (outputs > 0.5).float()
                train_total += batch_y.size(0)
                train_correct += (predicted == batch_y).sum().item()

            avg_train_loss = train_loss / len(train_loader)
            train_accuracy = train_correct / train_total

            self.history['train_loss'].append(avg_train_loss)
            self.history['train_acc'].append(train_accuracy)

            val_loss, val_accuracy = 0, 0
            if has_validation:
                self.model.eval()
                with torch.no_grad():
                    val_outputs = self.model(X_val_tensor)
                    val_loss = criterion(val_outputs, y_val_tensor).item()

                    val_predicted = (val_outputs > 0.5).float()
                    val_accuracy = ((val_predicted == y_val_tensor).sum().item()
                                    / y_val_tensor.size(0))

                self.history['val_loss'].append(val_loss)
                self.history['val_acc'].append(val_accuracy)

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    patience_counter = 0
                    # state_dict() returns references to the live tensors, so
                    # each one is cloned; a shallow dict copy would keep
                    # changing as training continues.
                    best_model_state = {
                        key: value.detach().clone()
                        for key, value in self.model.state_dict().items()
                    }
                else:
                    patience_counter += 1

                if patience_counter >= early_stopping_patience:
                    print(f"📚 Early stopping en época {epoch+1}")
                    break

            # Progress log every 10 epochs.
            if (epoch + 1) % 10 == 0:
                print(f"Época {epoch+1}/{epochs} - "
                      f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.4f}")
                if has_validation:
                    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

        # Restore the weights of the best validation epoch.
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)

        print("✅ Entrenamiento completado")

    def predict(self, X):
        """Return the sigmoid output for each row of ``X``.

        Note that these are probabilities of the positive class, not hard
        labels; threshold them (for example at 0.5) to obtain classes.
        """
        self.model.eval()
        with torch.no_grad():
            X_tensor = self._to_float_tensor(X)
            outputs = self.model(X_tensor)
            return outputs.numpy().flatten()

    def predict_proba(self, X):
        """Return positive-class probabilities (same values as ``predict``)."""
        return self.predict(X)

    def plot_training_history(self):
        """Plot loss and accuracy per epoch for train and, if any, validation."""
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Loss
        ax1.plot(self.history['train_loss'], label='Train Loss')
        if self.history['val_loss']:
            ax1.plot(self.history['val_loss'], label='Validation Loss')
        ax1.set_title('Pérdida durante el entrenamiento')
        ax1.set_xlabel('Época')
        ax1.set_ylabel('Pérdida')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Accuracy
        ax2.plot(self.history['train_acc'], label='Train Accuracy')
        if self.history['val_acc']:
            ax2.plot(self.history['val_acc'], label='Validation Accuracy')
        ax2.set_title('Precisión durante el entrenamiento')
        ax2.set_xlabel('Época')
        ax2.set_ylabel('Precisión')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

## Constructor de modelos ensemble

In [None]:
class EnsembleBuilder:
    """Factory for scikit-learn ensemble classifiers.

    Attributes:
        base_models: Optional ``{name: estimator}`` dictionary kept for the
            caller's convenience; the factory methods take their estimators
            as arguments.
        ensemble_models: Empty dictionary available to store built ensembles.
    """

    def __init__(self, base_models=None):
        self.base_models = base_models or {}
        self.ensemble_models = {}

    def create_voting_ensemble(self, models_dict, voting='hard'):
        """Build a voting ensemble.

        Args:
            models_dict: ``{name: estimator}`` dictionary of base estimators.
            voting: ``'hard'`` (majority vote) or ``'soft'`` (averaged
                probabilities).

        Returns:
            An unfitted ``VotingClassifier``.
        """
        return VotingClassifier(estimators=list(models_dict.items()), voting=voting)

    def create_bagging_ensemble(self, base_model, n_estimators=10, random_state=42):
        """Build a bagging ensemble around ``base_model``.

        Args:
            base_model: Estimator to bag.
            n_estimators: Number of bagged copies.
            random_state: Seed of the bagging resampling.

        Returns:
            An unfitted ``BaggingClassifier``.
        """
        # The base estimator is passed positionally: its keyword was renamed
        # from ``base_estimator`` to ``estimator`` in scikit-learn 1.2 and the
        # old name was later removed, while the position stayed the same.
        return BaggingClassifier(
            base_model,
            n_estimators=n_estimators,
            random_state=random_state
        )

    def create_stacking_ensemble(self, base_models, meta_model=None):
        """Build a stacking ensemble whose meta-learner combines the base models.

        Args:
            base_models: ``{name: estimator}`` dictionary of base estimators.
            meta_model: Final estimator trained on the base predictions;
                defaults to ``LogisticRegression()``.

        Returns:
            An unfitted ``StackingClassifier``.
        """
        # Imported locally so this method does not depend on the notebook's
        # import cell listing StackingClassifier.
        from sklearn.ensemble import StackingClassifier

        if meta_model is None:
            meta_model = LogisticRegression()

        return StackingClassifier(
            estimators=list(base_models.items()),
            final_estimator=meta_model
        )

In [None]:
class OverfittingAnalyzer:
    """Plots learning / validation curves and measures the train-validation gap.

    Attributes:
        threshold: Gap above which ``analyze_learning_curves`` flags
            overfitting.
    """

    def __init__(self, threshold=0.05):
        self.threshold = threshold

    @staticmethod
    def _fold_stats(fold_scores):
        """Return (mean, std) across folds for each row of ``fold_scores``."""
        return np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1)

    @staticmethod
    def _draw_with_band(draw, x_values, center, spread, label):
        """Draw one curve with ``draw`` plus its +/- one-std shaded band."""
        draw(x_values, center, 'o-', label=label)
        plt.fill_between(x_values, center - spread,
                        center + spread, alpha=0.1)

    @staticmethod
    def _decorate_and_show(x_label, title):
        """Add labels, title, legend and grid to the current figure and show it."""
        plt.xlabel(x_label)
        plt.ylabel('Score')
        plt.title(title)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

    def analyze_learning_curves(self, model, X, y, cv=5):
        """Plot the learning curve of ``model`` and report the final gap.

        No ``scoring`` is passed to ``learning_curve``, so the estimator's
        default scorer is used. Ten training sizes from 10% to 100% are
        evaluated.

        Returns:
            dict with ``train_sizes``, the mean/std of train and validation
            scores, ``final_gap`` (train minus validation at the largest
            size) and ``overfitting_detected`` (gap above ``self.threshold``).
        """
        sizes, fit_scores, holdout_scores = learning_curve(
            model, X, y, cv=cv, n_jobs=-1,
            train_sizes=np.linspace(0.1, 1.0, 10),
            random_state=42
        )

        fit_mean, fit_std = self._fold_stats(fit_scores)
        holdout_mean, holdout_std = self._fold_stats(holdout_scores)

        plt.figure(figsize=(10, 6))
        self._draw_with_band(plt.plot, sizes, fit_mean, fit_std, 'Training Score')
        self._draw_with_band(plt.plot, sizes, holdout_mean, holdout_std, 'Validation Score')
        self._decorate_and_show('Training Set Size', 'Learning Curves')

        # Gap measured at the largest training size.
        gap = fit_mean[-1] - holdout_mean[-1]

        return {
            'train_sizes': sizes,
            'train_scores': {'mean': fit_mean, 'std': fit_std},
            'val_scores': {'mean': holdout_mean, 'std': holdout_std},
            'final_gap': gap,
            'overfitting_detected': gap > self.threshold
        }

    def analyze_validation_curves(self, model, X, y, param_name, param_range):
        """Plot the validation curve of one hyperparameter (F1, 5-fold CV).

        The x axis is logarithmic, so ``param_range`` is expected to hold
        positive numeric values.

        Returns:
            dict with ``param_range``, the mean/std of train and validation
            scores, ``optimal_value`` (the value with the highest mean
            validation score) and ``optimal_gap`` (train minus validation at
            that value).
        """
        fit_scores, holdout_scores = validation_curve(
            model, X, y, param_name=param_name, param_range=param_range,
            cv=5, scoring='f1', n_jobs=-1
        )

        fit_mean, fit_std = self._fold_stats(fit_scores)
        holdout_mean, holdout_std = self._fold_stats(holdout_scores)

        plt.figure(figsize=(10, 6))
        self._draw_with_band(plt.semilogx, param_range, fit_mean, fit_std, 'Training Score')
        self._draw_with_band(plt.semilogx, param_range, holdout_mean, holdout_std,
                             'Validation Score')
        self._decorate_and_show(param_name, f'Validation Curves - {param_name}')

        # Best value = highest mean validation score.
        best_idx = np.argmax(holdout_mean)

        return {
            'param_range': param_range,
            'train_scores': {'mean': fit_mean, 'std': fit_std},
            'val_scores': {'mean': holdout_mean, 'std': holdout_std},
            'optimal_value': param_range[best_idx],
            'optimal_gap': fit_mean[best_idx] - holdout_mean[best_idx]
        }

## Implementación del pipeline a los datos preprocesados

In [None]:
# 📥 Load the real outputs of the preprocessing step
print("📚 Cargando datos preprocesados reales...")

# Cleaned text; squeeze() collapses the frame to a Series only when the CSV
# holds a single column — TODO confirm against the preprocessing output.
df_X_train_text, df_X_test_text = (
    pd.read_csv(csv_path).squeeze()
    for csv_path in ('text_cleaned/X_train.csv', 'text_cleaned/X_test.csv')
)

# Target column for both splits.
df_y_train, df_y_test = (
    pd.read_csv(csv_path)['IsToxic']
    for csv_path in ('text_cleaned/y_train.csv', 'text_cleaned/y_test.csv')
)

# Numeric feature matrices, kept as DataFrames.
df_X_train_scaled, df_X_test_scaled = (
    pd.read_csv(csv_path)
    for csv_path in ('numeric_features/X_train_scaled.csv',
                     'numeric_features/X_test_scaled.csv')
)


In [None]:
# 🏃 Ejecutar pipeline completo

def run_advanced_pipeline():
    """Run every phase of the advanced modelling workflow on the loaded data.

    Reads the module-level frames ``df_X_train_scaled``, ``df_X_test_scaled``,
    ``df_X_train_text``, ``df_X_test_text``, ``df_y_train`` and ``df_y_test``.

    Phases: hyperparameter tuning, Word2Vec embedding extraction, deep
    learning, voting ensemble and learning-curve analysis. The test split is
    used only for final scoring; early stopping of the neural network uses a
    validation split carved out of the training data.

    Returns:
        dict with the tuning results, the Word2Vec matrices (None when the
        model could not be loaded), the trained deep learning classifier and
        its test F1, the fitted voting ensemble and its test F1, and the
        learning-curve analysis per tuned model.
    """
    print("🚀 INICIANDO PIPELINE AVANZADO")
    print("=" * 50)

    # 1. Components
    pipeline = AdvancedModelPipeline(overfitting_threshold=0.05)
    embedder = EmbeddingExtractor()
    analyzer = OverfittingAnalyzer(threshold=0.05)

    # 2. Data confirmation
    print("\n📚 Datos cargados correctamente")

    # 3. Hyperparameter tuning
    print("\n📚 FASE 1: AJUSTE DE HIPERPARÁMETROS")
    print("-" * 40)
    tuned_results = {}
    for model_name in ['logistic_regression', 'random_forest']:
        result = pipeline.hyperparameter_tuning(
            df_X_train_scaled, df_y_train, model_name=model_name, n_iter=10
        )
        # hyperparameter_tuning returns None for a model without a grid.
        if result is not None:
            tuned_results[model_name] = result

    # 4. Embeddings
    print("\n📚 FASE 2: EXTRACCIÓN DE EMBEDDINGS")
    print("-" * 40)
    X_train_w2v = X_test_w2v = None
    if embedder.load_word2vec():
        X_train_w2v = embedder.get_word2vec_embeddings(df_X_train_text)
        X_test_w2v = embedder.get_word2vec_embeddings(df_X_test_text)
        print(f"  ✅ Embeddings Word2Vec: train {X_train_w2v.shape}, test {X_test_w2v.shape}")
    else:
        print("  ⚠️ Word2Vec no disponible; se omite la extracción de embeddings")

    # 5. Deep learning
    print("\n📚 FASE 3: DEEP LEARNING")
    print("-" * 40)
    # Early stopping selects the best epoch from the validation loss, so the
    # validation data must not be the test split: hold out part of train.
    X_dl_train, X_dl_val, y_dl_train, y_dl_val = train_test_split(
        df_X_train_scaled.values, df_y_train.values,
        test_size=0.2, stratify=df_y_train.values,
        random_state=pipeline.random_state
    )
    dl_classifier = DeepLearningClassifier(input_size=df_X_train_scaled.shape[1])
    dl_classifier.train(X_dl_train, y_dl_train, X_dl_val, y_dl_val, epochs=20)
    dl_classifier.plot_training_history()

    # predict() returns probabilities; threshold at 0.5 for hard labels.
    dl_test_pred = (dl_classifier.predict(df_X_test_scaled.values) > 0.5).astype(int)
    dl_f1 = f1_score(df_y_test.values.astype(int), dl_test_pred)
    print(f"\n✅ F1-Score Deep Learning: {dl_f1:.4f}")

    # 6. Ensemble
    print("\n📚 FASE 4: MODELOS ENSEMBLE")
    print("-" * 40)
    best_models = {k: v['best_model'] for k, v in tuned_results.items()}
    ensemble_builder = EnsembleBuilder()
    voting_ensemble = ensemble_builder.create_voting_ensemble(best_models)
    voting_ensemble.fit(df_X_train_scaled, df_y_train)
    y_pred = voting_ensemble.predict(df_X_test_scaled)
    f1 = f1_score(df_y_test, y_pred)
    print(f"\n✅ F1-Score Ensemble: {f1:.4f}")

    # 7. Overfitting analysis
    print("\n📚 FASE 5: ANÁLISIS DE OVERFITTING")
    print("-" * 40)
    learning_curves = {}
    for name, model in best_models.items():
        print(f"\n🔍 Analizando curvas de aprendizaje para: {name}")
        curves = analyzer.analyze_learning_curves(model, df_X_train_scaled, df_y_train)
        learning_curves[name] = curves
        print(f"Gap de overfitting ({name}): {curves['final_gap']:.4f}")

    print("\n🎉 Pipeline completo ejecutado con éxito")

    return {
        'tuned_results': tuned_results,
        'X_train_w2v': X_train_w2v,
        'X_test_w2v': X_test_w2v,
        'dl_classifier': dl_classifier,
        'dl_f1': dl_f1,
        'voting_ensemble': voting_ensemble,
        'ensemble_f1': f1,
        'learning_curves': learning_curves
    }

if __name__ == "__main__":
    run_advanced_pipeline()
