# Ensemble

In [None]:
import pandas as pd 

# Cargar el nuevo archivo CSV limpio proporcionado por el usuario
data = pd.read_csv('../data/cleaned_data.csv')

# Verificar la estructura del archivo para asegurar que las columnas necesarias estén presentes
data.head()

## Voting Classifier (Ensemble de Votación)

In [12]:
import optuna
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorización con TF-IDF
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 3))
x_tfidf = tfidf.fit_transform(data['Cleaned_Text'])
y = data['Any_Hate']

# Dividir en conjunto de entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

# Función de objetivo para Optuna
def objective(trial):
    # Optimización de hiperparámetros del modelo SVM
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    if kernel_svm == 'rbf':
        gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto'])
    else:
        gamma_svm = 'scale'  # predeterminado cuando no se usa 'rbf'

    # Optimización de hiperparámetros para Random Forest
    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 50, step=5)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2'])

    # Optimización de hiperparámetros para Regresión Logística
    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)

    # Configuración de modelos base con los hiperparámetros ajustados
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    
    # Ajuste de pesos para el Voting Classifier
    weight_svm = trial.suggest_float('weight_svm', 0.1, 1.0)
    weight_rf = trial.suggest_float('weight_rf', 0.1, 1.0)
    weight_logreg = trial.suggest_float('weight_logreg', 0.1, 1.0)
    
    # Crear el ensemble con VotingClassifier
    voting_clf = VotingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg)
    ], voting='soft', weights=[weight_svm, weight_rf, weight_logreg])
    
    # Entrenar y evaluar el ensemble
    voting_clf.fit(x_train, y_train)
    y_pred = voting_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados
best_params = study.best_params
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_depth=50, min_samples_leaf=3, 
                            max_features=best_params['max_features_rf'], random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)

# Crear el Voting Classifier final
voting_clf = VotingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg)
], voting='soft', weights=[best_params['weight_svm'], best_params['weight_rf'], best_params['weight_logreg']])

voting_clf.fit(x_train, y_train)
y_pred = voting_clf.predict(x_test)

# Informe de clasificación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 12:43:28,264] A new study created in memory with name: no-name-62c8323a-cc60-41c6-9876-4d7e84f08d66
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 12:43:28,552] Trial 0 finished with value: 0.7243464665415885 and parameters: {'c_svm': 9.026090126357168, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estimators_rf': 35, 'max_features_rf': 'log2', 'c_logreg': 0.6500514380049922, 'weight_svm': 0.7363288580751521, 'weight_rf': 0.11494272041419797, 'weight_logreg': 0.356921423820045}. Best is trial 0 with value: 0.7243464665415885.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 12:43:28,805] Trial 1 finished with value: 0.7082556390977445 and parameters: {'c_svm': 0.5131880915048176, 'kernel_svm': 'linear', 'n_estimators_rf': 25, 'max_features_rf': 'sqrt', 'c_logreg': 2.785657175831617, 'weight_svm': 0.1917726558474897, 'w

Mejores parámetros: {'c_svm': 7.328152860547816, 'kernel_svm': 'rbf', 'gamma_svm': 'auto', 'n_estimators_rf': 45, 'max_features_rf': 'log2', 'c_logreg': 6.051484933751665, 'weight_svm': 0.7765501253111379, 'weight_rf': 0.4905698690258572, 'weight_logreg': 0.6962888444702259}
Mejor F1 ponderado: 0.7401560156015602
              precision    recall  f1-score   support

     No Hate       0.70      0.78      0.74        93
        Hate       0.79      0.70      0.74       107

    accuracy                           0.74       200
   macro avg       0.74      0.74      0.74       200
weighted avg       0.75      0.74      0.74       200



## Stacking Ensemble (Ensemble de Stacking)

In [13]:
import optuna
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorización con TF-IDF
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 3))
x_tfidf = tfidf.fit_transform(data['Cleaned_Text'])
y = data['Any_Hate']

# Dividir en conjunto de entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

# Función de objetivo para Optuna
def objective(trial):
    # Optimización de hiperparámetros de los modelos base
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    if kernel_svm == 'rbf':
        gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto'])
    else:
        gamma_svm = 'scale'  # predeterminado para 'linear'

    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 50, step=5)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2', 0.1, 0.2, 0.5])

    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)

    # Configuración de los modelos base con los hiperparámetros ajustados
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    
    # Selección del meta-modelo de nivel superior
    meta_model_choice = trial.suggest_categorical('meta_model', ['LogisticRegression', 'MultinomialNB'])
    if meta_model_choice == 'LogisticRegression':
        c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
        meta_model = LogisticRegression(C=c_meta, max_iter=1000, random_state=42)
    else:
        alpha_meta = trial.suggest_float('alpha_meta', 0.1, 1.0)
        meta_model = MultinomialNB(alpha=alpha_meta)

    # Configuración del Stacking Ensemble
    stacking_clf = StackingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg)
    ], final_estimator=meta_model, cv=3)

    # Entrenar y evaluar el ensemble
    stacking_clf.fit(x_train, y_train)
    y_pred = stacking_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados
best_params = study.best_params
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_depth=50, min_samples_leaf=3, 
                            max_features=best_params['max_features_rf'], random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)

# Configuración del meta-modelo de nivel superior basado en los mejores parámetros encontrados
if best_params['meta_model'] == 'LogisticRegression':
    meta_model = LogisticRegression(C=best_params['c_meta'], max_iter=1000, random_state=42)
else:
    meta_model = MultinomialNB(alpha=best_params['alpha_meta'])

# Crear y entrenar el modelo Stacking final
stacking_clf = StackingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg)
], final_estimator=meta_model, cv=3)

stacking_clf.fit(x_train, y_train)
y_pred = stacking_clf.predict(x_test)

# Informe de clasificación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 12:45:45,811] A new study created in memory with name: no-name-66a415ba-6446-4274-8714-bd11f8a078d1
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
  c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
[I 2024-11-11 12:45:48,142] Trial 0 finished with value: 0.7350861271531788 and parameters: {'c_svm': 0.16214652489397177, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estimators_rf': 50, 'max_features_rf': 0.5, 'c_logreg': 1.9586773117831595, 'meta_model': 'LogisticRegression', 'c_meta': 1.767576425623267}. Best is trial 0 with value: 0.7350861271531788.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 12:45:50,482] Trial 1 finished with value: 0.2951877133105802 and parameters: {'c_svm': 1.2795131444763468, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estimators_rf': 50, 'max_features_rf': 0.5, 'c_logreg': 3.280000779373526, 'meta

Mejores parámetros: {'c_svm': 0.941194598635538, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estimators_rf': 40, 'max_features_rf': 0.1, 'c_logreg': 0.37407896236938104, 'meta_model': 'LogisticRegression', 'c_meta': 9.604660037751028}
Mejor F1 ponderado: 0.76
              precision    recall  f1-score   support

     No Hate       0.71      0.82      0.76        93
        Hate       0.82      0.71      0.76       107

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.77      0.76      0.76       200



### Mejoras

#### Paso 1: Incluir otros modelos base y probar diferentes meta-modelos

Vamos a añadir Gradient Boosting y K-Nearest Neighbors (KNN) como modelos base. También probaremos con árboles de decisión y Gradient Boosting como meta-modelo.

In [14]:
import optuna
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorización con TF-IDF
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 3))
x_tfidf = tfidf.fit_transform(data['Cleaned_Text'])
y = data['Any_Hate']

# Dividir en conjunto de entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

# Función de objetivo para Optuna
def objective(trial):
    # Hiperparámetros para los modelos base
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto']) if kernel_svm == 'rbf' else 'scale'

    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 50, step=5)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2', 0.1, 0.2, 0.5])
    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
    
    n_neighbors_knn = trial.suggest_int('n_neighbors_knn', 3, 15)
    n_estimators_gb = trial.suggest_int('n_estimators_gb', 50, 150, step=10)
    learning_rate_gb = trial.suggest_float('learning_rate_gb', 0.01, 0.3)

    # Modelos base
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors_knn)
    gb = GradientBoostingClassifier(n_estimators=n_estimators_gb, learning_rate=learning_rate_gb, random_state=42)
    
    # Meta-modelo
    meta_model_choice = trial.suggest_categorical('meta_model', ['LogisticRegression', 'DecisionTree', 'GradientBoosting'])
    if meta_model_choice == 'LogisticRegression':
        c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
        meta_model = LogisticRegression(C=c_meta, max_iter=1000, random_state=42)
    elif meta_model_choice == 'DecisionTree':
        max_depth_tree = trial.suggest_int('max_depth_tree', 3, 10)
        meta_model = DecisionTreeClassifier(max_depth=max_depth_tree, random_state=42)
    else:
        n_estimators_meta_gb = trial.suggest_int('n_estimators_meta_gb', 50, 100, step=10)
        learning_rate_meta_gb = trial.suggest_float('learning_rate_meta_gb', 0.01, 0.3)
        meta_model = GradientBoostingClassifier(n_estimators=n_estimators_meta_gb, learning_rate=learning_rate_meta_gb, random_state=42)

    # Configuración del Stacking Ensemble
    stacking_clf = StackingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg),
        ('knn', knn),
        ('gb', gb)
    ], final_estimator=meta_model, cv=3)

    # Entrenar y evaluar el ensemble
    stacking_clf.fit(x_train, y_train)
    y_pred = stacking_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados
best_params = study.best_params
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_depth=50, min_samples_leaf=3, 
                            max_features=best_params['max_features_rf'], random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)
knn = KNeighborsClassifier(n_neighbors=best_params['n_neighbors_knn'])
gb = GradientBoostingClassifier(n_estimators=best_params['n_estimators_gb'], learning_rate=best_params['learning_rate_gb'], random_state=42)

# Configuración del meta-modelo de nivel superior basado en los mejores parámetros encontrados
if best_params['meta_model'] == 'LogisticRegression':
    meta_model = LogisticRegression(C=best_params['c_meta'], max_iter=1000, random_state=42)
elif best_params['meta_model'] == 'DecisionTree':
    meta_model = DecisionTreeClassifier(max_depth=best_params['max_depth_tree'], random_state=42)
else:
    meta_model = GradientBoostingClassifier(n_estimators=best_params['n_estimators_meta_gb'], 
                                            learning_rate=best_params['learning_rate_meta_gb'], random_state=42)

# Crear y entrenar el modelo Stacking final
stacking_clf = StackingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg),
    ('knn', knn),
    ('gb', gb)
], final_estimator=meta_model, cv=3)

stacking_clf.fit(x_train, y_train)
y_pred = stacking_clf.predict(x_test)

# Informe de clasificación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 12:53:55,013] A new study created in memory with name: no-name-a39d296e-2d5b-4076-ba6e-dfcaa88ada82
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 12:53:57,804] Trial 0 finished with value: 0.7148931223280581 and parameters: {'c_svm': 2.0962597989741663, 'kernel_svm': 'linear', 'n_estimators_rf': 30, 'max_features_rf': 0.2, 'c_logreg': 0.10716860030902038, 'n_neighbors_knn': 12, 'n_estimators_gb': 120, 'learning_rate_gb': 0.19310425214930746, 'meta_model': 'GradientBoosting', 'n_estimators_meta_gb': 80, 'learning_rate_meta_gb': 0.1175346922536124}. Best is trial 0 with value: 0.7148931223280581.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 12:53:59,274] Trial 1 finished with value: 0.7253095684803003 and parameters: {'c_svm': 0.6148980885670258, 'kernel_svm': 'rbf', 'gamma_svm': 'auto', 'n_estimators_rf': 10, 'max_

Mejores parámetros: {'c_svm': 0.5457208593104554, 'kernel_svm': 'rbf', 'gamma_svm': 'auto', 'n_estimators_rf': 10, 'max_features_rf': 0.1, 'c_logreg': 0.5785406766227151, 'n_neighbors_knn': 14, 'n_estimators_gb': 50, 'learning_rate_gb': 0.1550541450990992, 'meta_model': 'LogisticRegression', 'c_meta': 0.906887609884815}
Mejor F1 ponderado: 0.7497999799979997
              precision    recall  f1-score   support

     No Hate       0.70      0.82      0.75        93
        Hate       0.81      0.69      0.75       107

    accuracy                           0.75       200
   macro avg       0.76      0.75      0.75       200
weighted avg       0.76      0.75      0.75       200



#### Paso 2: Ajustar el Número de Folds en la Validación Cruzada (cv)

In [15]:
# Función de objetivo para Optuna (ajustando 'cv' en el Stacking Ensemble)
def objective(trial):
    # Hiperparámetros para los modelos base
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto']) if kernel_svm == 'rbf' else 'scale'

    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 50, step=5)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2', 0.1, 0.2, 0.5])
    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
    
    n_neighbors_knn = trial.suggest_int('n_neighbors_knn', 3, 15)
    n_estimators_gb = trial.suggest_int('n_estimators_gb', 50, 150, step=10)
    learning_rate_gb = trial.suggest_float('learning_rate_gb', 0.01, 0.3)

    # Modelos base
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors_knn)
    gb = GradientBoostingClassifier(n_estimators=n_estimators_gb, learning_rate=learning_rate_gb, random_state=42)
    
    # Meta-modelo
    meta_model_choice = trial.suggest_categorical('meta_model', ['LogisticRegression', 'DecisionTree', 'GradientBoosting'])
    if meta_model_choice == 'LogisticRegression':
        c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
        meta_model = LogisticRegression(C=c_meta, max_iter=1000, random_state=42)
    elif meta_model_choice == 'DecisionTree':
        max_depth_tree = trial.suggest_int('max_depth_tree', 3, 10)
        meta_model = DecisionTreeClassifier(max_depth=max_depth_tree, random_state=42)
    else:
        n_estimators_meta_gb = trial.suggest_int('n_estimators_meta_gb', 50, 100, step=10)
        learning_rate_meta_gb = trial.suggest_float('learning_rate_meta_gb', 0.01, 0.3)
        meta_model = GradientBoostingClassifier(n_estimators=n_estimators_meta_gb, learning_rate=learning_rate_meta_gb, random_state=42)

    # Optimización de 'cv'
    cv_folds = trial.suggest_categorical('cv_folds', [3, 5, 10])

    # Configuración del Stacking Ensemble con el número de folds optimizado
    stacking_clf = StackingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg),
        ('knn', knn),
        ('gb', gb)
    ], final_estimator=meta_model, cv=cv_folds)

    # Entrenar y evaluar el ensemble
    stacking_clf.fit(x_train, y_train)
    y_pred = stacking_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados, incluyendo 'cv_folds'
cv_folds = study.best_params['cv_folds']
stacking_clf = StackingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg),
    ('knn', knn),
    ('gb', gb)
], final_estimator=meta_model, cv=cv_folds)

stacking_clf.fit(x_train, y_train)
y_pred = stacking_clf.predict(x_test)

# Informe de clasificación final
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 12:57:04,811] A new study created in memory with name: no-name-6efcdbb4-c7f0-4de2-a960-7964bcfc06b8
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
  c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
[I 2024-11-11 12:57:14,656] Trial 0 finished with value: 0.7139513403919802 and parameters: {'c_svm': 0.3432449105180878, 'kernel_svm': 'linear', 'n_estimators_rf': 45, 'max_features_rf': 0.2, 'c_logreg': 0.21422254025247825, 'n_neighbors_knn': 9, 'n_estimators_gb': 140, 'learning_rate_gb': 0.10203604533824857, 'meta_model': 'LogisticRegression', 'c_meta': 6.101487964040783, 'cv_folds': 10}. Best is trial 0 with value: 0.7139513403919802.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 12:57:20,318] Trial 1 finished with value: 0.7346620489610163 and parameters: {'c_svm': 2.1006056566127107, 'kernel_svm': 'rbf', 'gamma_svm': 'scale',

Mejores parámetros: {'c_svm': 0.3142767956309614, 'kernel_svm': 'rbf', 'gamma_svm': 'auto', 'n_estimators_rf': 15, 'max_features_rf': 0.1, 'c_logreg': 0.2013724951566474, 'n_neighbors_knn': 11, 'n_estimators_gb': 120, 'learning_rate_gb': 0.06429830868764891, 'meta_model': 'LogisticRegression', 'c_meta': 6.528035287677666, 'cv_folds': 3}
Mejor F1 ponderado: 0.7550796269906748
              precision    recall  f1-score   support

     No Hate       0.70      0.82      0.75        93
        Hate       0.81      0.69      0.75       107

    accuracy                           0.75       200
   macro avg       0.76      0.75      0.75       200
weighted avg       0.76      0.75      0.75       200



#### Paso 3: Ajustar Hiperparámetros de Modelos Individuales con Mayor Precisión

In [16]:
# Función de objetivo para Optuna con ajuste fino de hiperparámetros
def objective(trial):
    # Hiperparámetros para los modelos base, con ajustes más precisos
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 2)  # Intervalo más pequeño
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto']) if kernel_svm == 'rbf' else 'scale'

    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 30, step=2)  # Reducción de rango y paso
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2', 0.1, 0.15, 0.2])

    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 2)  # Menor rango para C en Regresión Logística

    n_neighbors_knn = trial.suggest_int('n_neighbors_knn', 8, 15)  # Enfocado en el rango óptimo
    n_estimators_gb = trial.suggest_int('n_estimators_gb', 100, 150, step=10)
    learning_rate_gb = trial.suggest_float('learning_rate_gb', 0.05, 0.15, step=0.01)  # Ajuste más fino

    # Configuración de los modelos base con los hiperparámetros ajustados
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors_knn)
    gb = GradientBoostingClassifier(n_estimators=n_estimators_gb, learning_rate=learning_rate_gb, random_state=42)
    
    # Meta-modelo
    meta_model_choice = trial.suggest_categorical('meta_model', ['LogisticRegression', 'GradientBoosting'])
    if meta_model_choice == 'LogisticRegression':
        c_meta = trial.suggest_loguniform('c_meta', 0.1, 2)  # Intervalo más específico para C
        meta_model = LogisticRegression(C=c_meta, max_iter=1000, random_state=42)
    else:
        n_estimators_meta_gb = trial.suggest_int('n_estimators_meta_gb', 50, 100, step=5)  # Ajuste fino en el meta-modelo
        learning_rate_meta_gb = trial.suggest_float('learning_rate_meta_gb', 0.05, 0.15, step=0.01)
        meta_model = GradientBoostingClassifier(n_estimators=n_estimators_meta_gb, learning_rate=learning_rate_meta_gb, random_state=42)

    # Configuración del Stacking Ensemble
    stacking_clf = StackingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg),
        ('knn', knn),
        ('gb', gb)
    ], final_estimator=meta_model, cv=3)

    # Entrenar y evaluar el ensemble
    stacking_clf.fit(x_train, y_train)
    y_pred = stacking_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados
best_params = study.best_params
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_depth=50, min_samples_leaf=3, 
                            max_features=best_params['max_features_rf'], random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)
knn = KNeighborsClassifier(n_neighbors=best_params['n_neighbors_knn'])
gb = GradientBoostingClassifier(n_estimators=best_params['n_estimators_gb'], learning_rate=best_params['learning_rate_gb'], random_state=42)

# Configuración del meta-modelo de nivel superior basado en los mejores parámetros encontrados
if best_params['meta_model'] == 'LogisticRegression':
    meta_model = LogisticRegression(C=best_params['c_meta'], max_iter=1000, random_state=42)
else:
    meta_model = GradientBoostingClassifier(n_estimators=best_params['n_estimators_meta_gb'], 
                                            learning_rate=best_params['learning_rate_meta_gb'], random_state=42)

# Crear y entrenar el modelo Stacking final
stacking_clf = StackingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg),
    ('knn', knn),
    ('gb', gb)
], final_estimator=meta_model, cv=3)

stacking_clf.fit(x_train, y_train)
y_pred = stacking_clf.predict(x_test)

# Informe de clasificación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 13:04:25,216] A new study created in memory with name: no-name-0cc9e941-0d8e-4f91-9f5c-0a334d8bc383
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 2)  # Intervalo más pequeño
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 2)  # Menor rango para C en Regresión Logística
  c_meta = trial.suggest_loguniform('c_meta', 0.1, 2)  # Intervalo más específico para C
[I 2024-11-11 13:04:27,716] Trial 0 finished with value: 0.7094777911164465 and parameters: {'c_svm': 0.21394789923669946, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estimators_rf': 22, 'max_features_rf': 'sqrt', 'c_logreg': 1.2437012919303754, 'n_neighbors_knn': 10, 'n_estimators_gb': 130, 'learning_rate_gb': 0.09, 'meta_model': 'LogisticRegression', 'c_meta': 0.6815951316572245}. Best is trial 0 with value: 0.7094777911164465.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 2)  # Intervalo más pequeño
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 2)  # Menor rango para C en Regresión Logística
[

Mejores parámetros: {'c_svm': 0.5761340270403612, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estimators_rf': 20, 'max_features_rf': 0.15, 'c_logreg': 1.0021571087489134, 'n_neighbors_knn': 12, 'n_estimators_gb': 120, 'learning_rate_gb': 0.11, 'meta_model': 'LogisticRegression', 'c_meta': 1.9935498609455058}
Mejor F1 ponderado: 0.7501500150015002
              precision    recall  f1-score   support

     No Hate       0.70      0.80      0.75        93
        Hate       0.80      0.71      0.75       107

    accuracy                           0.75       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.76      0.75      0.75       200



Stacking Ensemble optimizado:

+ Mejores parámetros:

    SVM: C=0.58, kernel='rbf', gamma='scale'

    Random Forest: n_estimators=20, max_features=0.15

    Regresión Logística (modelo base): C=1.00

    KNN: n_neighbors=12

    Gradient Boosting (modelo base): n_estimators=120, learning_rate=0.11

    Meta-modelo: Regresión Logística con C=2.00

Rendimiento del modelo optimizado:

+ Clase "No Hate":
    Precisión: 0.70
    Recall: 0.80
    F1-score: 0.75
+ Clase "Hate":
    Precisión: 0.80
    Recall: 0.71
    F1-score: 0.75
+ Exactitud General: 0.75
+ F1 ponderado: 0.750

Conclusión

    Este ajuste fino de los hiperparámetros ha logrado un rendimiento consistente con el F1 ponderado de 0.750 y una exactitud general del 75%. La optimización precisa de los parámetros clave ha mejorado la capacidad del ensemble para mantener un buen equilibrio entre precisión y recall en ambas clases, maximizando el rendimiento en este conjunto de datos.

## Guardar el modelo en Joblib

In [21]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Guardar el modelo Stacking Ensemble entrenado
joblib.dump(stacking_clf, "../models/stacking_ensemble_model.joblib")

# Configurar y ajustar el vectorizador TF-IDF con los datos de entrenamiento
tfidf_vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1, 3))
X_tfidf = tfidf_vectorizer.fit_transform(data['Cleaned_Text'])

# Guardar el vectorizador
joblib.dump(tfidf_vectorizer, "../models/tfidf_vectorizer.joblib")


['../models/tfidf_vectorizer.joblib']