# Ensemble

In [1]:
import pandas as pd 

# Cargar el nuevo archivo CSV limpio proporcionado por el usuario
data = pd.read_csv('../data/cleaned_data.csv')

# Verificar la estructura del archivo para asegurar que las columnas necesarias estén presentes
data.head()

Unnamed: 0,Cleaned_Text,Any_Hate,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,people would take step back make case anyone e...,False,False,False,False,False,False,False,False,False,False,False,False,False
1,law enforcement trained shoot apprehend traine...,True,True,True,False,False,False,False,False,False,False,False,False,False
2,dont reckon life matter banner held white cunt...,True,True,True,False,False,True,False,False,False,False,False,False,False
3,large number people like police officer called...,False,False,False,False,False,False,False,False,False,False,False,False,False
4,arab dude absolutely right shot 6 extra time s...,False,False,False,False,False,False,False,False,False,False,False,False,False


## Voting Classifier (Ensemble de Votación)

In [2]:
import optuna
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorización con TF-IDF
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 3))
x_tfidf = tfidf.fit_transform(data['Cleaned_Text'])
y = data['Any_Hate']

# Dividir en conjunto de entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

# Función de objetivo para Optuna
def objective(trial):
    # Optimización de hiperparámetros del modelo SVM
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    if kernel_svm == 'rbf':
        gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto'])
    else:
        gamma_svm = 'scale'  # predeterminado cuando no se usa 'rbf'

    # Optimización de hiperparámetros para Random Forest
    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 50, step=5)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2'])

    # Optimización de hiperparámetros para Regresión Logística
    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)

    # Configuración de modelos base con los hiperparámetros ajustados
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    
    # Ajuste de pesos para el Voting Classifier
    weight_svm = trial.suggest_float('weight_svm', 0.1, 1.0)
    weight_rf = trial.suggest_float('weight_rf', 0.1, 1.0)
    weight_logreg = trial.suggest_float('weight_logreg', 0.1, 1.0)
    
    # Crear el ensemble con VotingClassifier
    voting_clf = VotingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg)
    ], voting='soft', weights=[weight_svm, weight_rf, weight_logreg])
    
    # Entrenar y evaluar el ensemble
    voting_clf.fit(x_train, y_train)
    y_pred = voting_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados
best_params = study.best_params
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_depth=50, min_samples_leaf=3, 
                            max_features=best_params['max_features_rf'], random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)

# Crear el Voting Classifier final
voting_clf = VotingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg)
], voting='soft', weights=[best_params['weight_svm'], best_params['weight_rf'], best_params['weight_logreg']])

voting_clf.fit(x_train, y_train)
y_pred = voting_clf.predict(x_test)

# Informe de clasificación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 17:30:24,915] A new study created in memory with name: no-name-116ea02d-0235-44f2-9437-6c8ff4cc4a9a
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:30:25,176] Trial 0 finished with value: 0.7017737272155878 and parameters: {'c_svm': 0.49651769730805356, 'kernel_svm': 'linear', 'n_estimators_rf': 50, 'max_features_rf': 'log2', 'c_logreg': 0.5777810769335261, 'weight_svm': 0.5759355475044479, 'weight_rf': 0.2355441543497304, 'weight_logreg': 0.7981429319088901}. Best is trial 0 with value: 0.7017737272155878.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:30:25,423] Trial 1 finished with value: 0.6901755846514795 and parameters: {'c_svm': 4.1392133440208925, 'kernel_svm': 'linear', 'n_estimators_rf': 20, 'max_features_rf': 'log2', 'c_logreg': 0.21256612174021006, 'weight_svm': 0.6623211848741493, 'weight_rf': 0.35

Mejores parámetros: {'c_svm': 0.542490915984282, 'kernel_svm': 'rbf', 'gamma_svm': 'auto', 'n_estimators_rf': 30, 'max_features_rf': 'sqrt', 'c_logreg': 2.4277841263066318, 'weight_svm': 0.48756306679019423, 'weight_rf': 0.29285584915374074, 'weight_logreg': 0.6929499161084561}
Mejor F1 ponderado: 0.7403122810529476
              precision    recall  f1-score   support

     No Hate       0.70      0.76      0.73        93
        Hate       0.78      0.72      0.75       107

    accuracy                           0.74       200
   macro avg       0.74      0.74      0.74       200
weighted avg       0.74      0.74      0.74       200



## Stacking Ensemble (Ensemble de Stacking)

In [3]:
import optuna
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorización con TF-IDF
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 3))
x_tfidf = tfidf.fit_transform(data['Cleaned_Text'])
y = data['Any_Hate']

# Dividir en conjunto de entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

# Función de objetivo para Optuna
def objective(trial):
    # Optimización de hiperparámetros de los modelos base
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    if kernel_svm == 'rbf':
        gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto'])
    else:
        gamma_svm = 'scale'  # predeterminado para 'linear'

    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 50, step=5)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2', 0.1, 0.2, 0.5])

    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)

    # Configuración de los modelos base con los hiperparámetros ajustados
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    
    # Selección del meta-modelo de nivel superior
    meta_model_choice = trial.suggest_categorical('meta_model', ['LogisticRegression', 'MultinomialNB'])
    if meta_model_choice == 'LogisticRegression':
        c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
        meta_model = LogisticRegression(C=c_meta, max_iter=1000, random_state=42)
    else:
        alpha_meta = trial.suggest_float('alpha_meta', 0.1, 1.0)
        meta_model = MultinomialNB(alpha=alpha_meta)

    # Configuración del Stacking Ensemble
    stacking_clf = StackingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg)
    ], final_estimator=meta_model, cv=3)

    # Entrenar y evaluar el ensemble
    stacking_clf.fit(x_train, y_train)
    y_pred = stacking_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados
best_params = study.best_params
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_depth=50, min_samples_leaf=3, 
                            max_features=best_params['max_features_rf'], random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)

# Configuración del meta-modelo de nivel superior basado en los mejores parámetros encontrados
if best_params['meta_model'] == 'LogisticRegression':
    meta_model = LogisticRegression(C=best_params['c_meta'], max_iter=1000, random_state=42)
else:
    meta_model = MultinomialNB(alpha=best_params['alpha_meta'])

# Crear y entrenar el modelo Stacking final
stacking_clf = StackingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg)
], final_estimator=meta_model, cv=3)

stacking_clf.fit(x_train, y_train)
y_pred = stacking_clf.predict(x_test)

# Informe de clasificación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 17:30:38,838] A new study created in memory with name: no-name-8366138f-bea2-4016-95a8-4b013ab440e0
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:30:39,589] Trial 0 finished with value: 0.2951877133105802 and parameters: {'c_svm': 1.2126835263164495, 'kernel_svm': 'linear', 'n_estimators_rf': 20, 'max_features_rf': 0.1, 'c_logreg': 0.9355925645438166, 'meta_model': 'MultinomialNB', 'alpha_meta': 0.6274733815469592}. Best is trial 0 with value: 0.2951877133105802.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:30:40,322] Trial 1 finished with value: 0.2951877133105802 and parameters: {'c_svm': 0.15994035167220538, 'kernel_svm': 'rbf', 'gamma_svm': 'auto', 'n_estimators_rf': 35, 'max_features_rf': 'sqrt', 'c_logreg': 0.47470234701203273, 'meta_model': 'MultinomialNB', 'alpha_meta': 0.32469326066106874}. Best is 

Mejores parámetros: {'c_svm': 0.1064415044885236, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estimators_rf': 40, 'max_features_rf': 0.1, 'c_logreg': 0.32870443877334266, 'meta_model': 'LogisticRegression', 'c_meta': 6.667096841789085}
Mejor F1 ponderado: 0.76
              precision    recall  f1-score   support

     No Hate       0.71      0.82      0.76        93
        Hate       0.82      0.71      0.76       107

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.77      0.76      0.76       200



### Mejoras

#### Paso 1: Incluir otros modelos base y probar diferentes meta-modelos

Vamos a añadir Gradient Boosting y K-Nearest Neighbors (KNN) como modelos base. También probaremos con árboles de decisión y Gradient Boosting como meta-modelo.

In [4]:
import optuna
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorización con TF-IDF
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 3))
x_tfidf = tfidf.fit_transform(data['Cleaned_Text'])
y = data['Any_Hate']

# Dividir en conjunto de entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

# Función de objetivo para Optuna
def objective(trial):
    # Hiperparámetros para los modelos base
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto']) if kernel_svm == 'rbf' else 'scale'

    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 50, step=5)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2', 0.1, 0.2, 0.5])
    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
    
    n_neighbors_knn = trial.suggest_int('n_neighbors_knn', 3, 15)
    n_estimators_gb = trial.suggest_int('n_estimators_gb', 50, 150, step=10)
    learning_rate_gb = trial.suggest_float('learning_rate_gb', 0.01, 0.3)

    # Modelos base
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors_knn)
    gb = GradientBoostingClassifier(n_estimators=n_estimators_gb, learning_rate=learning_rate_gb, random_state=42)
    
    # Meta-modelo
    meta_model_choice = trial.suggest_categorical('meta_model', ['LogisticRegression', 'DecisionTree', 'GradientBoosting'])
    if meta_model_choice == 'LogisticRegression':
        c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
        meta_model = LogisticRegression(C=c_meta, max_iter=1000, random_state=42)
    elif meta_model_choice == 'DecisionTree':
        max_depth_tree = trial.suggest_int('max_depth_tree', 3, 10)
        meta_model = DecisionTreeClassifier(max_depth=max_depth_tree, random_state=42)
    else:
        n_estimators_meta_gb = trial.suggest_int('n_estimators_meta_gb', 50, 100, step=10)
        learning_rate_meta_gb = trial.suggest_float('learning_rate_meta_gb', 0.01, 0.3)
        meta_model = GradientBoostingClassifier(n_estimators=n_estimators_meta_gb, learning_rate=learning_rate_meta_gb, random_state=42)

    # Configuración del Stacking Ensemble
    stacking_clf = StackingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg),
        ('knn', knn),
        ('gb', gb)
    ], final_estimator=meta_model, cv=3)

    # Entrenar y evaluar el ensemble
    stacking_clf.fit(x_train, y_train)
    y_pred = stacking_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados
best_params = study.best_params
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_depth=50, min_samples_leaf=3, 
                            max_features=best_params['max_features_rf'], random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)
knn = KNeighborsClassifier(n_neighbors=best_params['n_neighbors_knn'])
gb = GradientBoostingClassifier(n_estimators=best_params['n_estimators_gb'], learning_rate=best_params['learning_rate_gb'], random_state=42)

# Configuración del meta-modelo de nivel superior basado en los mejores parámetros encontrados
if best_params['meta_model'] == 'LogisticRegression':
    meta_model = LogisticRegression(C=best_params['c_meta'], max_iter=1000, random_state=42)
elif best_params['meta_model'] == 'DecisionTree':
    meta_model = DecisionTreeClassifier(max_depth=best_params['max_depth_tree'], random_state=42)
else:
    meta_model = GradientBoostingClassifier(n_estimators=best_params['n_estimators_meta_gb'], 
                                            learning_rate=best_params['learning_rate_meta_gb'], random_state=42)

# Crear y entrenar el modelo Stacking final
stacking_clf = StackingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg),
    ('knn', knn),
    ('gb', gb)
], final_estimator=meta_model, cv=3)

stacking_clf.fit(x_train, y_train)
y_pred = stacking_clf.predict(x_test)

# Informe de clasificación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 17:31:27,009] A new study created in memory with name: no-name-cc920844-2e81-4e81-82f2-f46acbe49064
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:31:29,232] Trial 0 finished with value: 0.7287974491345278 and parameters: {'c_svm': 0.49823158346769425, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estimators_rf': 15, 'max_features_rf': 'sqrt', 'c_logreg': 0.7149771069955464, 'n_neighbors_knn': 13, 'n_estimators_gb': 120, 'learning_rate_gb': 0.2662985253335246, 'meta_model': 'DecisionTree', 'max_depth_tree': 4}. Best is trial 0 with value: 0.7287974491345278.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
  c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
[I 2024-11-11 17:31:31,448] Trial 1 finished with value: 0.7097679767976797 and parameters: {'c_svm': 8.351781182963848, 'kernel_svm': 'rbf', 'gamma_svm': 'scale', 'n_estim

Mejores parámetros: {'c_svm': 0.36847252928964835, 'kernel_svm': 'linear', 'n_estimators_rf': 15, 'max_features_rf': 0.5, 'c_logreg': 2.846469548328978, 'n_neighbors_knn': 12, 'n_estimators_gb': 50, 'learning_rate_gb': 0.04018631242007435, 'meta_model': 'DecisionTree', 'max_depth_tree': 3}
Mejor F1 ponderado: 0.7443888594565958
              precision    recall  f1-score   support

     No Hate       0.74      0.70      0.72        93
        Hate       0.75      0.79      0.77       107

    accuracy                           0.74       200
   macro avg       0.74      0.74      0.74       200
weighted avg       0.74      0.74      0.74       200



#### Paso 2: Ajustar el Número de Folds en la Validación Cruzada (cv)

In [5]:
# Función de objetivo para Optuna (ajustando 'cv' en el Stacking Ensemble)
def objective(trial):
    # Hiperparámetros para los modelos base
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto']) if kernel_svm == 'rbf' else 'scale'

    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 50, step=5)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2', 0.1, 0.2, 0.5])
    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
    
    n_neighbors_knn = trial.suggest_int('n_neighbors_knn', 3, 15)
    n_estimators_gb = trial.suggest_int('n_estimators_gb', 50, 150, step=10)
    learning_rate_gb = trial.suggest_float('learning_rate_gb', 0.01, 0.3)

    # Modelos base
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors_knn)
    gb = GradientBoostingClassifier(n_estimators=n_estimators_gb, learning_rate=learning_rate_gb, random_state=42)
    
    # Meta-modelo
    meta_model_choice = trial.suggest_categorical('meta_model', ['LogisticRegression', 'DecisionTree', 'GradientBoosting'])
    if meta_model_choice == 'LogisticRegression':
        c_meta = trial.suggest_loguniform('c_meta', 0.1, 10)
        meta_model = LogisticRegression(C=c_meta, max_iter=1000, random_state=42)
    elif meta_model_choice == 'DecisionTree':
        max_depth_tree = trial.suggest_int('max_depth_tree', 3, 10)
        meta_model = DecisionTreeClassifier(max_depth=max_depth_tree, random_state=42)
    else:
        n_estimators_meta_gb = trial.suggest_int('n_estimators_meta_gb', 50, 100, step=10)
        learning_rate_meta_gb = trial.suggest_float('learning_rate_meta_gb', 0.01, 0.3)
        meta_model = GradientBoostingClassifier(n_estimators=n_estimators_meta_gb, learning_rate=learning_rate_meta_gb, random_state=42)

    # Optimización de 'cv'
    cv_folds = trial.suggest_categorical('cv_folds', [3, 5, 10])

    # Configuración del Stacking Ensemble con el número de folds optimizado
    stacking_clf = StackingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg),
        ('knn', knn),
        ('gb', gb)
    ], final_estimator=meta_model, cv=cv_folds)

    # Entrenar y evaluar el ensemble
    stacking_clf.fit(x_train, y_train)
    y_pred = stacking_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados, incluyendo 'cv_folds'
cv_folds = study.best_params['cv_folds']
stacking_clf = StackingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg),
    ('knn', knn),
    ('gb', gb)
], final_estimator=meta_model, cv=cv_folds)

stacking_clf.fit(x_train, y_train)
y_pred = stacking_clf.predict(x_test)

# Informe de clasificación final
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 17:33:19,115] A new study created in memory with name: no-name-c757df0d-466d-495d-8ef9-96016a09e630
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:33:26,246] Trial 0 finished with value: 0.6193157262905161 and parameters: {'c_svm': 2.376349762258961, 'kernel_svm': 'linear', 'n_estimators_rf': 10, 'max_features_rf': 'log2', 'c_logreg': 1.1441389144563758, 'n_neighbors_knn': 5, 'n_estimators_gb': 130, 'learning_rate_gb': 0.10800643262951973, 'meta_model': 'DecisionTree', 'max_depth_tree': 7, 'cv_folds': 10}. Best is trial 0 with value: 0.6193157262905161.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:33:29,589] Trial 1 finished with value: 0.6942751719824891 and parameters: {'c_svm': 1.0450040571707873, 'kernel_svm': 'linear', 'n_estimators_rf': 10, 'max_features_rf': 'log2', 'c_logreg': 4.331716297189893, 'n_ne

Mejores parámetros: {'c_svm': 0.32291704553291, 'kernel_svm': 'rbf', 'gamma_svm': 'auto', 'n_estimators_rf': 40, 'max_features_rf': 0.2, 'c_logreg': 0.20202752320235276, 'n_neighbors_knn': 9, 'n_estimators_gb': 90, 'learning_rate_gb': 0.04021280406410663, 'meta_model': 'LogisticRegression', 'c_meta': 0.8057400606836033, 'cv_folds': 3}
Mejor F1 ponderado: 0.7550796269906748
              precision    recall  f1-score   support

     No Hate       0.74      0.70      0.72        93
        Hate       0.75      0.79      0.77       107

    accuracy                           0.74       200
   macro avg       0.74      0.74      0.74       200
weighted avg       0.74      0.74      0.74       200



#### Paso 3: Ajustar Hiperparámetros de Modelos Individuales con Mayor Precisión

In [6]:
# Función de objetivo para Optuna con ajuste fino de hiperparámetros
def objective(trial):
    # Hiperparámetros para los modelos base, con ajustes más precisos
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 2)  # Intervalo más pequeño
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    gamma_svm = trial.suggest_categorical('gamma_svm', ['scale', 'auto']) if kernel_svm == 'rbf' else 'scale'

    n_estimators_rf = trial.suggest_int('n_estimators_rf', 10, 30, step=2)  # Reducción de rango y paso
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2', 0.1, 0.15, 0.2])

    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 2)  # Menor rango para C en Regresión Logística

    n_neighbors_knn = trial.suggest_int('n_neighbors_knn', 8, 15)  # Enfocado en el rango óptimo
    n_estimators_gb = trial.suggest_int('n_estimators_gb', 100, 150, step=10)
    learning_rate_gb = trial.suggest_float('learning_rate_gb', 0.05, 0.15, step=0.01)  # Ajuste más fino

    # Configuración de los modelos base con los hiperparámetros ajustados
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True, random_state=42)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=50, min_samples_leaf=3, 
                                max_features=max_features_rf, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors_knn)
    gb = GradientBoostingClassifier(n_estimators=n_estimators_gb, learning_rate=learning_rate_gb, random_state=42)
    
    # Meta-modelo
    meta_model_choice = trial.suggest_categorical('meta_model', ['LogisticRegression', 'GradientBoosting'])
    if meta_model_choice == 'LogisticRegression':
        c_meta = trial.suggest_loguniform('c_meta', 0.1, 2)  # Intervalo más específico para C
        meta_model = LogisticRegression(C=c_meta, max_iter=1000, random_state=42)
    else:
        n_estimators_meta_gb = trial.suggest_int('n_estimators_meta_gb', 50, 100, step=5)  # Ajuste fino en el meta-modelo
        learning_rate_meta_gb = trial.suggest_float('learning_rate_meta_gb', 0.05, 0.15, step=0.01)
        meta_model = GradientBoostingClassifier(n_estimators=n_estimators_meta_gb, learning_rate=learning_rate_meta_gb, random_state=42)

    # Configuración del Stacking Ensemble
    stacking_clf = StackingClassifier(estimators=[
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg),
        ('knn', knn),
        ('gb', gb)
    ], final_estimator=meta_model, cv=3)

    # Entrenar y evaluar el ensemble
    stacking_clf.fit(x_train, y_train)
    y_pred = stacking_clf.predict(x_test)
    score = f1_score(y_test, y_pred, average='weighted')
    return score

# Crear el estudio de Optuna y optimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Resultados de Optuna
print("Mejores parámetros:", study.best_params)
print("Mejor F1 ponderado:", study.best_value)

# Entrenamiento final con los mejores parámetros encontrados
best_params = study.best_params
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_depth=50, min_samples_leaf=3, 
                            max_features=best_params['max_features_rf'], random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)
knn = KNeighborsClassifier(n_neighbors=best_params['n_neighbors_knn'])
gb = GradientBoostingClassifier(n_estimators=best_params['n_estimators_gb'], learning_rate=best_params['learning_rate_gb'], random_state=42)

# Configuración del meta-modelo de nivel superior basado en los mejores parámetros encontrados
if best_params['meta_model'] == 'LogisticRegression':
    meta_model = LogisticRegression(C=best_params['c_meta'], max_iter=1000, random_state=42)
else:
    meta_model = GradientBoostingClassifier(n_estimators=best_params['n_estimators_meta_gb'], 
                                            learning_rate=best_params['learning_rate_meta_gb'], random_state=42)

# Crear y entrenar el modelo Stacking final
stacking_clf = StackingClassifier(estimators=[
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg),
    ('knn', knn),
    ('gb', gb)
], final_estimator=meta_model, cv=3)

stacking_clf.fit(x_train, y_train)
y_pred = stacking_clf.predict(x_test)

# Informe de clasificación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 17:36:45,873] A new study created in memory with name: no-name-4f2ab053-e2c4-4e3b-8ccb-a24844be7a9a
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 2)  # Intervalo más pequeño
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 2)  # Menor rango para C en Regresión Logística
[I 2024-11-11 17:36:48,840] Trial 0 finished with value: 0.74 and parameters: {'c_svm': 1.2882848567251346, 'kernel_svm': 'linear', 'n_estimators_rf': 10, 'max_features_rf': 0.2, 'c_logreg': 0.1933127377748801, 'n_neighbors_knn': 8, 'n_estimators_gb': 140, 'learning_rate_gb': 0.05, 'meta_model': 'GradientBoosting', 'n_estimators_meta_gb': 100, 'learning_rate_meta_gb': 0.15}. Best is trial 0 with value: 0.74.
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 2)  # Intervalo más pequeño
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 2)  # Menor rango para C en Regresión Logística
[I 2024-11-11 17:36:51,445] Trial 1 finished with value: 0.6942751719824891 and parameters: {'c_svm': 0.8523442437

Mejores parámetros: {'c_svm': 1.4572160504209182, 'kernel_svm': 'linear', 'n_estimators_rf': 14, 'max_features_rf': 0.15, 'c_logreg': 1.1676242694983106, 'n_neighbors_knn': 9, 'n_estimators_gb': 120, 'learning_rate_gb': 0.09, 'meta_model': 'GradientBoosting', 'n_estimators_meta_gb': 70, 'learning_rate_meta_gb': 0.12000000000000001}
Mejor F1 ponderado: 0.7602400960384154
              precision    recall  f1-score   support

     No Hate       0.72      0.80      0.76        93
        Hate       0.80      0.73      0.76       107

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200



#### Opción con MultinomialNB

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import optuna

# Definir la función objetivo de Optuna para optimizar el ensemble
def objective(trial):
    # Hiperparámetros para cada modelo
    alpha_nb = trial.suggest_loguniform('alpha_nb', 1e-3, 1e1)  # Ajuste de suavización para MultinomialNB
    c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
    kernel_svm = trial.suggest_categorical('kernel_svm', ['linear', 'rbf'])
    gamma_svm = 'auto' if kernel_svm == 'rbf' else 'scale'
    n_estimators_rf = trial.suggest_int('n_estimators_rf', 50, 150, step=10)
    max_features_rf = trial.suggest_categorical('max_features_rf', ['sqrt', 'log2'])
    c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)

    # Instancias de modelos con hiperparámetros ajustados
    nb = MultinomialNB(alpha=alpha_nb)
    svm = SVC(C=c_svm, kernel=kernel_svm, gamma=gamma_svm, probability=True)
    rf = RandomForestClassifier(n_estimators=n_estimators_rf, max_features=max_features_rf, max_depth=20, random_state=42)
    logreg = LogisticRegression(C=c_logreg, max_iter=1000, random_state=42)
    
    # Ajuste de pesos en el ensemble
    weight_nb = trial.suggest_float('weight_nb', 0.1, 1.0)
    weight_svm = trial.suggest_float('weight_svm', 0.1, 1.0)
    weight_rf = trial.suggest_float('weight_rf', 0.1, 1.0)
    weight_logreg = trial.suggest_float('weight_logreg', 0.1, 1.0)
    
    # Creación del ensemble con VotingClassifier
    voting_clf = VotingClassifier(estimators=[
        ('nb', nb),
        ('svm', svm),
        ('rf', rf),
        ('logreg', logreg)],
        voting='soft', weights=[weight_nb, weight_svm, weight_rf, weight_logreg])
    
    # Validación cruzada para evaluar el desempeño del ensemble
    score = cross_val_score(voting_clf, x_train, y_train, cv=5, scoring='accuracy').mean()
    return score

# Optimización con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  # Aumentar el número de pruebas según el tiempo disponible

# Entrenar el ensemble final con los mejores hiperparámetros encontrados
best_params = study.best_params
print("Mejores parámetros:", best_params)

# Instanciar modelos con los mejores parámetros para entrenar el modelo final
nb = MultinomialNB(alpha=best_params['alpha_nb'])
svm = SVC(C=best_params['c_svm'], kernel=best_params['kernel_svm'], gamma=best_params.get('gamma_svm', 'scale'), probability=True)
rf = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'], max_features=best_params['max_features_rf'], max_depth=20, random_state=42)
logreg = LogisticRegression(C=best_params['c_logreg'], max_iter=1000, random_state=42)

# VotingClassifier con los mejores pesos
voting_clf = VotingClassifier(estimators=[
    ('nb', nb),
    ('svm', svm),
    ('rf', rf),
    ('logreg', logreg)],
    voting='soft', weights=[best_params['weight_nb'], best_params['weight_svm'], best_params['weight_rf'], best_params['weight_logreg']])

# Entrenar el ensemble final
voting_clf.fit(x_train, y_train)
y_pred = voting_clf.predict(x_test)

# Evaluación final
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['No Hate', 'Hate']))


[I 2024-11-11 17:38:52,293] A new study created in memory with name: no-name-a626ab5f-aea3-4171-9b2c-0fdaf330b91b
  alpha_nb = trial.suggest_loguniform('alpha_nb', 1e-3, 1e1)  # Ajuste de suavización para MultinomialNB
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:38:54,198] Trial 0 finished with value: 0.7025 and parameters: {'alpha_nb': 0.6928793155311488, 'c_svm': 1.4924911135631513, 'kernel_svm': 'linear', 'n_estimators_rf': 150, 'max_features_rf': 'sqrt', 'c_logreg': 0.15859029642422995, 'weight_nb': 0.16951312465195062, 'weight_svm': 0.9922788373585463, 'weight_rf': 0.2692705226772004, 'weight_logreg': 0.2225877779889596}. Best is trial 0 with value: 0.7025.
  alpha_nb = trial.suggest_loguniform('alpha_nb', 1e-3, 1e1)  # Ajuste de suavización para MultinomialNB
  c_svm = trial.suggest_loguniform('c_svm', 0.1, 10)
  c_logreg = trial.suggest_loguniform('c_logreg', 0.1, 10)
[I 2024-11-11 17:38:55,471]

Mejores parámetros: {'alpha_nb': 8.724775067692846, 'c_svm': 1.3447370609913554, 'kernel_svm': 'rbf', 'n_estimators_rf': 110, 'max_features_rf': 'sqrt', 'c_logreg': 7.769671437939925, 'weight_nb': 0.1310214747761662, 'weight_svm': 0.24383860327824347, 'weight_rf': 0.5204970572817637, 'weight_logreg': 0.41946865670001593}
              precision    recall  f1-score   support

     No Hate       0.65      0.83      0.73        93
        Hate       0.80      0.61      0.69       107

    accuracy                           0.71       200
   macro avg       0.72      0.72      0.71       200
weighted avg       0.73      0.71      0.71       200



## Guardar el modelo en Joblib

In [8]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Guardar el modelo Stacking Ensemble entrenado
joblib.dump(stacking_clf, "../models/stacking_ensemble_model.joblib")

# Configurar y ajustar el vectorizador TF-IDF con los datos de entrenamiento
tfidf_vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1, 3))
X_tfidf = tfidf_vectorizer.fit_transform(data['Cleaned_Text'])

# Guardar el vectorizador
joblib.dump(tfidf_vectorizer, "../models/tfidf_vectorizer.joblib")


['../models/tfidf_vectorizer.joblib']