# Balanceo de dataset combinando SMOTE + Tomek

In [1]:
import pandas as pd

df = pd.read_csv('../data/TA1C_dataset_detection_train_cleaned.csv')

In [2]:
# Convertir etiquetas de texto a valores numéricos
label_mapping = {'Clickbait': 1, 'No': 0}
df['Tag Value'] = df['Tag Value'].map(label_mapping)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Probar diferentes configuraciones de n-gramas y representaciones
configurations = [
    {"ngram_range": (1, 1), "representation": "tfidf"},  # Unigramas con TF-IDF
    {"ngram_range": (1, 2), "representation": "tfidf"},  # Unigramas + Bigramas con TF-IDF
    {"ngram_range": (1, 3), "representation": "tfidf"},  # Unigramas + Bigramas + Trigramas con TF-IDF
    {"ngram_range": (1, 2), "representation": "binary"},  # Unigramas + Bigramas con representación binaria
    {"ngram_range": (1, 2), "representation": "frequency"},  # Unigramas + Bigramas con frecuencia
]

for config in configurations:
    print(f"Configuración: ngram_range={config['ngram_range']}, representación={config['representation']}")
    
    # Configurar el vectorizador
    if config["representation"] == "tfidf":
        vectorizer = TfidfVectorizer(ngram_range=config["ngram_range"])
    elif config["representation"] == "binary":
        vectorizer = TfidfVectorizer(ngram_range=config["ngram_range"], binary=True)
    elif config["representation"] == "frequency":
        vectorizer = TfidfVectorizer(ngram_range=config["ngram_range"], use_idf=False)
    
    # Crear representaciones para el conjunto de entrenamiento y desarrollo
    X = vectorizer.fit_transform(df['cleaned_lemmatized_text'])
    y = df['Tag Value']
    
    # Verificar las dimensiones
    print(f"Dimensiones de X: {X.shape}")

Configuración: ngram_range=(1, 1), representación=tfidf
Dimensiones de X: (2800, 10591)
Configuración: ngram_range=(1, 2), representación=tfidf
Dimensiones de X: (2800, 47819)
Configuración: ngram_range=(1, 3), representación=tfidf
Dimensiones de X: (2800, 98988)
Configuración: ngram_range=(1, 2), representación=binary
Dimensiones de X: (2800, 47819)
Configuración: ngram_range=(1, 2), representación=frequency
Dimensiones de X: (2800, 47819)


## Balancear dataset

In [4]:
from imblearn.combine import SMOTETomek

smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

## Dividir dataset en training y dev

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_dev, y_train, y_dev = train_test_split(
    X_resampled, y_resampled, test_size=0.25, random_state=0, stratify=y_resampled, shuffle=True
)

In [6]:
from sklearn.decomposition import TruncatedSVD

# Reducir dimensionalidad con TruncatedSVD
svd = TruncatedSVD(n_components=100, random_state=0)  # Reducir a 100 dimensiones
X_train_reduced = svd.fit_transform(X_train)
X_dev_reduced = svd.transform(X_dev)

# Verificar las dimensiones después de la reducción
print(f"Dimensiones reducidas de X_train: {X_train_reduced.shape}")
print(f"Dimensiones reducidas de X_dev: {X_dev_reduced.shape}")

Dimensiones reducidas de X_train: (3003, 100)
Dimensiones reducidas de X_dev: (1001, 100)


## Entrenar usando random forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

# Definir el modelo y los hiperparámetros a probar
rf = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Configurar validación cruzada y búsqueda de hiperparámetros
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring=make_scorer(f1_score, average='macro'),
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Ejecutar búsqueda
grid_search.fit(X_train_reduced, y_train)

# Imprimir mejores resultados
print("Mejor configuración encontrada:")
print(grid_search.best_params_)
print(f"Mejor F1-macro: {grid_search.best_score_:.4f}")

print("\nResultados detallados por configuración:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print(f"{param} -> F1-macro: {mean:.4f} (+/-{std:.4f})")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_de

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Definir el modelo de Random Forest
rf_model = RandomForestClassifier(max_depth=30, min_samples_leaf=1, min_samples_split=2,n_estimators=100, random_state=0)

# Evaluar el modelo con validación cruzada usando las representaciones reducidas (X_train_reduced)
rf_scores = cross_val_score(rf_model, X_train_reduced, y_train, cv=5, scoring='f1_macro')

# Imprimir los resultados
print("Random Forest:")
print(f"F1-macro scores por fold: {rf_scores}")
print(f"F1-macro promedio: {rf_scores.mean():.4f}")

Random Forest:
F1-macro scores por fold: [0.87341758 0.86991342 0.85521561 0.82984686 0.84315907]
F1-macro promedio: 0.8543


In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Entrenar el modelo en todo el conjunto de entrenamiento
rf_model.fit(X_train_reduced, y_train)

# Predecir sobre el conjunto de validación/desarrollo
y_pred = rf_model.predict(X_dev_reduced)
y_true = y_dev

# Imprimir el reporte de clasificación
print("Reporte de clasificación:")
print(classification_report(y_true, y_pred, digits=4))

# Imprimir la matriz de confusión
print("Matriz de confusión:")
print(confusion_matrix(y_true, y_pred))

Reporte de clasificación:
              precision    recall  f1-score   support

           0     0.7816    0.9501    0.8577       501
           1     0.9362    0.7340    0.8229       500

    accuracy                         0.8422      1001
   macro avg     0.8589    0.8420    0.8403      1001
weighted avg     0.8588    0.8422    0.8403      1001

Matriz de confusión:
[[476  25]
 [133 367]]


## Usando MLP

### Pruebas usando GridSearch

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

# Definir el modelo y los hiperparámetros a probar
mlp = MLPClassifier(random_state=0, max_iter=2000)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150,)],
    'alpha': [0.0001, 0.001, 0.01],
    'solver': ['adam', 'lbfgs'],
}

# Configurar validación cruzada y búsqueda de hiperparámetros
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(
    mlp,
    param_grid,
    scoring=make_scorer(f1_score, average='macro'),
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Ejecutar búsqueda
grid_search.fit(X_train_reduced, y_train)

# Imprimir mejores resultados
print("Mejor configuración encontrada:")
print(grid_search.best_params_)
print(f"Mejor F1-macro: {grid_search.best_score_:.4f}")

print("\nResultados detallados por configuración:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print(f"{param} -> F1-macro: {mean:.4f} (+/-{std:.4f})")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=  10.6s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=  10.9s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=  12.8s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=  13.2s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   3.6s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   5.8s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   4.0s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   6.4s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   6.1s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=  14.9s
[CV] END alpha=0.0001, hidden_layer_sizes=(100,), solver=adam; total time=  13.6s
[CV] END alpha=0.0001, hidden_layer_sizes

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  16.9s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  17.6s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  17.0s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=   5.0s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  18.2s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  10.3s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  10.5s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  16.9s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  15.6s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  12.0s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  37.1s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  38.8s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  34.8s
[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  33.8s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=   4.5s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=  11.2s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=   9.2s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=  10.5s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=   8.8s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  33.7s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  27.1s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  33.4s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  39.5s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  29.1s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=   9.6s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=   9.5s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  29.2s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=  10.7s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=  16.3s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=  17.1s
[CV] END alpha=0.01, hidden_layer_sizes=(150,), solver=lbfgs; total time=  36.4s
[CV] END alph

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(150,), solver=lbfgs; total time=  51.7s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(150,), solver=lbfgs; total time=  25.6s
Mejor configuración encontrada:
{'alpha': 0.001, 'hidden_layer_sizes': (50,), 'solver': 'adam'}
Mejor F1-macro: 0.8708

Resultados detallados por configuración:
{'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'solver': 'adam'} -> F1-macro: 0.8694 (+/-0.0134)
{'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'solver': 'lbfgs'} -> F1-macro: 0.8197 (+/-0.0139)
{'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'solver': 'adam'} -> F1-macro: 0.8651 (+/-0.0137)
{'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'solver': 'lbfgs'} -> F1-macro: 0.8317 (+/-0.0088)
{'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'solver': 'adam'} -> F1-macro: 0.8634 (+/-0.0139)
{'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'solver': 'lbfgs'} -> F1-macro: 0.8334 (+/-0.0086)
{'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'solver': 'adam'} -> F1-macro: 0.8694 (+/-0.0104)
{'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'solver': 'lbfgs'

### Usando los mejores valores de gridsearch

In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

# Definir el modelo de MLP
mlp_model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=2000, alpha=0.001, solver='adam', random_state=0)

# Evaluar el modelo con validación cruzada usando las representaciones reducidas (X_train_reduced)
mlp_scores = cross_val_score(mlp_model, X_train_reduced, y_train, cv=5, scoring='f1_macro')

# Imprimir los resultados
print("Multi-layer Perceptron (MLPClassifier):")
print(f"F1-macro scores por fold: {mlp_scores}")
print(f"F1-macro promedio: {mlp_scores.mean():.4f}")

Multi-layer Perceptron (MLPClassifier):
F1-macro scores por fold: [0.88685242 0.87511948 0.87685517 0.86997688 0.8649366 ]
F1-macro promedio: 0.8747


In [8]:
from sklearn.metrics import classification_report, confusion_matrix

# Entrenar el modelo en todo el conjunto de entrenamiento
mlp_model.fit(X_train_reduced, y_train)

# Predecir sobre el conjunto de validación/desarrollo
y_pred = mlp_model.predict(X_dev_reduced)
y_true = y_dev

# Imprimir el reporte de clasificación
print("Reporte de clasificación:")
print(classification_report(y_true, y_pred, digits=4))

# Imprimir la matriz de confusión
print("Matriz de confusión:")
print(confusion_matrix(y_true, y_pred))

Reporte de clasificación:
              precision    recall  f1-score   support

           0     0.8485    0.9281    0.8866       501
           1     0.9205    0.8340    0.8751       500

    accuracy                         0.8811      1001
   macro avg     0.8845    0.8811    0.8808      1001
weighted avg     0.8845    0.8811    0.8809      1001

Matriz de confusión:
[[465  36]
 [ 83 417]]


## Guardar modelo

In [9]:
import joblib

joblib.dump(mlp_model, '../model/mlp_model.joblib')

joblib.dump(vectorizer, '../model/vectorizer.joblib')
joblib.dump(svd, '../model/svd.joblib')

['../model/svd.joblib']

## Pruebas con el dataset de dev

In [3]:
# Cargar el conjunto de desarrollo
def_dev = pd.read_csv('../data/TA1C_dataset_detection_dev.csv')


In [6]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

# Preprocesar el texto
from preprocessing import normalize_text

def_dev["cleaned_lemmatized_text"] = def_dev['Teaser Text'].apply(lambda x: normalize_text(x, mode="text_cleaning")).apply(lambda x: normalize_text(x, mode="lemmatization"))
