# Entranamiento de modelos usando tokenizacion

In [1]:
import pandas as pd

# Cargar los conjuntos de entrenamiento y desarrollo
train_df = pd.read_csv("../data/TA1C_dataset_detection_train_split.csv")
dev_df = pd.read_csv("../data/TA1C_dataset_detection_dev_split.csv")

# Verificar las columnas disponibles
print(train_df.columns)

Index(['Tweet ID', 'Teaser Text', 'Tag Value', 'tokenized_text',
       'cleaned_text', 'no_stopwords_text', 'lemmatized_text',
       'tokenized_cleaned_text', 'tokenized_cleaned_text_no_stopwords',
       'tokenized_cleaned_text_no_stopwords_lemmatized'],
      dtype='object')


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Probar diferentes configuraciones de n-gramas y representaciones
configurations = [
    {"ngram_range": (1, 1), "representation": "tfidf"},  # Unigramas con TF-IDF
    {"ngram_range": (1, 2), "representation": "tfidf"},  # Unigramas + Bigramas con TF-IDF
    {"ngram_range": (1, 3), "representation": "tfidf"},  # Unigramas + Bigramas + Trigramas con TF-IDF
    {"ngram_range": (1, 2), "representation": "binary"},  # Unigramas + Bigramas con representación binaria
    {"ngram_range": (1, 2), "representation": "frequency"},  # Unigramas + Bigramas con frecuencia
]

for config in configurations:
    print(f"Configuración: ngram_range={config['ngram_range']}, representación={config['representation']}")
    
    # Configurar el vectorizador
    if config["representation"] == "tfidf":
        vectorizer = TfidfVectorizer(ngram_range=config["ngram_range"])
    elif config["representation"] == "binary":
        vectorizer = TfidfVectorizer(ngram_range=config["ngram_range"], binary=True)
    elif config["representation"] == "frequency":
        vectorizer = TfidfVectorizer(ngram_range=config["ngram_range"], use_idf=False)
    
    # Crear representaciones para el conjunto de entrenamiento y desarrollo
    X_train = vectorizer.fit_transform(train_df['tokenized_text'])
    X_dev = vectorizer.transform(dev_df['tokenized_text'])
    
    # Verificar las dimensiones
    print(f"Dimensiones de X_train: {X_train.shape}")
    print(f"Dimensiones de X_dev: {X_dev.shape}")

Configuración: ngram_range=(1, 1), representación=tfidf
Dimensiones de X_train: (2100, 11434)
Dimensiones de X_dev: (700, 11434)
Configuración: ngram_range=(1, 2), representación=tfidf
Dimensiones de X_train: (2100, 44247)
Dimensiones de X_dev: (700, 44247)
Configuración: ngram_range=(1, 3), representación=tfidf
Dimensiones de X_train: (2100, 85357)
Dimensiones de X_dev: (700, 85357)
Configuración: ngram_range=(1, 2), representación=binary
Dimensiones de X_train: (2100, 44247)
Dimensiones de X_dev: (700, 44247)
Configuración: ngram_range=(1, 2), representación=frequency
Dimensiones de X_train: (2100, 44247)
Dimensiones de X_dev: (700, 44247)


In [3]:
from sklearn.decomposition import TruncatedSVD

# Reducir dimensionalidad con TruncatedSVD
svd = TruncatedSVD(n_components=100, random_state=0)  # Reducir a 100 dimensiones
X_train_reduced = svd.fit_transform(X_train)
X_dev_reduced = svd.transform(X_dev)

# Verificar las dimensiones después de la reducción
print(f"Dimensiones reducidas de X_train: {X_train_reduced.shape}")
print(f"Dimensiones reducidas de X_dev: {X_dev_reduced.shape}")

Dimensiones reducidas de X_train: (2100, 100)
Dimensiones reducidas de X_dev: (700, 100)


In [4]:
# Convertir etiquetas de texto a valores numéricos
label_mapping = {'Clickbait': 1, 'No': 0}
train_df['Tag Value'] = train_df['Tag Value'].map(label_mapping)
dev_df['Tag Value'] = dev_df['Tag Value'].map(label_mapping)

## Logistic Regression y Validación Cruzada

In [5]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score

# Configurar validación cruzada estratificada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Definir el modelo
model = LogisticRegression(max_iter=200)

# Evaluar el modelo con validación cruzada usando f1_macro
f1_macro_scorer = make_scorer(f1_score, average='macro')
scores = cross_val_score(model, X_train_reduced, train_df['Tag Value'], cv=cv, scoring=f1_macro_scorer)

# Imprimir los resultados
print(f"F1-macro scores por fold: {scores}")
print(f"F1-macro promedio: {scores.mean():.4f}")

F1-macro scores por fold: [0.57167416 0.54571429 0.60206861 0.56777917 0.55255682]
F1-macro promedio: 0.5680


## Naive Bayes Multinominal

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Definir el modelo de Naïve Bayes Multinomial
nb_model = MultinomialNB()

# Evaluar el modelo con validación cruzada usando las representaciones originales (X_train)
nb_scores = cross_val_score(nb_model, X_train, train_df['Tag Value'], cv=5, scoring='f1_macro')

# Imprimir los resultados
print("Naïve Bayes Multinomial (sin TruncatedSVD):")
print(f"F1-macro scores por fold: {nb_scores}")
print(f"F1-macro promedio: {nb_scores.mean():.4f}")

Naïve Bayes Multinomial (sin TruncatedSVD):
F1-macro scores por fold: [0.41666667 0.41666667 0.42551064 0.42638889 0.42638889]
F1-macro promedio: 0.4223


## Support Vector Machine

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, f1_score, make_scorer

# Definir el modelo y los hiperparámetros a probar
svc = SVC(random_state=0)
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

# Configurar validación cruzada y búsqueda de hiperparámetros
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(
    svc,
    param_grid,
    scoring=make_scorer(f1_score, average='macro'),
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Ejecutar búsqueda
grid_search.fit(X_train_reduced, train_df['Tag Value'])

# Imprimir mejores resultados
print("Mejor configuración encontrada:")
print(grid_search.best_params_)
print(f"Mejor F1-macro: {grid_search.best_score_:.4f}")

print("\nResultados detallados por configuración:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print(f"{param} -> F1-macro: {mean:.4f} (+/-{std:.4f})")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   0.1s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   0.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Definir el modelo de SVC
svc_model = SVC(C=10,gamma='scale',kernel='rbf', random_state=0)

# Evaluar el modelo con validación cruzada usando las representaciones reducidas (X_train_reduced)
svc_scores = cross_val_score(svc_model, X_train_reduced, train_df['Tag Value'], cv=5, scoring='f1_macro')

# Imprimir los resultados
print("Support Vector Machines (SVC):")
print(f"F1-macro scores por fold: {svc_scores}")
print(f"F1-macro promedio: {svc_scores.mean():.4f}")

Support Vector Machines (SVC):
F1-macro scores por fold: [0.71835231 0.68280662 0.69426246 0.7019026  0.69462251]
F1-macro promedio: 0.6984


In [18]:
from sklearn.metrics import classification_report, confusion_matrix

# Entrenar el modelo en todo el conjunto de entrenamiento
svc_model.fit(X_train_reduced, train_df['Tag Value'])

# Predecir sobre el conjunto de validación/desarrollo
y_pred = svc_model.predict(X_dev_reduced)
y_true = dev_df['Tag Value']

# Imprimir el reporte de clasificación
print("Reporte de clasificación:")
print(classification_report(y_true, y_pred, digits=4))

# Imprimir la matriz de confusión
print("Matriz de confusión:")
print(confusion_matrix(y_true, y_pred))

Reporte de clasificación:
              precision    recall  f1-score   support

           0     0.7935    0.8760    0.8327       500
           1     0.5811    0.4300    0.4943       200

    accuracy                         0.7486       700
   macro avg     0.6873    0.6530    0.6635       700
weighted avg     0.7328    0.7486    0.7360       700

Matriz de confusión:
[[438  62]
 [114  86]]


## Multi-layer Perceptron (MLPClassifier)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

# Definir el modelo y los hiperparámetros a probar
mlp = MLPClassifier(random_state=0, max_iter=2000)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150,)],
    'alpha': [0.0001, 0.001, 0.01],
    'solver': ['adam', 'lbfgs'],
}

# Configurar validación cruzada y búsqueda de hiperparámetros
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(
    mlp,
    param_grid,
    scoring=make_scorer(f1_score, average='macro'),
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Ejecutar búsqueda
grid_search.fit(X_train_reduced, train_df['Tag Value'])

# Imprimir mejores resultados
print("Mejor configuración encontrada:")
print(grid_search.best_params_)
print(f"Mejor F1-macro: {grid_search.best_score_:.4f}")

print("\nResultados detallados por configuración:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print(f"{param} -> F1-macro: {mean:.4f} (+/-{std:.4f})")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=   7.5s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=   8.6s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=   9.1s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=   9.2s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   1.6s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   1.7s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   2.1s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   1.7s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=lbfgs; total time=   1.8s
[CV] END alpha=0.0001, hidden_layer_sizes=(50,), solver=adam; total time=   9.5s
[CV] END alpha=0.0001, hidden_layer_sizes=(100,), solver=adam; total time=   9.2s
[CV] END alpha=0.0001, hidden_layer_sizes

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  10.5s
[CV] END ..alpha=0.01, hidden_layer_sizes=(50,), solver=adam; total time=  11.1s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=   9.9s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  10.5s
[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  10.6s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  10.2s
[CV] END .alpha=0.01, hidden_layer_sizes=(50,), solver=lbfgs; total time=  10.8s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  11.3s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  11.6s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  12.8s
[CV] END .alpha=0.01, hidden_layer_sizes=(100,), solver=adam; total time=  13.6s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  21.2s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  22.1s
[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  21.1s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  22.3s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=   5.5s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=   5.1s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=   4.7s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=   5.3s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=adam; total time=   5.2s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(100,), solver=lbfgs; total time=  21.8s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  15.6s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  15.7s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  23.5s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  18.8s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=  15.1s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=  13.0s
[CV] END alpha=0.01, hidden_layer_sizes=(100, 50), solver=lbfgs; total time=  18.7s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=  13.9s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=  16.6s
[CV] END .alpha=0.01, hidden_layer_sizes=(150,), solver=adam; total time=  15.8s
[CV] END alpha=0.01, hidden_layer_sizes=(150,), solver=lbfgs; total time=  24.3s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(150,), solver=lbfgs; total time=  35.5s
[CV] END alpha=0.01, hidden_layer_sizes=(150,), solver=lbfgs; total time=  27.3s
[CV] END alpha=0.01, hidden_layer_sizes=(150,), solver=lbfgs; total time=  19.5s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END alpha=0.01, hidden_layer_sizes=(150,), solver=lbfgs; total time=  32.1s
Mejor configuración encontrada:
{'alpha': 0.001, 'hidden_layer_sizes': (150,), 'solver': 'adam'}
Mejor F1-macro: 0.7135

Resultados detallados por configuración:
{'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'solver': 'adam'} -> F1-macro: 0.7054 (+/-0.0122)
{'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'solver': 'lbfgs'} -> F1-macro: 0.6570 (+/-0.0228)
{'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'solver': 'adam'} -> F1-macro: 0.7016 (+/-0.0104)
{'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'solver': 'lbfgs'} -> F1-macro: 0.6697 (+/-0.0177)
{'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'solver': 'adam'} -> F1-macro: 0.6969 (+/-0.0125)
{'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'solver': 'lbfgs'} -> F1-macro: 0.6631 (+/-0.0187)
{'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'solver': 'adam'} -> F1-macro: 0.7127 (+/-0.0110)
{'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'solver': 'lbfgs

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

# Definir el modelo de MLP
mlp_model = MLPClassifier(hidden_layer_sizes=(150,), max_iter=2000, alpha=0.001, solver='lbfgs', random_state=0)

# Evaluar el modelo con validación cruzada usando las representaciones reducidas (X_train_reduced)
mlp_scores = cross_val_score(mlp_model, X_train_reduced, train_df['Tag Value'], cv=5, scoring='f1_macro')

# Imprimir los resultados
print("Multi-layer Perceptron (MLPClassifier):")
print(f"F1-macro scores por fold: {mlp_scores}")
print(f"F1-macro promedio: {mlp_scores.mean():.4f}")

Multi-layer Perceptron (MLPClassifier):
F1-macro scores por fold: [0.70350835 0.64692724 0.65010346 0.69666667 0.68108475]
F1-macro promedio: 0.6757


In [17]:
from sklearn.metrics import classification_report, confusion_matrix

# Entrenar el modelo en todo el conjunto de entrenamiento
mlp_model.fit(X_train_reduced, train_df['Tag Value'])

# Predecir sobre el conjunto de validación/desarrollo
y_pred = mlp_model.predict(X_dev_reduced)
y_true = dev_df['Tag Value']

# Imprimir el reporte de clasificación
print("Reporte de clasificación:")
print(classification_report(y_true, y_pred, digits=4))

# Imprimir la matriz de confusión
print("Matriz de confusión:")
print(confusion_matrix(y_true, y_pred))

Reporte de clasificación:
              precision    recall  f1-score   support

           0     0.8130    0.8000    0.8065       500
           1     0.5192    0.5400    0.5294       200

    accuracy                         0.7257       700
   macro avg     0.6661    0.6700    0.6679       700
weighted avg     0.7291    0.7257    0.7273       700

Matriz de confusión:
[[400 100]
 [ 92 108]]


## Random forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

# Definir el modelo y los hiperparámetros a probar
rf = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Configurar validación cruzada y búsqueda de hiperparámetros
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring=make_scorer(f1_score, average='macro'),
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Ejecutar búsqueda
grid_search.fit(X_train_reduced, train_df['Tag Value'])

# Imprimir mejores resultados
print("Mejor configuración encontrada:")
print(grid_search.best_params_)
print(f"Mejor F1-macro: {grid_search.best_score_:.4f}")

print("\nResultados detallados por configuración:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print(f"{param} -> F1-macro: {mean:.4f} (+/-{std:.4f})")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.9s
[CV] END max_de

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Definir el modelo de Random Forest
rf_model = RandomForestClassifier(max_depth=20, min_samples_leaf=1, min_samples_split=2,n_estimators=100, random_state=0)

# Evaluar el modelo con validación cruzada usando las representaciones reducidas (X_train_reduced)
rf_scores = cross_val_score(rf_model, X_train_reduced, train_df['Tag Value'], cv=5, scoring='f1_macro')

# Imprimir los resultados
print("Random Forest:")
print(f"F1-macro scores por fold: {rf_scores}")
print(f"F1-macro promedio: {rf_scores.mean():.4f}")

Random Forest:
F1-macro scores por fold: [0.67716564 0.62040129 0.64820278 0.66035905 0.66029412]
F1-macro promedio: 0.6533


In [16]:
from sklearn.metrics import classification_report, confusion_matrix

# Entrenar el modelo en todo el conjunto de entrenamiento
rf_model.fit(X_train_reduced, train_df['Tag Value'])

# Predecir sobre el conjunto de validación/desarrollo
y_pred = rf_model.predict(X_dev_reduced)
y_true = dev_df['Tag Value']

# Imprimir el reporte de clasificación
print("Reporte de clasificación:")
print(classification_report(y_true, y_pred, digits=4))

# Imprimir la matriz de confusión
print("Matriz de confusión:")
print(confusion_matrix(y_true, y_pred))

Reporte de clasificación:
              precision    recall  f1-score   support

           0     0.7732    0.9820    0.8652       500
           1     0.8615    0.2800    0.4226       200

    accuracy                         0.7814       700
   macro avg     0.8174    0.6310    0.6439       700
weighted avg     0.7985    0.7814    0.7388       700

Matriz de confusión:
[[491   9]
 [144  56]]


## Gradient Boosting

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

# Definir el modelo y los hiperparámetros a probar
gb = GradientBoostingClassifier(random_state=0)
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 5],
    'subsample': [0.8, 1.0]
}

# Configurar validación cruzada y búsqueda de hiperparámetros
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(
    gb,
    param_grid,
    scoring=make_scorer(f1_score, average='macro'),
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Ejecutar búsqueda
grid_search.fit(X_train_reduced, train_df['Tag Value'])

# Imprimir mejores resultados
print("Mejor configuración encontrada:")
print(grid_search.best_params_)
print(f"Mejor F1-macro: {grid_search.best_score_:.4f}")

print("\nResultados detallados por configuración:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print(f"{param} -> F1-macro: {mean:.4f} (+/-{std:.4f})")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=0.8; total time=   1.8s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=0.8; total time=   1.9s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=0.8; total time=   1.9s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=0.8; total time=   1.9s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=0.8; total time=   1.9s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=1.0; total time=   2.5s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=1.0; total time=   2.5s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=1.0; total time=   2.6s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=1.0; total time=   2.4s
[CV] END learning_rate=0.01, max_depth=2, n_estimators=50, subsample=1.0; total time=   2.4s
[CV] END

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Definir el modelo de Gradient Boosting
gb_model = GradientBoostingClassifier(subsample=0.8,n_estimators=200, learning_rate=0.1, max_depth=5, random_state=0)

# Evaluar el modelo con validación cruzada usando las representaciones reducidas (X_train_reduced)
gb_scores = cross_val_score(gb_model, X_train_reduced, train_df['Tag Value'], cv=5, scoring='f1_macro')

# Imprimir los resultados
print("Gradient Boosting:")
print(f"F1-macro scores por fold: {gb_scores}")
print(f"F1-macro promedio: {gb_scores.mean():.4f}")

Gradient Boosting:
F1-macro scores por fold: [0.66813883 0.6556766  0.68661709 0.6765471  0.68211193]
F1-macro promedio: 0.6738


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Entrenar el modelo en todo el conjunto de entrenamiento
gb_model.fit(X_train_reduced, train_df['Tag Value'])

# Predecir sobre el conjunto de validación/desarrollo
y_pred = gb_model.predict(X_dev_reduced)
y_true = dev_df['Tag Value']

# Imprimir el reporte de clasificación
print("Reporte de clasificación:")
print(classification_report(y_true, y_pred, digits=4))

# Imprimir la matriz de confusión
print("Matriz de confusión:")
print(confusion_matrix(y_true, y_pred))

Reporte de clasificación:
              precision    recall  f1-score   support

           0     0.7922    0.9300    0.8556       500
           1     0.6903    0.3900    0.4984       200

    accuracy                         0.7757       700
   macro avg     0.7412    0.6600    0.6770       700
weighted avg     0.7630    0.7757    0.7535       700

Matriz de confusión:
[[465  35]
 [122  78]]
