In [1]:
# Importar librerías mágicas
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, f1_score, classification_report
import optuna
import joblib
import xgboost as xgb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv("https://raw.githubusercontent.com/fintihlupik/NLP-sentimental/refs/heads/master/data/youtoxic_english_1000%20-%20youtoxic_english_1000.csv")


In [3]:
# Definir directorios
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
data_dir = os.path.join(BASE_DIR, 'processed')  # Corregido
models_dir = os.path.join(BASE_DIR, 'models')

# Cargar datos vectorizados
X_train = joblib.load(os.path.join(data_dir, 'X_train_tfidf.pkl'))
X_test = joblib.load(os.path.join(data_dir, 'X_test_tfidf.pkl'))

# Cargar etiquetas
y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))['IsToxic']
y_test = pd.read_csv(os.path.join(data_dir, 'y_test.csv'))['IsToxic']

In [4]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, f1_score, classification_report

# Configuración de validación cruzada estratificada
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Modelo inicial con Naive Bayes
nb_model = MultinomialNB()

# Evaluación inicial con validación cruzada
y_pred_proba = cross_val_predict(nb_model, X_train, y_train, cv=skf, method='predict_proba')[:, 1]

# Calcular métricas de evaluación
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
f1_scores = 2 * recall * precision / (recall + precision + 1e-6)

# Imprimir resultados iniciales
print(f"F1-Score inicial: {max(f1_scores):.4f}")

# Reporte de clasificación inicial
optimal_threshold = thresholds[np.argmax(f1_scores)]
y_pred = (y_pred_proba >= optimal_threshold).astype(int)
print("Reporte de clasificación inicial:")
print(classification_report(y_train, y_pred))

F1-Score inicial: 0.7077
Reporte de clasificación inicial:
              precision    recall  f1-score   support

           0       0.85      0.42      0.56       430
           1       0.58      0.92      0.71       370

    accuracy                           0.65       800
   macro avg       0.72      0.67      0.64       800
weighted avg       0.73      0.65      0.63       800



In [5]:
import optuna

# Definir función objetivo para Optuna
def objective(trial):
    # Sugerir un valor para el hiperparámetro alpha
    alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
    
    # Crear el modelo con el valor de alpha sugerido
    model = MultinomialNB(alpha=alpha)
    
    # Validación cruzada estratificada
    y_pred_proba = cross_val_predict(model, X_train, y_train, cv=skf, method='predict_proba')[:, 1]
    
    # Calcular F1-score
    precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
    f1_scores = 2 * recall * precision / (recall + precision + 1e-6)
    
    # Retornar el mejor F1-score
    return max(f1_scores)

# Crear un estudio de Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Ejecutar 50 pruebas

# Obtener el mejor hiperparámetro
best_alpha = study.best_params['alpha']
print(f"Mejor valor de alpha: {best_alpha}")

[I 2025-07-08 11:31:57,294] A new study created in memory with name: no-name-27165e6e-2862-4473-bde2-c70415ff8f71
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
[I 2025-07-08 11:31:57,351] Trial 0 finished with value: 0.7064671648601302 and parameters: {'alpha': 1.8515862591858971}. Best is trial 0 with value: 0.7064671648601302.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
[I 2025-07-08 11:31:57,912] Trial 1 finished with value: 0.7098316406446624 and parameters: {'alpha': 0.7358675910750551}. Best is trial 1 with value: 0.7098316406446624.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
[I 2025-07-08 11:31:58,481] Trial 2 finished with value: 0.7113767519814103 and parameters: {'alpha': 0.11882401868704891}. Best is trial 2 with value: 0.7113767519814103.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
[I 2025-07-08 11:31:58,613] Trial 3 finished with value: 0.7096769302282037 and parameters: {'alpha': 0.05014260017189884}. Best is trial 2 with val

Mejor valor de alpha: 0.5090755403804245


In [6]:
# Entrenar el modelo con el mejor valor de alpha encontrado por Optuna
best_model = MultinomialNB(alpha=best_alpha)
best_model.fit(X_train, y_train)

# Obtener las probabilidades predichas en el conjunto de entrenamiento
y_pred_proba = best_model.predict_proba(X_train)[:, 1]

# Calcular precisión, recall y F1-score para diferentes umbrales
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
f1_scores = 2 * recall * precision / (recall + precision + 1e-6)

# Encontrar el umbral que maximiza el F1-score
optimal_threshold = thresholds[np.argmax(f1_scores)]
print(f"Mejor umbral para F1-score: {optimal_threshold:.4f}")

# Evaluar el modelo con el umbral óptimo
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
print("Reporte de clasificación con umbral optimizado:")
print(classification_report(y_train, y_pred_optimal))

Mejor umbral para F1-score: 0.4873
Reporte de clasificación con umbral optimizado:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       430
           1       0.96      0.98      0.97       370

    accuracy                           0.97       800
   macro avg       0.97      0.97      0.97       800
weighted avg       0.97      0.97      0.97       800



In [7]:
from sklearn.metrics import accuracy_score

# 1. Evaluación con el modelo inicial (umbral predeterminado de 0.5)
initial_model = MultinomialNB()
initial_model.fit(X_train, y_train)
y_pred_initial = initial_model.predict(X_train)
initial_f1 = f1_score(y_train, y_pred_initial)
initial_accuracy = accuracy_score(y_train, y_pred_initial)

# 2. Evaluación con el modelo optimizado (mejor alpha, umbral predeterminado de 0.5)
optimized_model = MultinomialNB(alpha=best_alpha)
optimized_model.fit(X_train, y_train)
y_pred_optimized = optimized_model.predict(X_train)
optimized_f1 = f1_score(y_train, y_pred_optimized)
optimized_accuracy = accuracy_score(y_train, y_pred_optimized)

# 3. Evaluación con el modelo optimizado y el mejor umbral
y_pred_best_threshold = (y_pred_proba >= optimal_threshold).astype(int)
best_threshold_f1 = f1_score(y_train, y_pred_best_threshold)
best_threshold_accuracy = accuracy_score(y_train, y_pred_best_threshold)

# Comparación de métricas
print("Comparación de métricas:")
print(f"{'Modelo':<30}{'F1-Score':<15}{'Accuracy':<15}")
print(f"{'Modelo inicial (umbral 0.5)':<30}{initial_f1:<15.4f}{initial_accuracy:<15.4f}")
print(f"{'Modelo optimizado (umbral 0.5)':<30}{optimized_f1:<15.4f}{optimized_accuracy:<15.4f}")
print(f"{'Modelo optimizado (mejor umbral)':<30}{best_threshold_f1:<15.4f}{best_threshold_accuracy:<15.4f}")

Comparación de métricas:
Modelo                        F1-Score       Accuracy       
Modelo inicial (umbral 0.5)   0.9392         0.9450         
Modelo optimizado (umbral 0.5)0.9622         0.9650         
Modelo optimizado (mejor umbral)0.9665         0.9688         


In [8]:
# Selección del mejor modelo según F1-score
if best_threshold_f1 >= max(initial_f1, optimized_f1):
    final_model = optimized_model
    final_threshold = optimal_threshold
    print("El modelo optimizado con el mejor umbral fue seleccionado como el mejor modelo.")
else:
    final_model = optimized_model if optimized_f1 > initial_f1 else initial_model
    final_threshold = 0.5
    print("El modelo optimizado o inicial con umbral 0.5 fue seleccionado como el mejor modelo.")

# Entrenar el modelo final con los datos de entrenamiento completos
final_model.fit(X_train, y_train)

# Evaluar el modelo final en el conjunto de prueba
y_test_proba = final_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= final_threshold).astype(int)

# Reporte de clasificación en el conjunto de prueba
from sklearn.metrics import classification_report
print("Reporte de clasificación en el conjunto de prueba:")
print(classification_report(y_test, y_test_pred))

El modelo optimizado con el mejor umbral fue seleccionado como el mejor modelo.
Reporte de clasificación en el conjunto de prueba:
              precision    recall  f1-score   support

           0       0.67      0.70      0.69       108
           1       0.63      0.60      0.61        92

    accuracy                           0.66       200
   macro avg       0.65      0.65      0.65       200
weighted avg       0.65      0.66      0.65       200



In [9]:
models_dir = os.path.join(BASE_DIR, 'models')
print(f"Ruta de models_dir: {models_dir}")

Ruta de models_dir: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data\models


In [10]:
import joblib
import os

# Ajustar la ruta de models_dir para que apunte fuera de 'data'
models_dir = os.path.join(BASE_DIR, 'models')  # Asegúrate de que apunta al directorio correcto

# Crear la carpeta 'models' si no existe
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"Carpeta 'models' creada en: {models_dir}")
else:
    print(f"Usando la carpeta existente: {models_dir}")

# Ruta para guardar el modelo
model_path = os.path.join(models_dir, 'naive_bayes_best_model.pkl')

# Guardar el modelo final
joblib.dump(final_model, model_path)
print(f"Modelo guardado en: {model_path}")

Usando la carpeta existente: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data\models
Modelo guardado en: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data\models\naive_bayes_best_model.pkl


In [33]:
import pandas as pd
import mlflow
import mlflow.sklearn
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    ExtraTreesClassifier
)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, f1_score, recall_score
)
from sklearn.model_selection import ParameterGrid
from imblearn.over_sampling import SMOTE
from sklearn.base import clone

# ✅ Llamada correcta a autolog
mlflow.autolog(log_models=False)

# MLflow config
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Toxicity Classification Benchmarking")

# Definir directorios
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
data_dir = os.path.join(BASE_DIR, 'processed')
models_dir = os.path.join(BASE_DIR, 'models')

print(f"Base directory: {BASE_DIR}")
print(f"Data directory: {data_dir}")
print(f"Models directory: {models_dir}")

# Verificar que los directorios existen
if not os.path.exists(data_dir):
    raise FileNotFoundError(f"Directory not found: {data_dir}")

if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"Created models directory: {models_dir}")

# Verificar archivos necesarios
required_files = [
    'X_train_tfidf.pkl',
    'X_test_tfidf.pkl',
    'y_train.csv',
    'y_test.csv'
]

print("\nVerificando archivos necesarios...")
for file in required_files:
    file_path = os.path.join(data_dir, file)
    if os.path.exists(file_path):
        print(f"✅ {file}")
    else:
        print(f"❌ {file} - NOT FOUND")
        raise FileNotFoundError(f"Required file not found: {file_path}")

# Cargar datos vectorizados
print("\nCargando datos...")
try:
    X_train = joblib.load(os.path.join(data_dir, 'X_train_tfidf.pkl'))
    X_test = joblib.load(os.path.join(data_dir, 'X_test_tfidf.pkl'))
    
    # Cargar etiquetas
    y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))['IsToxic']
    y_test = pd.read_csv(os.path.join(data_dir, 'y_test.csv'))['IsToxic']
    
    print(f"✅ Datos cargados exitosamente")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")
    print(f"Distribución de clases en train: {y_train.value_counts()}")
    print(f"Distribución de clases en test: {y_test.value_counts()}")
    
except Exception as e:
    print(f"❌ Error cargando datos: {str(e)}")
    raise

# Aplicar SMOTE para balancear las clases
print("\nAplicando SMOTE...")
try:
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    print(f"✅ SMOTE aplicado exitosamente")
    print(f"Distribución después de SMOTE: {pd.Series(y_train_res).value_counts()}")
    print(f"X_train_res shape: {X_train_res.shape}")
except Exception as e:
    print(f"❌ Error aplicando SMOTE: {str(e)}")
    print("Continuando sin SMOTE...")
    X_train_res, y_train_res = X_train, y_train

# Hiperparámetros adaptados para clasificación de texto
params = {
    "RandomForestClassifier": {
        'n_estimators': [135], 
        'max_depth': [15], 
        'min_samples_split': [7],
        'min_samples_leaf': [6], 
        'max_features': ['sqrt'], 
        'bootstrap': [True], 
        'random_state': [42]
    },
    "KNeighborsClassifier": {
        'n_neighbors': [7], 
        'weights': ['uniform'], 
        'metric': ['cosine']  # Mejor para datos de texto
    },
    "GradientBoostingClassifier": {
        'n_estimators': [110], 
        'learning_rate': [0.1],  # Reducido para estabilidad
        'max_depth': [6],  # Reducido para datos de texto
        'min_samples_split': [8], 
        'min_samples_leaf': [6], 
        'subsample': [0.8],
        'max_features': ['sqrt'], 
        'random_state': [42]
    },
    "LGBMClassifier": {
        'n_estimators': [125], 
        'learning_rate': [0.1], 
        'num_leaves': [31], 
        'max_depth': [-1],
        'min_child_samples': [20], 
        'subsample': [0.8], 
        'colsample_bytree': [0.8], 
        'random_state': [42],
        'verbosity': [-1]
    },
    "XGBClassifier": {
        'n_estimators': [125], 
        'learning_rate': [0.1], 
        'max_depth': [6], 
        'subsample': [0.8],
        'colsample_bytree': [0.8], 
        'gamma': [0], 
        'reg_alpha': [0.1], 
        'reg_lambda': [1],
        'use_label_encoder': [False], 
        'eval_metric': ['logloss'], 
        'random_state': [42]
    },
    "LogisticRegression": {
        'C': [1.0], 
        'max_iter': [1000],  # Aumentado para convergencia
        'solver': ['liblinear'],  # Mejor para datos sparse
        'random_state': [42]
    },
    "GaussianNB": [{}],
    "SVC": {
        'C': [1.0], 
        'kernel': ['linear'],  # Linear mejor para texto
        'probability': [True], 
        'random_state': [42]
    },
    "ExtraTreesClassifier": {
        'n_estimators': [100], 
        'max_depth': [10], 
        'random_state': [42]
    }
}

# Modelos
modelos = {
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(),
    "LGBMClassifier": LGBMClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "GaussianNB": GaussianNB(),
    "SVC": SVC(),
    "ExtraTreesClassifier": ExtraTreesClassifier(),
}

# Entrenamiento y tracking
print("\n🚀 Iniciando entrenamiento de modelos...")

results = []

for nombre, modelo in modelos.items():
    grid = params.get(nombre, [{}])
    for param_set in ParameterGrid(grid):
        m = clone(modelo)
        m.set_params(**param_set)

        with mlflow.start_run(run_name=f"{nombre}_toxicity"):
            print(f"\nEntrenando {nombre} con parámetros: {param_set}")
            
            try:
                # Entrenar modelo
                m.fit(X_train_res, y_train_res)

                # Predicciones
                y_train_pred = m.predict(X_train_res)
                y_test_pred = m.predict(X_test)

                # Métricas
                acc_train = accuracy_score(y_train_res, y_train_pred)
                acc_test = accuracy_score(y_test, y_test_pred)
                f1_train = f1_score(y_train_res, y_train_pred, average='weighted')
                f1_test = f1_score(y_test, y_test_pred, average='weighted')
                acc_gap = acc_train - acc_test
                f1_gap = f1_train - f1_test

                # Reporte de clasificación
                report = classification_report(y_test, y_test_pred, output_dict=True)
                
                # Recall por clase (0: no tóxico, 1: tóxico)
                recall_0 = report.get('0', {}).get('recall', 0)
                recall_1 = report.get('1', {}).get('recall', 0)
                
                # Precisión por clase
                precision_0 = report.get('0', {}).get('precision', 0)
                precision_1 = report.get('1', {}).get('precision', 0)
                
                # F1 por clase
                f1_0 = report.get('0', {}).get('f1-score', 0)
                f1_1 = report.get('1', {}).get('f1-score', 0)

                # MLflow logs
                mlflow.log_param("modelo", nombre)
                mlflow.log_params(param_set)
                mlflow.log_metric("accuracy_train", acc_train)
                mlflow.log_metric("accuracy_test", acc_test)
                mlflow.log_metric("f1_train", f1_train)
                mlflow.log_metric("f1_test", f1_test)
                mlflow.log_metric("acc_gap", acc_gap)
                mlflow.log_metric("f1_gap", f1_gap)
                mlflow.log_metric("recall_non_toxic", recall_0)
                mlflow.log_metric("recall_toxic", recall_1)
                mlflow.log_metric("precision_non_toxic", precision_0)
                mlflow.log_metric("precision_toxic", precision_1)
                mlflow.log_metric("f1_non_toxic", f1_0)
                mlflow.log_metric("f1_toxic", f1_1)

                # Guardar el modelo
                mlflow.sklearn.log_model(m, artifact_path="modelo")
                
                # Guardar también en el directorio de modelos
                model_path = os.path.join(models_dir, f"{nombre}_toxicity.pkl")
                joblib.dump(m, model_path)
                
                print(f"✅ {nombre} completado")
                print(f"   Accuracy: {acc_test:.4f}")
                print(f"   F1 Score: {f1_test:.4f}")
                print(f"   Recall Toxic: {recall_1:.4f}")
                print(f"   Precision Toxic: {precision_1:.4f}")
                
                # Guardar resultados para comparación
                results.append({
                    'modelo': nombre,
                    'accuracy_test': acc_test,
                    'f1_test': f1_test,
                    'recall_toxic': recall_1,
                    'precision_toxic': precision_1,
                    'acc_gap': acc_gap,
                    'f1_gap': f1_gap
                })
                
            except Exception as e:
                print(f"❌ Error entrenando {nombre}: {str(e)}")
                mlflow.log_param("error", str(e))
                continue

# Resumen de resultados
print("\n" + "="*60)
print("🎉 ENTRENAMIENTO COMPLETADO!")
print("="*60)

if results:
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('f1_test', ascending=False)
    
    print("\n📊 RANKING DE MODELOS (por F1 Score):")
    print("-" * 60)
    for idx, row in results_df.iterrows():
        print(f"{row['modelo']:<25} | F1: {row['f1_test']:.4f} | Acc: {row['accuracy_test']:.4f} | Recall Toxic: {row['recall_toxic']:.4f}")
    
    # Guardar resultados
    results_path = os.path.join(models_dir, 'model_comparison.csv')
    results_df.to_csv(results_path, index=False)
    print(f"\n💾 Resultados guardados en: {results_path}")
    
    best_model = results_df.iloc[0]['modelo']
    print(f"\n🏆 MEJOR MODELO: {best_model}")
    print(f"   F1 Score: {results_df.iloc[0]['f1_test']:.4f}")
    print(f"   Accuracy: {results_df.iloc[0]['accuracy_test']:.4f}")
    print(f"   Recall Toxic: {results_df.iloc[0]['recall_toxic']:.4f}")

else:
    print("❌ No se completó el entrenamiento de ningún modelo")

print(f"\n🔍 Revisa los experimentos en MLflow: http://localhost:5000")



2025/07/08 13:03:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/07/08 13:03:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/07/08 13:03:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/07/08 13:03:32 INFO mlflow.tracking.fluent: Experiment with name 'Toxicity Classification Benchmarking' does not exist. Creating a new experiment.


Base directory: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data
Data directory: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data\processed
Models directory: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data\models

Verificando archivos necesarios...
✅ X_train_tfidf.pkl
✅ X_test_tfidf.pkl
✅ y_train.csv
✅ y_test.csv

Cargando datos...
✅ Datos cargados exitosamente
X_train shape: (800, 2968)
X_test shape: (200, 2968)
y_train shape: (800,)
y_test shape: (200,)
Distribución de clases en train: IsToxic
0    430
1    370
Name: count, dtype: int64
Distribución de clases en test: IsToxic
0    108
1     92
Name: count, dtype: int64

Aplicando SMOTE...


2025/07/08 13:03:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a3ceae71a5df4efdacdc14b942a66c56', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run languid-snipe-245 at: http://localhost:5000/#/experiments/968302732310395226/runs/a3ceae71a5df4efdacdc14b942a66c56
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226
✅ SMOTE aplicado exitosamente
Distribución después de SMOTE: IsToxic
1    430
0    430
Name: count, dtype: int64
X_train_res shape: (860, 2968)

🚀 Iniciando entrenamiento de modelos...





Entrenando LogisticRegression con parámetros: {'C': 1.0, 'max_iter': 1000, 'random_state': 42, 'solver': 'liblinear'}




✅ LogisticRegression completado
   Accuracy: 0.7350
   F1 Score: 0.7330
   Recall Toxic: 0.6522
   Precision Toxic: 0.7407
🏃 View run LogisticRegression_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/3f84ad321455463098d7948e3c8f812f
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226





Entrenando RandomForestClassifier con parámetros: {'bootstrap': True, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 7, 'n_estimators': 135, 'random_state': 42}




✅ RandomForestClassifier completado
   Accuracy: 0.7450
   F1 Score: 0.7372
   Recall Toxic: 0.5761
   Precision Toxic: 0.8154
🏃 View run RandomForestClassifier_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/f95b99cf07d4403c88430eb6ed07f7a7
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226

Entrenando XGBClassifier con parámetros: {'colsample_bytree': 0.8, 'eval_metric': 'logloss', 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 125, 'random_state': 42, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8, 'use_label_encoder': False}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ XGBClassifier completado
   Accuracy: 0.7300
   F1 Score: 0.7233
   Recall Toxic: 0.5761
   Precision Toxic: 0.7794
🏃 View run XGBClassifier_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/dbe31fe7a29642f59065e7c9243d0be8
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226

Entrenando LGBMClassifier con parámetros: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'n_estimators': 125, 'num_leaves': 31, 'random_state': 42, 'subsample': 0.8, 'verbosity': -1}




✅ LGBMClassifier completado
   Accuracy: 0.7150
   F1 Score: 0.7109
   Recall Toxic: 0.5978
   Precision Toxic: 0.7333
🏃 View run LGBMClassifier_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/72530331d049460f9ee821cc0196789f
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226





Entrenando KNeighborsClassifier con parámetros: {'metric': 'cosine', 'n_neighbors': 7, 'weights': 'uniform'}




✅ KNeighborsClassifier completado
   Accuracy: 0.6400
   F1 Score: 0.6405
   Recall Toxic: 0.6630
   Precision Toxic: 0.5980
🏃 View run KNeighborsClassifier_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/5332b58f9a7f4648a10ce230cfb84854
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226





Entrenando GradientBoostingClassifier con parámetros: {'learning_rate': 0.1, 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 8, 'n_estimators': 110, 'random_state': 42, 'subsample': 0.8}




✅ GradientBoostingClassifier completado
   Accuracy: 0.7400
   F1 Score: 0.7366
   Recall Toxic: 0.6304
   Precision Toxic: 0.7632
🏃 View run GradientBoostingClassifier_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/441460812868432c95ca972f940db9c8
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226





Entrenando GaussianNB con parámetros: {}
❌ Error entrenando GaussianNB: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.
🏃 View run GaussianNB_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/9aa8baceec1b4c29a13fdb2f436d40f3
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226





Entrenando SVC con parámetros: {'C': 1.0, 'kernel': 'linear', 'probability': True, 'random_state': 42}




✅ SVC completado
   Accuracy: 0.7250
   F1 Score: 0.7234
   Recall Toxic: 0.6522
   Precision Toxic: 0.7229
🏃 View run SVC_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/cebccab118fa4afd8e953266968ba534
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226





Entrenando ExtraTreesClassifier con parámetros: {'max_depth': 10, 'n_estimators': 100, 'random_state': 42}




✅ ExtraTreesClassifier completado
   Accuracy: 0.7000
   F1 Score: 0.6878
   Recall Toxic: 0.5000
   Precision Toxic: 0.7667
🏃 View run ExtraTreesClassifier_toxicity at: http://localhost:5000/#/experiments/968302732310395226/runs/61a494a0e8cf49329f67c17da926fc68
🧪 View experiment at: http://localhost:5000/#/experiments/968302732310395226

🎉 ENTRENAMIENTO COMPLETADO!

📊 RANKING DE MODELOS (por F1 Score):
------------------------------------------------------------
RandomForestClassifier    | F1: 0.7372 | Acc: 0.7450 | Recall Toxic: 0.5761
GradientBoostingClassifier | F1: 0.7366 | Acc: 0.7400 | Recall Toxic: 0.6304
LogisticRegression        | F1: 0.7330 | Acc: 0.7350 | Recall Toxic: 0.6522
SVC                       | F1: 0.7234 | Acc: 0.7250 | Recall Toxic: 0.6522
XGBClassifier             | F1: 0.7233 | Acc: 0.7300 | Recall Toxic: 0.5761
LGBMClassifier            | F1: 0.7109 | Acc: 0.7150 | Recall Toxic: 0.5978
ExtraTreesClassifier      | F1: 0.6878 | Acc: 0.7000 | Recall Toxic: 0.5000

In [12]:
import psutil
import mlflow

with mlflow.start_run():
    # ... tu código del modelo ...

    ram_usage = psutil.virtual_memory().used / (1024 ** 3)  # en GB
    cpu_usage = psutil.cpu_percent()

    mlflow.log_metric("ram_used_gb", ram_usage)
    mlflow.log_metric("cpu_percent", cpu_usage)
