In [1]:
# Cell 1: imports and module search-path setup.
import sys
import os
# Append the parent directory to the module search path so that the `src`
# package imported below can be resolved.
# NOTE(review): this is relative to the kernel's working directory — presumably
# the notebook is run from a subdirectory of the project root; confirm.
sys.path.append('..')

# Project-local helpers for data loading/splitting and MLflow-tracked training.
from src.data_preprocessing import load_and_preprocess_data, prepare_train_test_split
from src.model_utils import MLflowModelTracker
# NOTE(review): `os` and `mlflow` are not referenced directly in the cells
# visible here; confirm whether they are still needed.
import mlflow
import pandas as pd

In [2]:
# Cell 2: Load and explore the data.
# The helper returns the DataFrame together with the list of toxicity label
# columns; both names are reused by the cells below.
df, toxicity_columns = load_and_preprocess_data(
    '../data/raw/youtoxic_english_1000.csv'
)
# Summarise what was loaded: frame shape and the label columns returned.
print(
    f"Dataset cargado: {df.shape}",
    f"Columnas de toxicidad: {toxicity_columns}",
    sep="\n",
)

🔍 Analizando distribución de clases...
  ✅ IsToxic: 462 positivos, 538 negativos
  ✅ IsAbusive: 353 positivos, 647 negativos
  ✅ IsThreat: 21 positivos, 979 negativos
  ✅ IsProvocative: 161 positivos, 839 negativos
  ✅ IsObscene: 100 positivos, 900 negativos
  ✅ IsHatespeech: 138 positivos, 862 negativos
  ✅ IsRacist: 125 positivos, 875 negativos
  ❌ IsNationalist: 8 positivos, 992 negativos (EXCLUIDO)
  ❌ IsSexist: 1 positivos, 999 negativos (EXCLUIDO)
  ❌ IsHomophobic: 0 positivos, 1000 negativos (EXCLUIDO)
  ✅ IsReligiousHate: 12 positivos, 988 negativos
  ❌ IsRadicalism: 0 positivos, 1000 negativos (EXCLUIDO)

📊 Columnas válidas para ML: 8
   ['IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene', 'IsHatespeech', 'IsRacist', 'IsReligiousHate']
Dataset cargado: (1000, 16)
Columnas de toxicidad: ['IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene', 'IsHatespeech', 'IsRacist', 'IsReligiousHate']


In [3]:
# Cell 3: Prepare the data.
# Split features and multi-label targets into train and test partitions using
# the project helper; the four resulting names feed the training cell.
(X_train, X_test,
 y_train, y_test) = prepare_train_test_split(df, toxicity_columns)
# Report the size of each feature partition.
print(
    f"Entrenamiento: {X_train.shape}",
    f"Prueba: {X_test.shape}",
    sep="\n",
)


🎯 Verificación final de datos:
   Forma de X: (1000,)
   Forma de y: (1000, 8)
   Rango de y: 0.0 - 1.0
   IsToxic: 2 clases únicas: [0. 1.]
   IsAbusive: 2 clases únicas: [0. 1.]
   IsThreat: 2 clases únicas: [0. 1.]
   IsProvocative: 2 clases únicas: [0. 1.]
   IsObscene: 2 clases únicas: [0. 1.]
   IsHatespeech: 2 clases únicas: [0. 1.]
   IsRacist: 2 clases únicas: [0. 1.]
   IsReligiousHate: 2 clases únicas: [0. 1.]
Entrenamiento: (800,)
Prueba: (200,)


In [None]:
# Cell 4: Run the experiments.
# Both runs go through the same tracker, created for the
# "notebook-experiments" experiment name.
tracker = MLflowModelTracker("notebook-experiments")

# Hyperparameters for each run, kept next to each other for easy comparison.
lr_params = {"max_iter": 1000, "C": 1.0}
rf_params = {"n_estimators": 100, "max_depth": 10}

# Experiment 1: Logistic Regression
print("🔄 Entrenando Logistic Regression...")
lr_model, lr_vectorizer, lr_metrics = tracker.train_sklearn_model(
    X_train, X_test, y_train, y_test, model_type="logistic", **lr_params
)

# Experiment 2: Random Forest
print("🔄 Entrenando Random Forest...")
rf_model, rf_vectorizer, rf_metrics = tracker.train_sklearn_model(
    X_train, X_test, y_train, y_test, model_type="random_forest", **rf_params
)

# Cell 5: Compare results (kept in the same cell as the training step).
# One row per model: a "Modelo" label followed by that run's metrics.
results_df = pd.DataFrame(
    [
        {"Modelo": model_label, **model_metrics}
        for model_label, model_metrics in (
            ("Logistic Regression", lr_metrics),
            ("Random Forest", rf_metrics),
        )
    ]
)
print(results_df)

2025/07/02 15:02:15 INFO mlflow.tracking.fluent: Experiment with name 'notebook-experiments' does not exist. Creating a new experiment.


🔄 Entrenando Logistic Regression...




✅ logistic - Hamming Loss: 0.1363, Jaccard Score: 0.0809
🔄 Entrenando Random Forest...




✅ random_forest - Hamming Loss: 0.1494, Jaccard Score: 0.0370
                Modelo  hamming_loss  jaccard_score
0  Logistic Regression      0.136250       0.080893
1        Random Forest      0.149375       0.037026


: 