In [1]:
# Standard library
import sys
import os
# Add the parent directory (relative to the working directory) to the module
# search path so that the `src` package imported below can be resolved.
# NOTE(review): presumably the notebook lives one level below the project
# root — confirm; this must run before the `src` imports.
sys.path.append('..')

# Project-local helpers from the `src` package
from src.data_preprocessing import load_and_preprocess_data, prepare_train_test_split
from src.model_utils import MLflowModelTracker
# Third-party libraries
import mlflow
import pandas as pd

In [2]:
# Cell 2: load the raw dataset and take a first look at it.
# The loader returns the processed frame together with the names of the
# toxicity label columns; both are reused by the following cells.
raw_csv_path = '../data/raw/hatespeech.csv'
df, toxicity_columns = load_and_preprocess_data(raw_csv_path)

# Report the frame dimensions and the label columns that were returned.
print(f"Dataset cargado: {df.shape}")
print(f"Columnas de toxicidad: {toxicity_columns}")

🔍 Analizando distribución de clases...
  ✅ IsToxic: 1140 positivos, 538 negativos
  ✅ IsAbusive: 348 positivos, 1330 negativos
  ✅ IsThreat: 20 positivos, 1658 negativos
  ✅ IsProvocative: 157 positivos, 1521 negativos
  ✅ IsObscene: 100 positivos, 1578 negativos
  ✅ IsHatespeech: 138 positivos, 1540 negativos
  ✅ IsRacist: 125 positivos, 1553 negativos
  ✅ IsNationalist: 122 positivos, 1556 negativos
  ✅ IsSexist: 178 positivos, 1500 negativos
  ✅ IsHomophobic: 142 positivos, 1536 negativos
  ✅ IsReligiousHate: 95 positivos, 1583 negativos
  ✅ IsRadicalism: 167 positivos, 1511 negativos

📊 Columnas válidas para ML: 12
   ['IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene', 'IsHatespeech', 'IsRacist', 'IsNationalist', 'IsSexist', 'IsHomophobic', 'IsReligiousHate', 'IsRadicalism']
Dataset cargado: (1678, 16)
Columnas de toxicidad: ['IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene', 'IsHatespeech', 'IsRacist', 'IsNationalist', 'IsSexist', 'IsHomophobic', 'IsReligiousHate', 'IsRadicalism']

In [3]:
# Cell 3: split the data into training and test partitions.
# The helper returns four objects in the order unpacked below.
split_parts = prepare_train_test_split(df, toxicity_columns)
X_train, X_test, y_train, y_test = split_parts

# Show the size of each feature partition.
print(f"Entrenamiento: {X_train.shape}")
print(f"Prueba: {X_test.shape}")


🎯 Verificación final de datos:
   Forma de X: (1678,)
   Forma de y: (1678, 12)
   Rango de y: 0.0 - 1.0
   IsToxic: 2 clases únicas: [0. 1.]
   IsAbusive: 2 clases únicas: [0. 1.]
   IsThreat: 2 clases únicas: [0. 1.]
   IsProvocative: 2 clases únicas: [0. 1.]
   IsObscene: 2 clases únicas: [0. 1.]
   IsHatespeech: 2 clases únicas: [0. 1.]
   IsRacist: 2 clases únicas: [0. 1.]
   IsNationalist: 2 clases únicas: [0. 1.]
   IsSexist: 2 clases únicas: [0. 1.]
   IsHomophobic: 2 clases únicas: [0. 1.]
   IsReligiousHate: 2 clases únicas: [0. 1.]
   IsRadicalism: 2 clases únicas: [0. 1.]
Entrenamiento: (1342,)
Prueba: (336,)


In [4]:
# Cell 4: run the experiments.
# Both runs are recorded through the same tracker, created for the
# "notebook-experiments" experiment name.
tracker = MLflowModelTracker("notebook-experiments")

# Positional inputs shared by every training call.
split_inputs = (X_train, X_test, y_train, y_test)

# Keyword arguments for each run, kept next to each other for comparison.
lr_params = {"model_type": "logistic", "max_iter": 1000, "C": 1.0}
rf_params = {"model_type": "random_forest", "n_estimators": 100, "max_depth": 10}

# Experiment 1: Logistic Regression
print("🔄 Entrenando Logistic Regression...")
lr_model, lr_vectorizer, lr_metrics = tracker.train_sklearn_model(
    *split_inputs, **lr_params
)

# Experiment 2: Random Forest
print("🔄 Entrenando Random Forest...")
rf_model, rf_vectorizer, rf_metrics = tracker.train_sklearn_model(
    *split_inputs, **rf_params
)

# Cell 5: compare results.
# One row per model: the display name first, then that run's metrics.
results_df = pd.DataFrame([
    {"Modelo": model_label, **model_metrics}
    for model_label, model_metrics in (
        ("Logistic Regression", lr_metrics),
        ("Random Forest", rf_metrics),
    )
])
print(results_df)

2025/07/03 11:01:43 INFO mlflow.tracking.fluent: Experiment with name 'notebook-experiments' does not exist. Creating a new experiment.


🔄 Entrenando Logistic Regression...




✅ logistic - Hamming Loss: 0.0702, Jaccard Score: 0.3094
🔄 Entrenando Random Forest...




✅ random_forest - Hamming Loss: 0.1042, Jaccard Score: 0.0590
                Modelo  hamming_loss  jaccard_score
0  Logistic Regression      0.070188       0.309448
1        Random Forest      0.104167       0.059031
