# 📊 MLFlow Tracking - Experimentos de Hate Speech Detection

Este notebook demuestra cómo usar MLFlow para trackear experimentos de machine learning.


In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# Make the project's `src` directory importable from this notebook.
# The path is resolved relative to the current working directory.
project_root = Path('../').resolve()
src_dir = project_root / 'src'
sys.path.append(str(src_dir))

from models.train import train_model
from models.evaluate import evaluate_model
from features.vectorization import load_vectorized_data
from utils.mlflow_tracking import get_tracker


## 1. Cargar Datos Vectorizados


In [None]:
# Load the vectorized train/test splits stored under the 'tfidf' prefix
# in the processed-data directory.
X_train, X_test, y_train, y_test = load_vectorized_data(
    input_dir=Path('../data/processed'),
    prefix='tfidf',
)

# Print the shapes as a quick sanity check. The first message has no
# placeholders, so it is a plain string rather than an f-string.
print("✅ Datos cargados:")
print(f"   Train: {X_train.shape}")
print(f"   Test: {X_test.shape}")


## 2. Inicializar MLFlow Tracker


In [None]:
# Obtain the tracker bound to the "hate_speech_detection" experiment and
# echo the experiment name it reports.
tracker = get_tracker(experiment_name="hate_speech_detection")
print(f"✅ MLFlow tracker inicializado: {tracker.experiment_name}")


## 3. Entrenar y Registrar Modelos


In [None]:
# Train several models and log each one to MLflow.
# Each entry gives a display name, the `model_type` passed to train_model,
# and the hyperparameters forwarded to it as keyword arguments.
models_to_test = [
    {'name': 'svm', 'type': 'svm', 'params': {'C': 0.056, 'kernel': 'linear', 'class_weight': 'balanced'}},
    {'name': 'logistic', 'type': 'logistic', 'params': {'C': 0.1, 'penalty': 'l2', 'class_weight': 'balanced', 'max_iter': 1000}},
    {'name': 'naive_bayes', 'type': 'naive_bayes', 'params': {'alpha': 10.0}},
    {'name': 'random_forest', 'type': 'random_forest', 'params': {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 5, 'class_weight': 'balanced'}},
]

# Metrics returned by evaluate_model, keyed by model name.
results = {}

for model_config in models_to_test:
    print(f"\n🔧 Entrenando {model_config['name']}...")

    # Fit the model on the training split with this entry's hyperparameters.
    model = train_model(
        X_train, y_train,
        model_type=model_config['type'],
        **model_config['params']
    )

    # Evaluate on both splits; verbose=False leaves reporting to this cell.
    metrics = evaluate_model(
        model, X_train, X_test, y_train, y_test, verbose=False
    )

    results[model_config['name']] = metrics

    # Log the model, its metrics and its hyperparameters to MLflow.
    tracker.log_model_training(
        model=model,
        model_name=model_config['name'],
        metrics=metrics,
        params=model_config['params'],
        vectorizer_type='tfidf',
        tags={'experiment': 'model_comparison', 'vectorizer': 'tfidf'}
    )

    # NOTE(review): 'diff_f1' is printed as a percentage labelled
    # "Overfitting"; presumably the train/test F1 gap -- confirm in evaluate_model.
    print(f"✅ {model_config['name']} registrado en MLFlow")
    print(f"   F1-score (test): {metrics['test_f1']:.4f}")
    print(f"   Overfitting: {metrics['diff_f1']:.2f}%")


In [None]:
# Build one row per trained model from the metrics collected in `results`.
comparison_data = [
    {
        'Modelo': model_name,
        'F1 (test)': model_metrics['test_f1'],
        'F1 (train)': model_metrics['train_f1'],
        'Overfitting (%)': model_metrics['diff_f1'],
        'Accuracy (test)': model_metrics['test_accuracy'],
        'Precision (test)': model_metrics['test_precision'],
        'Recall (test)': model_metrics['test_recall'],
    }
    for model_name, model_metrics in results.items()
]

# Rank models by test F1, best first.
df_comparison = pd.DataFrame(comparison_data)
df_comparison = df_comparison.sort_values('F1 (test)', ascending=False)

# Print the table without the DataFrame index column.
print("\n📊 Comparación de Modelos:")
print(df_comparison.to_string(index=False))


## 4. Visualizar en MLFlow UI

Para ver los experimentos en la interfaz de MLFlow:

```bash
mlflow ui
```

Luego abre: http://localhost:5000
