In [35]:
# Importar librerías mágicas
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, f1_score, classification_report
import optuna
import joblib
import xgboost as xgb
import os

In [36]:
df=pd.read_csv("https://raw.githubusercontent.com/fintihlupik/NLP-sentimental/refs/heads/master/data/youtoxic_english_1000%20-%20youtoxic_english_1000.csv")


In [37]:
# Definir directorios
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
data_dir = os.path.join(BASE_DIR, 'processed')  # Corregido
models_dir = os.path.join(BASE_DIR, 'models')

# Cargar datos vectorizados
X_train = joblib.load(os.path.join(data_dir, 'X_train_tfidf.pkl'))
X_test = joblib.load(os.path.join(data_dir, 'X_test_tfidf.pkl'))

# Cargar etiquetas
y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))['IsToxic']
y_test = pd.read_csv(os.path.join(data_dir, 'y_test.csv'))['IsToxic']

In [38]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, f1_score, classification_report

# Configuración de validación cruzada estratificada
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Modelo inicial con Naive Bayes
nb_model = MultinomialNB()

# Evaluación inicial con validación cruzada
y_pred_proba = cross_val_predict(nb_model, X_train, y_train, cv=skf, method='predict_proba')[:, 1]

# Calcular métricas de evaluación
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
f1_scores = 2 * recall * precision / (recall + precision + 1e-6)

# Imprimir resultados iniciales
print(f"F1-Score inicial: {max(f1_scores):.4f}")

# Reporte de clasificación inicial
optimal_threshold = thresholds[np.argmax(f1_scores)]
y_pred = (y_pred_proba >= optimal_threshold).astype(int)
print("Reporte de clasificación inicial:")
print(classification_report(y_train, y_pred))

F1-Score inicial: 0.7077
Reporte de clasificación inicial:
              precision    recall  f1-score   support

           0       0.85      0.42      0.56       430
           1       0.58      0.92      0.71       370

    accuracy                           0.65       800
   macro avg       0.72      0.67      0.64       800
weighted avg       0.73      0.65      0.63       800



In [39]:
import optuna

# Definir función objetivo para Optuna
def objective(trial):
    # Sugerir un valor para el hiperparámetro alpha
    alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
    
    # Crear el modelo con el valor de alpha sugerido
    model = MultinomialNB(alpha=alpha)
    
    # Validación cruzada estratificada
    y_pred_proba = cross_val_predict(model, X_train, y_train, cv=skf, method='predict_proba')[:, 1]
    
    # Calcular F1-score
    precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
    f1_scores = 2 * recall * precision / (recall + precision + 1e-6)
    
    # Retornar el mejor F1-score
    return max(f1_scores)

# Crear un estudio de Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Ejecutar 50 pruebas

# Obtener el mejor hiperparámetro
best_alpha = study.best_params['alpha']
print(f"Mejor valor de alpha: {best_alpha}")

[I 2025-07-07 14:10:29,516] A new study created in memory with name: no-name-b9de55ed-df98-471d-bf23-2d9bb59aa9e9
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
[I 2025-07-07 14:10:29,747] Trial 0 finished with value: 0.6951359250209992 and parameters: {'alpha': 0.0049281114939941926}. Best is trial 0 with value: 0.6951359250209992.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
[I 2025-07-07 14:10:29,989] Trial 1 finished with value: 0.7073469501514792 and parameters: {'alpha': 7.600564748763602}. Best is trial 1 with value: 0.7073469501514792.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
[I 2025-07-07 14:10:30,156] Trial 2 finished with value: 0.7113767519814103 and parameters: {'alpha': 0.11678677562935426}. Best is trial 2 with value: 0.7113767519814103.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 10.0)
[I 2025-07-07 14:10:30,360] Trial 3 finished with value: 0.7061606450330247 and parameters: {'alpha': 0.011845194122068805}. Best is trial 2 with 

Mejor valor de alpha: 0.48942713832142165


In [40]:
# Entrenar el modelo con el mejor valor de alpha encontrado por Optuna
best_model = MultinomialNB(alpha=best_alpha)
best_model.fit(X_train, y_train)

# Obtener las probabilidades predichas en el conjunto de entrenamiento
y_pred_proba = best_model.predict_proba(X_train)[:, 1]

# Calcular precisión, recall y F1-score para diferentes umbrales
precision, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)
f1_scores = 2 * recall * precision / (recall + precision + 1e-6)

# Encontrar el umbral que maximiza el F1-score
optimal_threshold = thresholds[np.argmax(f1_scores)]
print(f"Mejor umbral para F1-score: {optimal_threshold:.4f}")

# Evaluar el modelo con el umbral óptimo
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
print("Reporte de clasificación con umbral optimizado:")
print(classification_report(y_train, y_pred_optimal))

2025/07/07 14:10:39 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'db5ec86d8c1b471489e2b42f7b65179a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run silent-pig-636 at: http://localhost:5000/#/experiments/168441151506597017/runs/db5ec86d8c1b471489e2b42f7b65179a
🧪 View experiment at: http://localhost:5000/#/experiments/168441151506597017
Mejor umbral para F1-score: 0.4902
Reporte de clasificación con umbral optimizado:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       430
           1       0.96      0.98      0.97       370

    accuracy                           0.97       800
   macro avg       0.97      0.97      0.97       800
weighted avg       0.97      0.97      0.97       800



In [41]:
from sklearn.metrics import accuracy_score

# 1. Evaluación con el modelo inicial (umbral predeterminado de 0.5)
initial_model = MultinomialNB()
initial_model.fit(X_train, y_train)
y_pred_initial = initial_model.predict(X_train)
initial_f1 = f1_score(y_train, y_pred_initial)
initial_accuracy = accuracy_score(y_train, y_pred_initial)

# 2. Evaluación con el modelo optimizado (mejor alpha, umbral predeterminado de 0.5)
optimized_model = MultinomialNB(alpha=best_alpha)
optimized_model.fit(X_train, y_train)
y_pred_optimized = optimized_model.predict(X_train)
optimized_f1 = f1_score(y_train, y_pred_optimized)
optimized_accuracy = accuracy_score(y_train, y_pred_optimized)

# 3. Evaluación con el modelo optimizado y el mejor umbral
y_pred_best_threshold = (y_pred_proba >= optimal_threshold).astype(int)
best_threshold_f1 = f1_score(y_train, y_pred_best_threshold)
best_threshold_accuracy = accuracy_score(y_train, y_pred_best_threshold)

# Comparación de métricas
print("Comparación de métricas:")
print(f"{'Modelo':<30}{'F1-Score':<15}{'Accuracy':<15}")
print(f"{'Modelo inicial (umbral 0.5)':<30}{initial_f1:<15.4f}{initial_accuracy:<15.4f}")
print(f"{'Modelo optimizado (umbral 0.5)':<30}{optimized_f1:<15.4f}{optimized_accuracy:<15.4f}")
print(f"{'Modelo optimizado (mejor umbral)':<30}{best_threshold_f1:<15.4f}{best_threshold_accuracy:<15.4f}")

2025/07/07 14:11:14 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '42fce0316cf645b1ba71843bb94394c5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run amusing-grouse-227 at: http://localhost:5000/#/experiments/168441151506597017/runs/42fce0316cf645b1ba71843bb94394c5
🧪 View experiment at: http://localhost:5000/#/experiments/168441151506597017


2025/07/07 14:11:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '228f46f7150e4eaba316183c90b65cac', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run sincere-sow-120 at: http://localhost:5000/#/experiments/168441151506597017/runs/228f46f7150e4eaba316183c90b65cac
🧪 View experiment at: http://localhost:5000/#/experiments/168441151506597017
Comparación de métricas:
Modelo                        F1-Score       Accuracy       
Modelo inicial (umbral 0.5)   0.9392         0.9450         
Modelo optimizado (umbral 0.5)0.9636         0.9663         
Modelo optimizado (mejor umbral)0.9665         0.9688         


In [42]:
# Selección del mejor modelo según F1-score
if best_threshold_f1 >= max(initial_f1, optimized_f1):
    final_model = optimized_model
    final_threshold = optimal_threshold
    print("El modelo optimizado con el mejor umbral fue seleccionado como el mejor modelo.")
else:
    final_model = optimized_model if optimized_f1 > initial_f1 else initial_model
    final_threshold = 0.5
    print("El modelo optimizado o inicial con umbral 0.5 fue seleccionado como el mejor modelo.")

# Entrenar el modelo final con los datos de entrenamiento completos
final_model.fit(X_train, y_train)

# Evaluar el modelo final en el conjunto de prueba
y_test_proba = final_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= final_threshold).astype(int)

# Reporte de clasificación en el conjunto de prueba
from sklearn.metrics import classification_report
print("Reporte de clasificación en el conjunto de prueba:")
print(classification_report(y_test, y_test_pred))

El modelo optimizado con el mejor umbral fue seleccionado como el mejor modelo.


2025/07/07 14:11:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '69c593107026418e8b8eca2e3b1d5b08', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run smiling-gnu-408 at: http://localhost:5000/#/experiments/168441151506597017/runs/69c593107026418e8b8eca2e3b1d5b08
🧪 View experiment at: http://localhost:5000/#/experiments/168441151506597017
Reporte de clasificación en el conjunto de prueba:
              precision    recall  f1-score   support

           0       0.68      0.73      0.70       108
           1       0.65      0.59      0.62        92

    accuracy                           0.67       200
   macro avg       0.66      0.66      0.66       200
weighted avg       0.66      0.67      0.66       200



In [43]:
models_dir = os.path.join(BASE_DIR, 'models')
print(f"Ruta de models_dir: {models_dir}")

Ruta de models_dir: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data\models


In [44]:
import joblib
import os

# Ajustar la ruta de models_dir para que apunte fuera de 'data'
models_dir = os.path.join(BASE_DIR, 'models')  # Asegúrate de que apunta al directorio correcto

# Crear la carpeta 'models' si no existe
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"Carpeta 'models' creada en: {models_dir}")
else:
    print(f"Usando la carpeta existente: {models_dir}")

# Ruta para guardar el modelo
model_path = os.path.join(models_dir, 'naive_bayes_best_model.pkl')

# Guardar el modelo final
joblib.dump(final_model, model_path)
print(f"Modelo guardado en: {model_path}")

Usando la carpeta existente: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data\models
Modelo guardado en: c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\data\models\naive_bayes_best_model.pkl


In [51]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    ExtraTreesClassifier
)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, f1_score, recall_score
)
from sklearn.model_selection import ParameterGrid
from imblearn.over_sampling import SMOTE
from sklearn.base import clone

# ✅ Llamada correcta a autolog
mlflow.autolog(log_models=False)

# Resto del código...
 # Logueamos modelos manualmente

# MLflow config
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Full Model Benchmarking")

# Cargar datos
df = pd.read_csv("data/dataset.csv")
X = df.drop("stroke", axis=1)
y = df["stroke"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Hiperparámetros
params = {
    "RandomForestClassifier": {'n_estimators': [135], 'max_depth': [15], 'min_samples_split': [7],
                               'min_samples_leaf': [6], 'max_features': ['sqrt'], 'bootstrap': [True], 'random_state': [42]},
    "KNeighborsClassifier": {'n_neighbors': [7], 'weights': ['uniform'], 'metric': ['minkowski']},
    "GradientBoostingClassifier": {'n_estimators': [110], 'learning_rate': [0.5], 'max_depth': [15],
                                   'min_samples_split': [8], 'min_samples_leaf': [6], 'subsample': [1],
                                   'max_features': ['sqrt'], 'random_state': [42]},
    "LGBMClassifier": {'n_estimators': [125], 'learning_rate': [0.1], 'num_leaves': [31], 'max_depth': [-1],
                       'min_child_samples': [20], 'subsample': [1], 'colsample_bytree': [1], 'random_state': [42]},
    "XGBClassifier": {'n_estimators': [125], 'learning_rate': [1], 'max_depth': [16], 'subsample': [1],
                      'colsample_bytree': [1], 'gamma': [0], 'reg_alpha': [0.1], 'reg_lambda': [1],
                      'use_label_encoder': [False], 'eval_metric': ['logloss'], 'random_state': [42]},
    "LogisticRegression": {'C': [1.0], 'max_iter': [100], 'solver': ['lbfgs'], 'random_state': [42]},
    "GaussianNB": [{}],
    "SVC": {'C': [1.0], 'kernel': ['rbf'], 'probability': [True], 'random_state': [42]},
    "ExtraTreesClassifier": {'n_estimators': [100], 'max_depth': [10], 'random_state': [42]}
}

# Modelos
from sklearn.base import clone

modelos = {
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(),
    "LGBMClassifier": LGBMClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "GaussianNB": GaussianNB(),
    "SVC": SVC(),
    "ExtraTreesClassifier": ExtraTreesClassifier(),
}

# Entrenamiento y tracking
for nombre, modelo in modelos.items():
    grid = params.get(nombre, [{}])
    for param_set in ParameterGrid(grid):
        m = clone(modelo)
        m.set_params(**param_set)

        with mlflow.start_run(run_name=nombre):
            print(f"Entrenando {nombre} con parámetros: {param_set}")
            m.fit(X_train_res, y_train_res)

            y_train_pred = m.predict(X_train_res)
            y_test_pred = m.predict(X_test)

            acc_train = accuracy_score(y_train_res, y_train_pred)
            acc_test = accuracy_score(y_test, y_test_pred)
            f1_train = f1_score(y_train_res, y_train_pred, average='weighted')
            f1_test = f1_score(y_test, y_test_pred, average='weighted')
            acc_gap = acc_train - acc_test
            f1_gap = f1_train - f1_test

            report = classification_report(y_test, y_test_pred, output_dict=True)
            recall_0 = report.get('0.0', {}).get('recall', 0)
            recall_1 = report.get('1.0', {}).get('recall', 0)

            # MLflow logs
            mlflow.log_param("modelo", nombre)
            mlflow.log_params(param_set)
            mlflow.log_metric("accuracy_train", acc_train)
            mlflow.log_metric("accuracy_test", acc_test)
            mlflow.log_metric("f1_train", f1_train)
            mlflow.log_metric("f1_test", f1_test)
            mlflow.log_metric("acc_gap", acc_gap)
            mlflow.log_metric("f1_gap", f1_gap)
            mlflow.log_metric("recall_0", recall_0)
            mlflow.log_metric("recall_1", recall_1)

            # Guardar el modelo
            mlflow.sklearn.log_model(m, artifact_path="modelo")



ImportError: cannot import name 'SPARSE_ARRAY_PRESENT' from 'sklearn.utils.fixes' (c:\Users\Administrator\Desktop\proyecto10\nlp_grupo_5_proyecto_10\.venv\Lib\site-packages\sklearn\utils\fixes.py)

In [31]:
import psutil
import mlflow

with mlflow.start_run():
    # ... tu código del modelo ...

    ram_usage = psutil.virtual_memory().used / (1024 ** 3)  # en GB
    cpu_usage = psutil.cpu_percent()

    mlflow.log_metric("ram_used_gb", ram_usage)
    mlflow.log_metric("cpu_percent", cpu_usage)


🏃 View run popular-pig-977 at: http://localhost:5000/#/experiments/168441151506597017/runs/e866290e5a28448e8011f8c5a706787f
🧪 View experiment at: http://localhost:5000/#/experiments/168441151506597017
