In [6]:
import gzip
import requests
from io import BytesIO, StringIO
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV

import joblib

import mlflow
import mlflow.sklearn

In [7]:
GITHUB_CLEAN_URL = "https://raw.githubusercontent.com/Bootcamp-IA-P4/Bootcamp-IA-P4-project-x-nlp-team-3/feature/eda/Data/comments_data_clean.csv"

def load_comments_data_from_github(url):
    """
    Downloading and processing comments data from GitHub repository.
    """
    print("🔗 Downloading data from GitHub...")

    try:
        response = requests.get(url)
        response.raise_for_status()

        print("📊 Reading CSV file...")

        df = pd.read_csv(StringIO(response.text))

        print("✅ Data downloaded successfully!")
        return df

    except Exception as e:
        print(f"❌ Error while downloading data: {e}")
        return None

# Creating dataframe from GitHub URL
df = load_comments_data_from_github(GITHUB_CLEAN_URL)

🔗 Downloading data from GitHub...
📊 Reading CSV file...
✅ Data downloaded successfully!


In [12]:
# Preprocesamiento
TEXT_COLUMN = 'Text'      # Cambia si tu columna de texto tiene otro nombre
TARGET_COLUMN = 'IsToxic' # Cambia si tu columna objetivo tiene otro nombre

# Vectorización del texto
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df[TEXT_COLUMN])
y = df[TARGET_COLUMN]

# División de datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optimización de hiperparámetros para MultinomialNB y BernoulliNB
alphas = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]

# MultinomialNB
param_grid_mnb = {'alpha': alphas}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=5, scoring='f1_weighted')
grid_mnb.fit(X_train, y_train)
best_alpha_mnb = grid_mnb.best_params_['alpha']
print(f"Mejor alpha MultinomialNB: {best_alpha_mnb}")
print(f"Mejor F1 MultinomialNB: {grid_mnb.best_score_:.4f}")

# BernoulliNB
param_grid_bnb = {'alpha': alphas}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=5, scoring='f1_weighted')
grid_bnb.fit(X_train, y_train)
best_alpha_bnb = grid_bnb.best_params_['alpha']
print(f"Mejor alpha BernoulliNB: {best_alpha_bnb}")
print(f"Mejor F1 BernoulliNB: {grid_bnb.best_score_:.4f}")

# GaussianNB (requiere arrays densos)
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()
param_grid_gnb = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}
grid_gnb = GridSearchCV(GaussianNB(), param_grid_gnb, cv=5, scoring='f1_weighted')
grid_gnb.fit(X_train_dense, y_train)
best_vs_gnb = grid_gnb.best_params_['var_smoothing']
print(f"Mejor var_smoothing GaussianNB: {best_vs_gnb}")
print(f"Mejor F1 GaussianNB: {grid_gnb.best_score_:.4f}")

# Entrenamiento y evaluación final con los mejores hiperparámetros
with mlflow.start_run(run_name="NaiveBayes_Models_Multinominal_Optimizado"):
    mnb = MultinomialNB(alpha=best_alpha_mnb)
    mnb.fit(X_train, y_train)
    y_pred_mnb = mnb.predict(X_test)
    acc_mnb = accuracy_score(y_test, y_pred_mnb)
    f1_score_mnb = f1_score(y_test, y_pred_mnb, average='weighted')
    recall_mnb = recall_score(y_test, y_pred_mnb, average='weighted')
    precision_mnb = precision_score(y_test, y_pred_mnb, average='weighted')
    print("MultinomialNB optimizado entrenado correctamente:")
    print(f"Accuracy: {acc_mnb:.4f} | F1: {f1_score_mnb:.4f} | Recall: {recall_mnb:.4f} | Precision: {precision_mnb:.4f}")

    # Evaluación en entrenamiento y prueba para MultinomialNB
    train_pred_mnb = mnb.predict(X_train)
    test_pred_mnb = mnb.predict(X_test)
    train_f1_mnb = f1_score(y_train, train_pred_mnb, average='weighted')
    test_f1_mnb = f1_score(y_test, test_pred_mnb, average='weighted')
    overfit_f1_mnb = train_f1_mnb - test_f1_mnb
    print(f"MultinomialNB F1 Entrenamiento: {train_f1_mnb:.4f} | F1 Prueba: {test_f1_mnb:.4f}")

    mlflow.log_param("alpha", best_alpha_mnb)
    mlflow.log_metric("accuracy", acc_mnb)
    mlflow.log_metric("f1_score", f1_score_mnb)
    mlflow.log_metric("recall", recall_mnb)
    mlflow.log_metric("precision", precision_mnb)
    mlflow.log_metric("overfit_f1", overfit_f1_mnb)

with mlflow.start_run(run_name="NaiveBayes_Models_Bernoulli_Optimizado"):
    bnb = BernoulliNB(alpha=best_alpha_bnb)
    bnb.fit(X_train, y_train)
    y_pred_bnb = bnb.predict(X_test)
    acc_bnb = accuracy_score(y_test, y_pred_bnb)
    f1_score_bnb = f1_score(y_test, y_pred_bnb, average='weighted')
    recall_bnb = recall_score(y_test, y_pred_bnb, average='weighted')
    precision_bnb = precision_score(y_test, y_pred_bnb, average='weighted')
    print("BernoulliNB optimizado entrenado correctamente:")
    print(f"Accuracy: {acc_bnb:.4f} | F1: {f1_score_bnb:.4f} | Recall: {recall_bnb:.4f} | Precision: {precision_bnb:.4f}")

    train_pred_bnb = bnb.predict(X_train)
    test_pred_bnb = bnb.predict(X_test)
    train_f1_bnb = f1_score(y_train, train_pred_bnb, average='weighted')
    test_f1_bnb = f1_score(y_test, test_pred_bnb, average='weighted')
    overfit_f1_bnb = train_f1_bnb - test_f1_bnb
    print(f"BernoulliNB F1 Entrenamiento: {train_f1_mnb:.4f} | F1 Prueba: {test_f1_mnb:.4f}")

    mlflow.log_param("alpha", best_alpha_bnb)
    mlflow.log_metric("accuracy", acc_bnb)
    mlflow.log_metric("f1_score", f1_score_bnb)
    mlflow.log_metric("recall", recall_bnb)
    mlflow.log_metric("precision", precision_bnb)
    mlflow.log_metric("overfit_f1", overfit_f1_bnb)

with mlflow.start_run(run_name="NaiveBayes_Models_Gaussian_Optimizado"):
    gnb = GaussianNB(var_smoothing=best_vs_gnb)
    gnb.fit(X_train_dense, y_train)
    y_pred_gnb = gnb.predict(X_test_dense)
    acc_gnb = accuracy_score(y_test, y_pred_gnb)
    f1_score_gnb = f1_score(y_test, y_pred_gnb, average='weighted')
    recall_gnb = recall_score(y_test, y_pred_gnb, average='weighted')
    precision_gnb = precision_score(y_test, y_pred_gnb, average='weighted')
    print("GaussianNB optimizado entrenado correctamente:")
    print(f"Accuracy: {acc_gnb:.4f} | F1: {f1_score_gnb:.4f} | Recall: {recall_gnb:.4f} | Precision: {precision_gnb:.4f}")

    #Evaluación en entrenamiento y prueba para GaussianNB
    train_pred_gnb = gnb.predict(X_train_dense)
    test_pred_gnb = gnb.predict(X_test_dense)
    train_f1_gnb = f1_score(y_train, train_pred_gnb, average='weighted')
    test_f1_gnb = f1_score(y_test, test_pred_gnb, average='weighted')
    overfit_f1_gnb = train_f1_gnb - test_f1_gnb
    print(f"GaussianNB F1 Entrenamiento: {train_f1_gnb:.4f} | F1 Prueba: {test_f1_gnb:.4f}")

    mlflow.log_param("var_smoothing", best_vs_gnb)
    mlflow.log_metric("accuracy", acc_gnb)
    mlflow.log_metric("f1_score", f1_score_gnb)
    mlflow.log_metric("recall", recall_gnb)
    mlflow.log_metric("precision", precision_gnb)
    mlflow.log_metric("overfit_f1", overfit_f1_gnb)


Mejor alpha MultinomialNB: 1.0
Mejor F1 MultinomialNB: 0.6795
Mejor alpha BernoulliNB: 0.5
Mejor F1 BernoulliNB: 0.6830
Mejor var_smoothing GaussianNB: 1e-09
Mejor F1 GaussianNB: 0.5430
MultinomialNB optimizado entrenado correctamente:
Accuracy: 0.7650 | F1: 0.7649 | Recall: 0.7650 | Precision: 0.7713
MultinomialNB F1 Entrenamiento: 0.9410 | F1 Prueba: 0.7649
BernoulliNB optimizado entrenado correctamente:
Accuracy: 0.7400 | F1: 0.7398 | Recall: 0.7400 | Precision: 0.7469
BernoulliNB F1 Entrenamiento: 0.9410 | F1 Prueba: 0.7649
GaussianNB optimizado entrenado correctamente:
Accuracy: 0.6000 | F1: 0.5897 | Recall: 0.6000 | Precision: 0.6006
GaussianNB F1 Entrenamiento: 0.9023 | F1 Prueba: 0.5897


In [9]:
from sklearn.pipeline import Pipeline

# Crear un pipeline que incluya la vectorización y el modelo MultinomialNB optimizado
pipeline_mnb = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB(alpha=best_alpha_mnb))
])

# Entrenar el pipeline con los datos originales de texto
pipeline_mnb.fit(df[TEXT_COLUMN], df[TARGET_COLUMN])

# Guardar el pipeline entrenado en un archivo .pkl
joblib.dump(pipeline_mnb, 'pipeline_multinomial_nb.pkl')
print("Pipeline MultinomialNB guardado como pipeline_multinomial_nb.pkl")

Pipeline MultinomialNB guardado como pipeline_multinomial_nb.pkl


In [10]:
# Entrena el modelo MultinomialNB con los mejores hiperparámetros (ajusta si usaste GridSearchCV)
best_mnb = MultinomialNB(alpha=1.0)  # Cambia alpha si tienes el mejor valor de GridSearchCV
best_mnb.fit(X_train, y_train)

# Guarda el modelo entrenado en un archivo .pkl
joblib.dump(best_mnb, 'multinomial_nb_model.pkl')
print("Modelo MultinomialNB guardado como multinomial_nb_model.pkl")

Modelo MultinomialNB guardado como multinomial_nb_model.pkl
