In [None]:
import gzip
import requests
from io import BytesIO, StringIO
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score


import mlflow
import mlflow.sklearn

In [None]:
# Raw-content URL of the cleaned comments CSV that the loader below downloads.
# NOTE(review): the path is pinned to the `feature/eda` branch; presumably this
# breaks if that branch is deleted or the file is moved — confirm.
GITHUB_CLEAN_URL = "https://raw.githubusercontent.com/Bootcamp-IA-P4/Bootcamp-IA-P4-project-x-nlp-team-3/feature/eda/Data/comments_data_clean.csv"

def load_comments_data_from_github(url, timeout=30):
    """
    Download a CSV file over HTTP and parse it into a pandas DataFrame.

    Args:
        url: Address of the CSV file to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get`` so a stalled connection cannot
            block the notebook forever.

    Returns:
        The parsed DataFrame, or ``None`` if the download or the CSV parsing
        fails. The error is printed rather than raised, so callers must
        check for ``None`` before using the result.
    """
    print("🔗 Downloading data from GitHub...")

    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses (4xx/5xx) into exceptions so they are
        # reported below instead of being parsed as CSV content.
        response.raise_for_status()

        print("📊 Reading CSV file...")

        df = pd.read_csv(StringIO(response.text))

    except Exception as e:
        # Deliberately best-effort: report the failure and signal it with None.
        print(f"❌ Error while downloading data: {e}")
        return None

    else:
        print("✅ Data downloaded successfully!")
        return df

# Creating dataframe from GitHub URL
df = load_comments_data_from_github(GITHUB_CLEAN_URL)

# The loader returns None when the download or parsing fails. Stop here with
# an explicit message instead of letting a later cell fail on a None object.
if df is None:
    raise RuntimeError(f"Could not load comments data from {GITHUB_CLEAN_URL}")

🔗 Downloading data from GitHub...
❌ Error while downloading data: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/Bootcamp-IA-P4/Bootcamp-IA-P4-project-x-nlp-team-3/blob/feature/eda/Data/comments_data_clean.csv


In [None]:
TEXT_COLUMN = 'Text'      # column holding the comment text
TARGET_COLUMN = 'IsToxic' # label column; change it if the target has another name

# Keep only the text and target columns: every other column is dropped.
columnas_a_eliminar = [
    col
    for col in df.columns
    if col not in (TEXT_COLUMN, TARGET_COLUMN)
]
df = df.drop(columnas_a_eliminar, axis="columns")

# Turn the text into a sparse token-count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.loc[:, TEXT_COLUMN])
y = df.loc[:, TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

with mlflow.start_run(run_name="NaiveBayes_Models_Multinominal"):
    # Fit MultinomialNB on the training split and predict the held-out split.
    mnb = MultinomialNB().fit(X_train, y_train)
    y_pred_mnb = mnb.predict(X_test)

    # Accuracy plus class-support-weighted F1, recall and precision.
    acc_mnb = accuracy_score(y_test, y_pred_mnb)
    f1_score_mnb, recall_mnb, precision_mnb = (
        scorer(y_test, y_pred_mnb, average='weighted')
        for scorer in (f1_score, recall_score, precision_score)
    )
    print("MultinomialNB entrenado correctamente:")

    # Record the four scores on the active MLflow run, in the same order as before.
    for metric_name, metric_value in (
        ("accuracy", acc_mnb),
        ("f1_score", f1_score_mnb),
        ("recall", recall_mnb),
        ("precision", precision_mnb),
    ):
        mlflow.log_metric(metric_name, metric_value)

with mlflow.start_run(run_name="NaiveBayes_Models_Bernoulli"):
    # Train BernoulliNB on the training split and predict the held-out split.
    bnb = BernoulliNB()
    bnb.fit(X_train, y_train)
    y_pred_bnb = bnb.predict(X_test)

    # Every metric must be computed from this model's own predictions
    # (y_pred_bnb). Previously f1/recall/precision were computed from the
    # MultinomialNB predictions, so this run logged the wrong scores.
    acc_bnb = accuracy_score(y_test, y_pred_bnb)
    f1_score_bnb = f1_score(y_test, y_pred_bnb, average='weighted')
    recall_bnb = recall_score(y_test, y_pred_bnb, average='weighted')
    precision_bnb = precision_score(y_test, y_pred_bnb, average='weighted')
    print("BernoulliNB entrenado correctamente")

    # Record the scores on the active MLflow run.
    mlflow.log_metric("accuracy", acc_bnb)
    mlflow.log_metric("f1_score", f1_score_bnb)
    mlflow.log_metric("recall", recall_bnb)
    mlflow.log_metric("precision", precision_bnb)

with mlflow.start_run(run_name="NaiveBayes_Models_Gaussian"):
    # GaussianNB requires dense arrays, so the sparse count matrices are
    # converted first. toarray() materializes the full matrix in memory.
    X_train_dense = X_train.toarray()
    X_test_dense = X_test.toarray()
    gnb = GaussianNB()
    gnb.fit(X_train_dense, y_train)
    y_pred_gnb = gnb.predict(X_test_dense)

    # Every metric must be computed from this model's own predictions
    # (y_pred_gnb). Previously f1/recall/precision were computed from the
    # MultinomialNB predictions, so this run logged the wrong scores.
    acc_gnb = accuracy_score(y_test, y_pred_gnb)
    f1_score_gnb = f1_score(y_test, y_pred_gnb, average='weighted')
    recall_gnb = recall_score(y_test, y_pred_gnb, average='weighted')
    precision_gnb = precision_score(y_test, y_pred_gnb, average='weighted')
    print("GaussianNB entrenado correctamente")

    # Record the scores on the active MLflow run.
    mlflow.log_metric("accuracy", acc_gnb)
    mlflow.log_metric("f1_score", f1_score_gnb)
    mlflow.log_metric("recall", recall_gnb)
    mlflow.log_metric("precision", precision_gnb)