The objectiv of this notebook is to run BOW and Naive Bayes as our baseline model to have a reference in our experimentation


## Libraries

In [1]:
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
import mlflow
import mlflow.sklearn


In [2]:
train = pd.read_csv("../data/gold/train.csv")
test = pd.read_csv("../data/gold/test.csv")

X_train = train['features']
y_train = train['target']  
X_test = test['features']
y_test = test['target']


In [3]:
# on the console
# mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts_local

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.search_experiments()

[<Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/artifacts_local/1', creation_time=1732985523797, experiment_id='1', last_update_time=1732985523797, lifecycle_stage='active', name='MNB-BaseModel', tags={'description': 'Base model with BOW + Multinomial Naive Bayes.'}>,
 <Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/artifacts_local/0', creation_time=1732985519645, experiment_id='0', last_update_time=1732985519645, lifecycle_stage='active', name='Default', tags={}>]

In [4]:
print (f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [None]:
def train_model(X_train, y_train, X_test, y_test):
    # Establecer el experimento
    mlflow.set_experiment("spam-detector")
    mlflow.set_experiment_tag("description", "Base model with BOW + Multinomial Naive Bayes.")

    
    with mlflow.start_run(run_name="bow_naiveBayes") as run:
        # Registrar parámetros generales
        mlflow.log_param("data_folder", "../data/gold/")
        mlflow.log_param("train_file", "train.csv")
        mlflow.log_param("test_file", "test.csv")
        mlflow.log_param("vectorizer_type", "CountVectorizer")
        mlflow.log_param("model_type", "MultinomialNB")
        
        # Crear vectorizador y modelo en pipeline
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer(ngram_range=(1, 1), max_features=2000)),
            ('classifier', MultinomialNB())
        ])
        
        # Entrenar el modelo
        pipeline.fit(X_train, y_train)
        
        # Predecir en el conjunto de prueba
        y_test_pred = pipeline.predict(X_test)
        y_test_prob = pipeline.predict_proba(X_test)[:, 1]
        
        # Calcular métricas
        test_accuracy = accuracy_score(y_test, y_test_pred)
        roc_auc = roc_auc_score(y_test, y_test_prob)
        fpr, tpr, _ = roc_curve(y_test, y_test_prob)
        
        # Mostrar resultados
        print("Test Accuracy:", test_accuracy)
        print("\nClassification Report (Test):")
        print(classification_report(y_test, y_test_pred))
        
        # Guardar la curva ROC
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
        plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        plt.legend(loc="lower right")
        plt.close()
        
        # Registrar artefactos y métricas
        mlflow.log_artifact("roc_curve.png")
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("roc_auc", roc_auc)
        
        # Registrar el pipeline completo en MLflow
        mlflow.sklearn.log_model(pipeline, "pipeline")
        
        print(f"Pipeline logged to MLflow under run ID {run.info.run_id}")

In [6]:
train_model(X_train, y_train, X_test, y_test)

Test Accuracy: 0.9689922480620154

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       453
           1       0.82      0.95      0.88        63

    accuracy                           0.97       516
   macro avg       0.91      0.96      0.93       516
weighted avg       0.97      0.97      0.97       516

🏃 View run bow_naiveBayes at: http://127.0.0.1:5000/#/experiments/1/runs/efe283ea66e24b2f9e1ce7ffafe05a51
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


FileNotFoundError: [Errno 2] No such file or directory: 'roc_curve.png'

In [None]:
mlflow experiments list
