The objectiv of this notebook is to run BOW and Naive Bayes as our baseline model to have a reference in our experimentation


## Libraries

In [8]:
import pandas as pd
import os
from joblib import dump
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, log_loss
import mlflow


In [9]:
train = pd.read_csv("../data/gold/train.csv")
test = pd.read_csv("../data/gold/test.csv")

X_train = train['features']
y_train = train['target']  
X_test = test['features']
y_test = test['target']


In [10]:
# on the console
# mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts_local

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.search_experiments()

[<Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/artifacts_local/1', creation_time=1732894073349, experiment_id='1', last_update_time=1732894073349, lifecycle_stage='active', name='MultinomialNaiveBayes-BaseModel', tags={}>,
 <Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/artifacts_local/0', creation_time=1732892867190, experiment_id='0', last_update_time=1732892867190, lifecycle_stage='active', name='Default', tags={}>]

In [11]:
print (f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [12]:
mlflow.search_experiments()

[<Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/artifacts_local/1', creation_time=1732894073349, experiment_id='1', last_update_time=1732894073349, lifecycle_stage='active', name='MultinomialNaiveBayes-BaseModel', tags={}>,
 <Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/artifacts_local/0', creation_time=1732892867190, experiment_id='0', last_update_time=1732892867190, lifecycle_stage='active', name='Default', tags={}>]

In [13]:
def train_model(X_train, y_train, X_test, y_test):
    mlflow.set_experiment("MultinomialNaiveBayes-BaseModel")
        
    with mlflow.start_run() as run:
        mlflow.log_param("data_folder", "../data/gold/")
        mlflow.log_param("train_file", "train.csv")
        mlflow.log_param("test_file", "test.csv")
        
        vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=2000)
        X_train_bow = vectorizer.fit_transform(X_train)

        X_test_bow = vectorizer.transform(X_test)
        
        model = MultinomialNB()
        model.fit(X_train_bow, y_train)

        y_test_pred = model.predict(X_test_bow)
        y_test_prob = model.predict_proba(X_test_bow)[:, 1]  
        
        test_accuracy = accuracy_score(y_test, y_test_pred)
        
        print("Test Accuracy:", test_accuracy)
        print("\nClassification Report (Test):")
        print(classification_report(y_test, y_test_pred))
        
        roc_auc = roc_auc_score(y_test, y_test_prob)
        fpr, tpr, _ = roc_curve(y_test, y_test_prob)
        
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
        plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        plt.legend(loc="lower right")
        plt.close()
        

        mlflow.log_artifact("artifacts_local", "roc_curve.png")
        
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_param("vectorizer_type", "CountVectorizer")
        mlflow.log_param("model_type", "MultinomialNB")
        

        mlflow.log_artifact("artifacts_local", "vectorizer.joblib")
        mlflow.log_artifact(("artifacts_local", "naive_bayes_model.joblib"))
        
        print(f"Model and vectorizer logged to MLflow under run ID {run.info.run_id}")


In [14]:
train_model(X_train, y_train, X_test, y_test)

Test Accuracy: 0.9689922480620154

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       453
           1       0.82      0.95      0.88        63

    accuracy                           0.97       516
   macro avg       0.91      0.96      0.93       516
weighted avg       0.97      0.97      0.97       516

🏃 View run powerful-hound-948 at: http://127.0.0.1:5000/#/experiments/1/runs/957c0326ea404d55920347cd11655781
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Error: [('artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png', '/home/maldu/dscience/projects/spam_detector/notebooks/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png', "[Errno 36] File name too long: '/home/maldu/dscience/projects/spam_detector/notebooks/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png/artifacts_local/1/957c0326ea404d55920347cd11655781/artifacts/roc_curve.png'")]