The objective of this notebook is to train a Bag-of-Words (BOW) + Multinomial Naive Bayes pipeline as our baseline model, so that later experiments have a reference to compare against.


## MLflow configuration

In [14]:
import mlflow
from mlflow.exceptions import RestException

In [15]:
# Prerequisite: the tracking server must already be running. Start it from a terminal with
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts

# Address of the local tracking server; every mlflow call below goes through it.
tracking_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(tracking_uri)

# Kept as the last expression so the notebook displays the experiments known to the server.
mlflow.search_experiments()

[<Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/experimentation/artifacts/1', creation_time=1733308701193, experiment_id='1', last_update_time=1733308701193, lifecycle_stage='active', name='spam-classifier', tags={'mlflow.note.content': 'This experiment contains mlruns for different '
                         'approaches in the ml lifecycle of an e-mail spam '
                         'detector classifier.',
  'project_name': 'spam-classifier',
  'project_quarter': 'Q4-2024',
  'project_stage': 'testing',
  'team': 'ml-team'}>,
 <Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/experimentation/artifacts/0', creation_time=1733308493408, experiment_id='0', last_update_time=1733308493408, lifecycle_stage='active', name='Default', tags={}>]

In [16]:
# Maintenance snippet, intentionally left disabled: uncomment the three lines below to
# delete a single run by its ID through the tracking client (replace the ID as needed).
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [17]:
# Name under which every run of this project is grouped on the tracking server.
experiment_name = "spam-classifier"

# Free-text description; stored in the "mlflow.note.content" tag of the experiment.
experiment_description = (
    "This experiment contains mlruns for different approaches in the ml lifecycle of an e-mail spam detector classifier."
)

# Tags attached to the experiment when it is created.
experiment_tags = {
    "project_name": "spam-classifier",
    "project_stage": "testing",
    "team": "ml-team",
    "project_quarter": "Q4-2024",
    "mlflow.note.content": experiment_description,
}

# Look the experiment up first and create it only when it is missing. This keeps the cell
# idempotent without relying on the wording of the server's "already exists" error message.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)

if existing_experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name, tags=experiment_tags)
    print(f"Experiment created with ID: {experiment_id}")
else:
    print(f"Experiment '{experiment_name}' already exists.")
    experiment_id = existing_experiment.experiment_id

print(f"Working with experiment ID: {experiment_id}")

Experiment 'spam-classifier' already exists.
Working with experiment ID: 1


## Datasets

In [18]:
import pandas as pd
from mlflow.models import infer_signature


# Read the two splits from the gold data folder.
train = pd.read_csv("../../data/gold/train.csv")
test = pd.read_csv("../../data/gold/test.csv")

# Split each frame into the model input column and the label column.
X_train, y_train = train["features"], train["target"]
X_test, y_test = test["features"], test["target"]

# Input/output schema logged together with the model; inferred from the training
# features and the training targets.
signature = infer_signature(X_train, y_train)



## Training model 

In [24]:
import mlflow.sklearn

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    fbeta_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    auc,
    balanced_accuracy_score,
)

import os

# Local folder for files produced by this notebook; created on first use.
artifact_root = "./artifacts_local/mlflow_artifacts"
os.makedirs(artifact_root, exist_ok=True)

# Baseline: unigram bag-of-words capped at 2000 features, followed by Multinomial Naive Bayes.
pipeline = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer(ngram_range=(1, 1), max_features=2000)),
        ("classifier", MultinomialNB()),
    ]
)

# fit() returns the fitted pipeline itself, so training and test prediction can be chained.
y_test_pred = pipeline.fit(X_train, y_train).predict(X_test)


In [25]:
# Plain and class-balanced accuracy of the test predictions.
test_accuracy = accuracy_score(y_test, y_test_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_test_pred)

# One print call, one metric per line.
print(
    f"Accuracy (Test Data): {test_accuracy:.2f} ",
    f"Balanced Accuracy (Test Data): {balanced_accuracy:.2f}",
    sep="\n",
)


Accuracy (Test Data): 0.97 
Balanced Accuracy (Test Data): 0.96


In [26]:
# F-beta with beta=0.5 (weights precision more than recall) plus the per-class report.
f0_5_score = fbeta_score(y_test, y_test_pred, beta=0.5)
test_classification_report = classification_report(y_test, y_test_pred)

print(f"F0.5-Score: {f0_5_score:.4f}")
print("Classification Report (Test Data):", test_classification_report, sep="\n")

F0.5-Score: 0.8451
Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       453
           1       0.82      0.95      0.88        63

    accuracy                           0.97       516
   macro avg       0.91      0.96      0.93       516
weighted avg       0.97      0.97      0.97       516



Notes:

- Precision on 1 (spam): 82% of the e-mails predicted as class 1 are actually class 1.
- Recall on 1 (spam): 95% of the actual class 1 instances were correctly classified as 1.
- F1-score on 1: 0.88, so the model is doing a decent job on the positive class as well.

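- Accuracy and macro avg can be ignored here: the test set is imbalanced (453 class-0 vs. 63 class-1 samples), so the per-class metrics for class 1 are more informative.

- Weighted avg: each class is weighted by its support, so it is dominated by class 0 (453 of 516 samples).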



In [None]:
# Confusion matrix of the test predictions, drawn with readable class names.
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=test_conf_matrix,
    display_labels=["Ham", "Spam"],
)
cm_display.plot(cmap="Blues")

# Set the title on the axes the display drew on rather than through pyplot's current axes.
cm_display.ax_.set_title("Confusion Matrix (Test Data)")
plt.show()

In [None]:
# Second column of predict_proba: the probability assigned to the second class of the pipeline.
y_test_pred_prob = pipeline.predict_proba(X_test)[:, 1]

# Precision/recall pairs over all decision thresholds and the area under that curve.
precision, recall, _ = precision_recall_curve(y_test, y_test_pred_prob)
pr_auc = auc(recall, precision)
print(f"Precision-Recall AUC (Test Data): {pr_auc:.4f}")

# Same figure as before, built through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, color="b", label=f"PR AUC = {pr_auc:.4f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve (Test Data)")
ax.legend(loc="best")
ax.grid(True)
plt.show()

## MLflow tracking

In [189]:

# Log the baseline run to MLflow.
#
# Every metric and artifact logged below is computed inside this cell from objects created
# by earlier cells (pipeline, X_train/y_train, X_test/y_test, artifact_root, signature).
# Nothing relies on leftover kernel variables, so the cell survives Restart & Run All.

# The F1 and ROC helpers are not imported by the earlier cells, so import them here.
from sklearn.metrics import f1_score, roc_auc_score, roc_curve

os.makedirs(artifact_root, exist_ok=True)

run_metrics = {}          # metric name -> value, e.g. "train_accuracy"
conf_matrix_paths = {}    # split name -> path of the saved confusion-matrix image

# One ROC figure shared by both splits so train and test can be compared directly.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))

for split_name, (X_split, y_split) in {
    "train": (X_train, y_train),
    "test": (X_test, y_test),
}.items():
    split_pred = pipeline.predict(X_split)
    # Second column of predict_proba, used as the score of class 1 for ROC purposes.
    split_score = pipeline.predict_proba(X_split)[:, 1]

    run_metrics[f"{split_name}_accuracy"] = float(accuracy_score(y_split, split_pred))
    run_metrics[f"{split_name}_roc_auc"] = float(roc_auc_score(y_split, split_score))
    run_metrics[f"{split_name}_f1"] = float(f1_score(y_split, split_pred))

    # Confusion matrix image for this split.
    cm_fig, cm_ax = plt.subplots()
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_split, split_pred),
        display_labels=["Ham", "Spam"],
    ).plot(cmap="Blues", ax=cm_ax)
    cm_ax.set_title(f"Confusion Matrix ({split_name.capitalize()} Data)")
    conf_matrix_paths[split_name] = os.path.join(
        artifact_root, f"{split_name}_confusion_matrix.png"
    )
    cm_fig.savefig(conf_matrix_paths[split_name], bbox_inches="tight")
    plt.close(cm_fig)  # saved to disk; no need to render it inline again

    # Add this split's ROC curve to the shared figure.
    fpr, tpr, _thresholds = roc_curve(y_split, split_score)
    roc_ax.plot(
        fpr,
        tpr,
        label=f"{split_name} (ROC AUC = {run_metrics[f'{split_name}_roc_auc']:.4f})",
    )

roc_ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="chance")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve (Train vs Test)")
roc_ax.legend(loc="best")
roc_ax.grid(True)
roc_curve_path = os.path.join(artifact_root, "roc_curve.png")
roc_fig.savefig(roc_curve_path, bbox_inches="tight")
plt.close(roc_fig)

mlflow.set_experiment(experiment_name)

with mlflow.start_run(run_name="baseline-model") as run:

    mlflow.set_tag("model", "Reference model MultinomialNB + BOW")

    # Datasets, BOW and model
    mlflow.log_params(
        {
            "data_folder": "../../data/gold/",
            "train_file": "train.csv",
            "test_file": "test.csv",
            "vectorizer_type": "CountVectorizer",
            "model_type": "MultinomialNB",
        }
    )

    # Metrics: train_/test_ accuracy, roc_auc and f1 (same keys as before)
    mlflow.log_metrics(run_metrics)

    # Artifacts
    mlflow.log_artifact(conf_matrix_paths["train"], artifact_path="confusion_matrices")
    mlflow.log_artifact(conf_matrix_paths["test"], artifact_path="confusion_matrices")
    mlflow.log_artifact(roc_curve_path, artifact_path="roc_curves")

    # Pipeline
    mlflow.sklearn.log_model(pipeline, "pipeline", signature=signature)

    print(f"Pipeline logged to MLflow under run ID {run.info.run_id}")

Pipeline logged to MLflow under run ID cb986f7efd3d42eb8a5d327af93ba59f
🏃 View run baseline-model at: http://127.0.0.1:5000/#/experiments/1/runs/cb986f7efd3d42eb8a5d327af93ba59f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Conclusions:

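- Clear overfitting: the train metrics are noticeably better than the test metrics (compare the `train_*` and `test_*` values logged in the MLflow run). TODO: report the train-set values in this notebook to back this up.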