# Naive Bayes

Train and save a Naive Bayes model (Gaussian or Multinomial). Reads `data.csv` from the working directory if it exists (the label column must be named `target`); otherwise falls back to the Iris dataset.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score
import joblib
import os

# Naive Bayes flavour to train: 'gaussian' or 'multinomial'.
variant = 'gaussian'

# Use data.csv from the working directory when it exists; otherwise fall back
# to the Iris dataset shipped with scikit-learn.
if not os.path.exists('data.csv'):
    from sklearn.datasets import load_iris
    iris = load_iris(as_frame=True)
    df = iris.frame.copy()
    # Defensive: make sure the label column is present on the frame.
    if 'target' not in df.columns:
        df['target'] = iris.target
else:
    df = pd.read_csv('data.csv')

# The label column is expected to be called 'target'; all remaining columns
# are used as features.
target = 'target'
X = df.drop(columns=[target])
y = df[target]

# Partition the features by dtype. Every non-numeric column (including bool)
# is treated as categorical so that no feature is silently discarded:
# ColumnTransformer drops any column that is not listed in a transformer.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# MultinomialNB refuses negative feature values, and StandardScaler produces
# them by centring on the mean. Rescale to the [0, 1] range for that variant.
if variant == 'gaussian':
    numeric_scaler = StandardScaler()
else:
    from sklearn.preprocessing import MinMaxScaler
    numeric_scaler = MinMaxScaler()

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', numeric_scaler),
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

transformers = [('num', numeric_pipeline, numeric_cols)]
if categorical_cols:
    transformers.append(('cat', categorical_pipeline, categorical_cols))

# sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse
# matrix by default, and GaussianNB cannot be fitted on sparse input.
preprocessor = ColumnTransformer(transformers=transformers, sparse_threshold=0)

# Hold out 25% of the rows for evaluation with a fixed seed for repeatability.
# Stratify on the label only when that is possible: a stratified split raises
# if any class has fewer than two samples, and is pointless with one class.
class_counts = y.value_counts(dropna=False)
can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Map each supported variant name to its estimator class. An unrecognised
# name (for example a typo such as 'Gaussian') is rejected explicitly instead
# of silently falling through to MultinomialNB.
nb_estimators = {'gaussian': GaussianNB, 'multinomial': MultinomialNB}
if variant not in nb_estimators:
    raise ValueError(
        f"Unknown variant {variant!r}; expected one of {sorted(nb_estimators)}"
    )
model = nb_estimators[variant]()

# Chain preprocessing and the classifier so that the saved artefact can be
# applied directly to raw feature frames.
pipeline = Pipeline([('preprocessor', preprocessor), ('clf', model)])
pipeline.fit(X_train, y_train)

# Score the held-out split.
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test) if hasattr(pipeline, 'predict_proba') else None

# 'weighted' averaging weights each class's score by its support, so the same
# code works for binary and multi-class targets.
metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
    'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
    'f1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
    'mcc': matthews_corrcoef(y_test, y_pred)
}

# AUC is best-effort: it stays None when probabilities are unavailable or the
# score cannot be computed (e.g. a class is missing from the test split).
metrics['auc'] = None
if y_proba is not None:
    # The columns of y_proba follow pipeline.classes_, so take the class order
    # from the fitted model rather than from the raw labels.
    classes = pipeline.classes_
    try:
        if len(classes) == 2:
            # Binary case: score the probability of the second class.
            metrics['auc'] = roc_auc_score(y_test, y_proba[:, 1])
        else:
            from sklearn.preprocessing import label_binarize
            y_test_b = label_binarize(y_test, classes=classes)
            metrics['auc'] = roc_auc_score(y_test_b, y_proba, multi_class='ovr', average='macro')
    except Exception as exc:
        # Keep going without AUC, but say why instead of hiding the failure.
        print('AUC could not be computed:', exc)

# Report the evaluation results, then persist the fitted pipeline
# (preprocessing + classifier) under ./model so it can be reloaded with joblib.
print('Metrics:', metrics)

model_dir = 'model'
model_path = os.path.join(model_dir, 'naive_bayes_model.joblib')
os.makedirs(model_dir, exist_ok=True)
joblib.dump(pipeline, model_path)

print('Model saved to model/naive_bayes_model.joblib')