In [20]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB


In [21]:
# Load the Iris dataset and assemble a single frame holding the feature
# columns plus the class label in a "target" column.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target
)



In [9]:
# Exploratory check only: build the de-duplicated frame under its own name
# instead of overwriting `df`. Reassigning `df` here would make a top-to-bottom
# run train on fewer rows than the reported results, which were produced with
# duplicates kept.
df_dedup = df.drop_duplicates()
print("Number of duplicate rows:", len(df) - len(df_dedup))

In [15]:
# Per-column quartiles and interquartile range. These are computed over every
# column of `df`, which at this point also includes "target".
Q1, Q3 = (df.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# A value counts as an outlier when it lies more than 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Flag a row when at least one of its columns falls outside the fences.
below_fence = df < lower_fence
above_fence = df > upper_fence
outliers_iqr = (below_fence | above_fence).any(axis=1)
print("Number of outliers (IQR):", outliers_iqr.sum())

Number of outliers (IQR): 4


In [16]:
# Keep the outlier-free rows in a separate frame rather than overwriting `df`.
# The reported results were produced with outliers kept, so `df` must reach the
# train/test split unchanged. The boolean row mask `outliers_iqr` from the IQR
# cell is reused instead of rebuilding the same expression.
df_no_outliers = df[~outliers_iqr]


In [22]:
# Separate the class labels from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [23]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score


# One row per candidate: (display name, estimator instance, hyperparameter grid).
# Keeping the estimator and its grid side by side guarantees both derived
# dictionaries share exactly the same keys.
_candidates = [
    (
        "GaussianNB",
        GaussianNB(),
        {"var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]},
    ),
    (
        "MultinomialNB",
        MultinomialNB(),
        {"alpha": [0.1, 0.5, 1.0, 2.0], "fit_prior": [True, False]},
    ),
    (
        "ComplementNB",
        ComplementNB(),
        {
            "alpha": [0.1, 0.5, 1.0, 2.0],
            "fit_prior": [True, False],
            "norm": [True, False],
        },
    ),
]

# Estimators keyed by name.
models = {label: estimator for label, estimator, _ in _candidates}

# Grid-search parameter grids keyed by the same names.
param_grids = {label: grid_spec for label, _, grid_spec in _candidates}


In [25]:
# Tune every candidate with 5-fold cross-validated grid search on the training
# split, then score the best estimator once on the held-out test split.
#   best_models: name -> best estimator found by the search
#   results:     name -> best params, CV score, test accuracy, per-class report
best_models = {}
results = {}

for name, model in models.items():
    print(f"\n🔍 Tuning {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)

    best_models[name] = grid.best_estimator_
    y_pred = grid.best_estimator_.predict(X_test)

    # Compute each metric once and reuse it for both storage and printing.
    test_accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0.0 for a class that is never predicted (the same
    # number the default produces) without emitting an UndefinedMetricWarning.
    report_dict = classification_report(
        y_test, y_pred, output_dict=True, zero_division=0
    )
    report_text = classification_report(y_test, y_pred, zero_division=0)

    results[name] = {
        "best_params": grid.best_params_,
        "cv_score": grid.best_score_,
        "test_accuracy": test_accuracy,
        "report": report_dict,
    }

    print(f"Best params: {grid.best_params_}")
    print(f"CV score: {grid.best_score_:.4f}")
    print(f"Test accuracy: {test_accuracy:.4f}")
    print(report_text)



🔍 Tuning GaussianNB...
Best params: {'var_smoothing': 1e-09}
CV score: 0.9583
Test accuracy: 0.9667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30


🔍 Tuning MultinomialNB...
Best params: {'alpha': 0.5, 'fit_prior': True}
CV score: 0.9500
Test accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


🔍 Tuning ComplementNB..

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Deliverables:

## 1. Data Preprocessing

The dataset was checked for common issues:

Missing values: none found.

Duplicates: 1 duplicate row was found. Removing it reduced model performance, so it was kept.

Outliers: 4 rows were flagged using the 1.5 × IQR rule (the criterion boxplot whiskers are based on). Removing them also reduced model performance, so they were kept.

Final dataset was therefore used without removing duplicates or outliers.

## 2. Methodology

Target column: target

Split into train and test sets: 80% training, 20% testing with stratification.

Compared three Naive Bayes algorithms using hyperparameter tuning with cross-validation:

- GaussianNB (for continuous features)

- MultinomialNB (for count/discrete features)

- ComplementNB (variant designed for imbalanced data)

Hyperparameters were tuned using GridSearchCV with 5-fold cross-validation. Models were evaluated on:

Cross-validation accuracy

Test accuracy

Precision, Recall, F1-score

## 3. Results
###  GaussianNB

Best params: var_smoothing=1e-09

CV score: 0.9583

Test accuracy: 0.9667

Classification report (Test set):

Class 0: Precision=1.00, Recall=1.00

Class 1: Precision=1.00, Recall=0.90

Class 2: Precision=0.91, Recall=1.00

Macro Avg F1-score: 0.97

 Performs very well, slight weakness in class 1 recall.

###  MultinomialNB

Best params: alpha=0.5, fit_prior=True

CV score: 0.9500

Test accuracy: 1.0000

Classification report (Test set):

All classes: Precision=1.00, Recall=1.00, F1=1.00

Macro Avg F1-score: 1.00

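Best test-set result — perfect classification on the 30 test samples. Note that its CV score (0.9500) is slightly below GaussianNB's (0.9583), and the test-accuracy gap between the two is a single sample (30/30 vs. 29/30), so the ranking is not conclusive.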

###  ComplementNB

Best params: alpha=0.1, fit_prior=True, norm=False

CV score: 0.6667

Test accuracy: 0.6667

Classification report (Test set):

Class 0: Precision=1.00, Recall=1.00

Class 1: Precision=0.00, Recall=0.00

Class 2: Precision=0.50, Recall=1.00

Macro Avg F1-score: 0.56

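Performs poorly — it never predicts class 1: every class 1 test sample is assigned to class 2, which is why class 1 has precision and recall of 0.00 and class 2 has a precision of only 0.50. Not suitable for this dataset.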