In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import os

# Seed shared by every randomised step, plus the names of the output artefacts.
RANDOM_STATE = 42
PLOT_FILE = "knn_cv_plot.png"
CSV_FILE = "classification_results_summary.csv"


# Iris data set: features, labels, and the class names used in the reports.
iris = load_iris()
X = iris.data
y = iris.target
class_names = iris.target_names

# Hold out 25% of the samples for testing; stratify so the class balance of
# the full data set is kept in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)


class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier implemented with NumPy.

    Every feature is modelled as an independent normal distribution per
    class.  After ``fit`` the instance exposes:

    * ``classes`` -- sorted unique labels found in ``y``
    * ``mean`` / ``var`` -- per-class feature means and maximum-likelihood
      variances, each of shape ``(n_classes, n_features)``
    * ``prior`` -- empirical class frequencies, shape ``(n_classes,)``
    """

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like of shape ``(n_samples,)`` holding class labels.

        Returns:
            ``self``, so calls can be chained.
        """
        # Coerce so that plain Python lists are accepted as well as ndarrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        n_classes = len(self.classes)
        n_features = X.shape[1]
        self.mean = np.zeros((n_classes, n_features))
        self.var = np.zeros((n_classes, n_features))
        self.prior = np.zeros(n_classes)
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.mean[idx, :] = X_c.mean(axis=0)
            # ML estimate of variance (ddof=0)
            self.var[idx, :] = X_c.var(axis=0)
            self.prior[idx] = X_c.shape[0] / X.shape[0]
        return self

    def _gaussian_log_likelihood(self, class_idx, x):
        """Return the log-likelihood of ``x`` under class ``class_idx``.

        Args:
            class_idx: Row index into ``self.mean`` / ``self.var``.
            x: A single sample of shape ``(n_features,)`` or a batch of
                shape ``(n_samples, n_features)``.

        Returns:
            A scalar for a single sample, or an array of shape
            ``(n_samples,)`` for a batch.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        # avoid division by zero for features that are constant within a class
        var = np.where(var == 0, 1e-9, var)
        # log of gaussian pdf per feature
        log_likelihood = -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)
        # sum over features (the last axis), keeping any leading sample axis
        return log_likelihood.sum(axis=-1)

    def predict(self, X):
        """Predict the most probable class for every row of ``X``.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.

        Returns:
            ndarray of shape ``(n_samples,)`` with entries taken from
            ``self.classes``.
        """
        X = np.asarray(X, dtype=float)
        # The log priors do not depend on the sample, so compute them once.
        log_prior = np.log(self.prior)
        # One column of unnormalised log-posteriors per class.
        log_posteriors = np.column_stack([
            log_prior[idx] + self._gaussian_log_likelihood(idx, X)
            for idx in range(len(self.classes))
        ])
        # argmax returns the first maximum, so ties go to the lowest class index.
        return self.classes[np.argmax(log_posteriors, axis=1)]


def _score_on_test(model):
    """Return (predictions, accuracy, confusion matrix, text report) on the test split."""
    predictions = model.predict(X_test)
    return (
        predictions,
        accuracy_score(y_test, predictions),
        confusion_matrix(y_test, predictions),
        classification_report(y_test, predictions, target_names=class_names, zero_division=0),
    )


# Hand-written Gaussian Naive Bayes.
gnb_scratch = GaussianNaiveBayes()
gnb_scratch.fit(X_train, y_train)
y_pred_scratch, acc_scratch, cm_scratch, report_scratch = _score_on_test(gnb_scratch)


# scikit-learn's GaussianNB, scored the same way for comparison.
gnb_sklearn = GaussianNB()
gnb_sklearn.fit(X_train, y_train)
y_pred_sklearn, acc_sklearn, cm_sklearn, report_sklearn = _score_on_test(gnb_sklearn)


# Search K = 1..20 with 5-fold cross-validation on the training split.
param_grid = {'n_neighbors': list(range(1, 21))}
knn = KNeighborsClassifier()
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)
grid.fit(X_train, y_train)

best_k = int(grid.best_params_['n_neighbors'])
best_score = float(grid.best_score_)

# Keep only the columns of interest, in this order, under friendlier names.
_cv_column_labels = {
    'param_n_neighbors': 'n_neighbors',
    'mean_test_score': 'mean_cv_test_score',
    'std_test_score': 'std_cv_test_score',
    'mean_train_score': 'mean_cv_train_score',
}
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results[list(_cv_column_labels)]
cv_results = cv_results.rename(columns=_cv_column_labels)
cv_results = cv_results.sort_values('n_neighbors').reset_index(drop=True)
# The parameter column may hold generic objects; make it plain integers.
cv_results['n_neighbors'] = cv_results['n_neighbors'].apply(int)

# Fit a fresh KNN with the selected K and score it on the held-out test split.
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train, y_train)
y_pred_knn = knn_best.predict(X_test)
acc_knn_test = accuracy_score(y_test, y_pred_knn)
cm_knn = confusion_matrix(y_test, y_pred_knn)
report_knn = classification_report(
    y_test,
    y_pred_knn,
    target_names=class_names,
    zero_division=0,
)


# Mean cross-validated accuracy for every K, with one standard deviation as
# the error bar, written to PLOT_FILE.
fig, ax = plt.subplots(figsize=(8, 5))
k_values = cv_results['n_neighbors'].tolist()
ax.errorbar(
    k_values,
    cv_results['mean_cv_test_score'].tolist(),
    yerr=cv_results['std_cv_test_score'].tolist(),
    marker='o',
    linestyle='-',
)
ax.set_title('K-NN: CV mean test accuracy vs K')
ax.set_xlabel('K (n_neighbors)')
ax.set_ylabel('Mean CV accuracy')
ax.set_xticks(k_values)
ax.grid(True)
fig.tight_layout()
fig.savefig(PLOT_FILE)
plt.close(fig)


# One row per model with its test-set accuracy, saved to CSV_FILE.
results_df = pd.DataFrame({
    'model': ['GNB (scratch)', 'GNB (sklearn)', f'KNN (k={best_k})'],
    'test_accuracy': [float(acc_scratch), float(acc_sklearn), float(acc_knn_test)],
})
results_df.to_csv(CSV_FILE, index=False)


def _print_gnb_section(heading, accuracy, matrix, report):
    """Print one Naive Bayes result section: heading, accuracy, matrix, report."""
    print(heading)
    print(f"Test accuracy: {accuracy:.4f}")
    print("Confusion matrix (true x pred):")
    print(matrix)
    print("Classification report:")
    print(report)


_print_gnb_section(
    "===== Gaussian Naive Bayes (From scratch) =====",
    acc_scratch,
    cm_scratch,
    report_scratch,
)

_print_gnb_section(
    "===== Gaussian Naive Bayes (sklearn) =====",
    acc_sklearn,
    cm_sklearn,
    report_sklearn,
)

# K-NN summary: one item per output line.
print("===== K-NN (GridSearchCV) =====")
print(
    f"Best K (n_neighbors): {best_k}",
    f"Best mean CV accuracy (on training folds): {best_score:.4f}",
    f"Test accuracy with best K: {acc_knn_test:.4f}",
    "Confusion matrix (true x pred) for final KNN:",
    cm_knn,
    "Classification report for final KNN:",
    report_knn,
    sep="\n",
)

# Full cross-validation table, one space-separated row per K.
print("\nCV results (n_neighbors, mean_cv_test_score, std_cv_test_score, mean_cv_train_score):")
for record in cv_results.itertuples(index=False):
    print(
        int(record.n_neighbors),
        f"{record.mean_cv_test_score:.4f}",
        f"{record.std_cv_test_score:.4f}",
        f"{record.mean_cv_train_score:.4f}",
    )

print(f"\nSaved files:\n - {os.path.abspath(PLOT_FILE)}\n - {os.path.abspath(CSV_FILE)}")


===== Gaussian Naive Bayes (From scratch) =====
Test accuracy: 0.9211
Confusion matrix (true x pred):
[[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]
Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38

===== Gaussian Naive Bayes (sklearn) =====
Test accuracy: 0.9211
Confusion matrix (true x pred):
[[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]
Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92     