Q1) (Gaussian Naïve Bayes Classifier) Implement Gaussian Naïve Bayes
Classifier on the Iris dataset from sklearn.datasets using
(i) Step-by-step implementation
(ii) In-built function

In [1]:

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the Iris measurements (X) and their integer species labels (y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing. The split is stratified on y and
# uses a fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# --- Fit: estimate the per-class parameters of the Gaussian model ---
classes, class_counts = np.unique(y_train, return_counts=True)
n_classes = classes.shape[0]
n_features = X_train.shape[1]

# Prior of each class = its share of the training samples.
priors = class_counts / X_train.shape[0]

# Per-class, per-feature mean and variance; each array has one row per class
# and one column per feature.
per_class_rows = [X_train[y_train == label] for label in classes]
means = np.array([rows.mean(axis=0) for rows in per_class_rows])
# The small additive constant keeps every variance strictly positive so the
# log-density never divides by zero.
vars_ = np.array([rows.var(axis=0) + 1e-9 for rows in per_class_rows])

def log_gaussian_pdf(x, mean, var):
    """Return the element-wise log-density of a univariate Gaussian.

    Evaluates ``-0.5 * (log(2*pi*var) + (x - mean)**2 / var)``. The three
    arguments are combined with ordinary NumPy broadcasting, so scalars and
    arrays can be mixed.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which to evaluate the density.
    mean : float or array-like
        Mean of the Gaussian.
    var : float or array-like
        Variance of the Gaussian (expected to be positive).

    Returns
    -------
    float or numpy.ndarray
        Log-density with the broadcast shape of the inputs.
    """
    log_normaliser = np.log(2.0 * np.pi * var)
    scaled_sq_dev = ((x - mean) ** 2) / var
    return -0.5 * (log_normaliser + scaled_sq_dev)

def predict(X):
    """Predict a class label for every row of ``X``.

    For each class, the score of a sample is the log-prior of the class plus
    the sum over features of the per-feature Gaussian log-densities (the
    naive independence assumption). The label with the highest score wins.

    Reads the module-level ``priors``, ``means``, ``vars_`` and ``classes``
    estimated from the training data.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array with one sample per row and one feature per column.

    Returns
    -------
    numpy.ndarray
        Predicted labels, one per row of ``X``, taken from ``classes``.
    """
    per_class_scores = []
    for class_prior, class_mean, class_var in zip(priors, means, vars_):
        feature_log_densities = log_gaussian_pdf(X, class_mean, class_var)
        # Summing log-densities across features == log of their product.
        per_class_scores.append(
            np.log(class_prior) + np.sum(feature_log_densities, axis=1)
        )

    # One column per class, one row per sample.
    log_posteriors = np.column_stack(per_class_scores)
    best_class_idx = np.argmax(log_posteriors, axis=1)
    return classes[best_class_idx]

# Score the hand-written classifier on the held-out samples.
y_pred = predict(X_test)

scratch_accuracy = accuracy_score(y_test, y_pred)
scratch_confusion = confusion_matrix(y_test, y_pred)
scratch_report = classification_report(
    y_test, y_pred, target_names=iris.target_names
)

print("From-scratch Gaussian NB results")
print("Accuracy:", scratch_accuracy)
print("\nConfusion matrix:\n", scratch_confusion)
print("\nClassification report:\n", scratch_report)


From-scratch Gaussian NB results
Accuracy: 0.9111111111111111

Confusion matrix:
 [[15  0  0]
 [ 0 14  1]
 [ 0  3 12]]

Classification report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.82      0.93      0.88        15
   virginica       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45



In [4]:

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Reload the data and rebuild the stratified 70/30 split with a fixed seed.
iris = datasets.load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# fit() returns the estimator itself, so construction and training chain.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

sk_accuracy = accuracy_score(y_test, y_pred)
sk_confusion = confusion_matrix(y_test, y_pred)
sk_report = classification_report(
    y_test, y_pred, target_names=iris.target_names
)

print("Sklearn GaussianNB Results")
print("Accuracy:", sk_accuracy)
print("Confusion Matrix:\n", sk_confusion)
print("Classification Report:\n", sk_report)

# Show the parameters the library estimated during fit().
for heading, fitted_value in (
    ("\nClass Priors:", gnb.class_prior_),
    ("Class-wise Means:\n", gnb.theta_),
    ("Class-wise Variances:\n", gnb.var_),
):
    print(heading, fitted_value)


Sklearn GaussianNB Results
Accuracy: 0.9111111111111111
Confusion Matrix:
 [[15  0  0]
 [ 0 14  1]
 [ 0  3 12]]
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.82      0.93      0.88        15
   virginica       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45


Class Priors: [0.33333333 0.33333333 0.33333333]
Class-wise Means:
 [[4.98857143 3.42571429 1.48571429 0.24      ]
 [5.94857143 2.73142857 4.23714286 1.30857143]
 [6.68285714 3.00857143 5.63142857 2.06857143]]
Class-wise Variances:
 [[0.10329796 0.17391021 0.02293878 0.00925715]
 [0.24078368 0.08558368 0.21147755 0.03564082]
 [0.42484898 0.11735511 0.32272653 0.06386939]]


Q2) Explore the GridSearchCV tool in scikit-learn. This is a tool that is
often used for tuning hyperparameters of machine learning models. Use
this tool to find the best value of K for a K-NN classifier using any dataset.

In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load Iris and carve out a stratified 30% test set with a fixed seed.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Candidate neighbourhood sizes: K = 1 .. 30.
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": list(range(1, 31))}

# 5-fold cross-validated search over K on the training data only, scored by
# accuracy. fit() returns the search object, so it can be chained.
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

print("Best K value:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

# Evaluate the best estimator found by the search on the held-out test set.
best_knn = grid.best_estimator_
y_pred = best_knn.predict(X_test)

knn_test_accuracy = accuracy_score(y_test, y_pred)
knn_confusion = confusion_matrix(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)

print("\nTest Accuracy:", knn_test_accuracy)
print("\nConfusion Matrix:\n", knn_confusion)
print("\nClassification Report:\n", knn_report)


Best K value: {'n_neighbors': 9}
Best CV accuracy: 0.980952380952381

Test Accuracy: 0.9555555555555556

Confusion Matrix:
 [[15  0  0]
 [ 0 15  0]
 [ 0  2 13]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.88      1.00      0.94        15
           2       1.00      0.87      0.93        15

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

