In [5]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# -------------------------------------------------------------------
# Shared dataset: loaded and split once, reused by both questions
# -------------------------------------------------------------------
# Iris as plain arrays: feature matrix X and class labels y
X, y = load_iris(return_X_y=True)

# Hold out 30% of the samples for testing; the fixed random_state
# makes the split identical on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sanity check: report how many samples/features ended up in each split
print("--- Data Loaded and Split ---")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print("-" * 30 + "\n")

--- Data Loaded and Split ---
X_train shape: (105, 4)
X_test shape: (45, 4)
------------------------------



# **QUES 1**

**(Gaussian Naïve Bayes Classifier)** 
<br>
**Implement a Gaussian Naïve Bayes classifier on the Iris dataset from `sklearn.datasets` using:**
<br>
**i. a step-by-step implementation**
<br>
**ii. the in-built function**

In [6]:
class CustomGaussianNB:
    """
    Custom implementation of Gaussian Naïve Bayes classifier.

    Every feature is modelled as an independent Gaussian per class. Prediction
    picks the class with the largest log-posterior
    ``log P(c) + sum_i log N(x_i | mean_ci, var_ci)``.

    Parameters
    ----------
    epsilon : float, default=1e-9
        Absolute constant added to every per-class feature variance so that a
        zero-variance feature cannot cause a division by zero or ``log(0)``.

    Attributes (available after ``fit``)
    ------------------------------------
    classes : ndarray of shape (n_classes,)
        Sorted unique labels seen during ``fit``.
    mean : ndarray of shape (n_classes, n_features)
        Per-class feature means.
    var : ndarray of shape (n_classes, n_features)
        Per-class (population) feature variances plus ``epsilon``.
    priors : ndarray of shape (n_classes,)
        Relative frequency of each class in the training data.
    """
    def __init__(self, epsilon=1e-9):
        # Small value added to the variances for numerical stability
        self.epsilon = epsilon

    def fit(self, X, y):
        """
        Estimate per-class means, variances and priors from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self : CustomGaussianNB
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``y`` does not hold one label per row of ``X``.
        """
        # Coerce to arrays so lists / DataFrames behave like ndarrays below
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                f"y must be 1-D with one label per sample; got X {X.shape} and y {y.shape}"
            )

        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # Shape: (n_classes, n_features)
        self.mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self.var = np.zeros((n_classes, n_features), dtype=np.float64)
        # Shape: (n_classes,)
        self.priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self.classes):
            # All samples belonging to class c
            X_c = X[y == c]

            # Feature-wise statistics; epsilon keeps the variance strictly positive
            self.mean[idx, :] = X_c.mean(axis=0)
            self.var[idx, :] = X_c.var(axis=0) + self.epsilon

            # Prior = fraction of training samples in class c
            self.priors[idx] = X_c.shape[0] / float(n_samples)

        return self

    def predict(self, X):
        """
        Predict the most probable class for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, taken from ``self.classes``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its feature count differs from the training data.
        """
        X = np.asarray(X, dtype=np.float64)
        # No samples -> no predictions (also covers an empty list)
        if X.ndim >= 1 and X.shape[0] == 0:
            return self.classes[:0]
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        if X.shape[1] != self.mean.shape[1]:
            raise ValueError(
                f"X has {X.shape[1]} features, but the model was fitted with {self.mean.shape[1]}"
            )

        scores = self._joint_log_likelihood(X)
        return self.classes[np.argmax(scores, axis=1)]

    def _joint_log_likelihood(self, X):
        """Return the (n_samples, n_classes) matrix of unnormalised log-posteriors."""
        # Broadcast (n_samples, 1, n_features) against (n_classes, n_features)
        diff = X[:, np.newaxis, :] - self.mean
        # log N(x_i | mean_i, var_i) = -0.5*log(2*pi*var_i) - 0.5*(x_i - mean_i)^2 / var_i
        log_pdf = -0.5 * np.log(2 * np.pi * self.var) - 0.5 * (diff ** 2 / self.var)
        # Naive independence assumption: sum the per-feature log-likelihoods
        return np.log(self.priors) + log_pdf.sum(axis=2)

    def _predict_sample(self, x):
        """Predict the class of a single 1-D sample ``x`` (kept for compatibility)."""
        x = np.asarray(x, dtype=np.float64)
        scores = self._joint_log_likelihood(x[np.newaxis, :])[0]
        # Return the class with the highest posterior probability
        return self.classes[np.argmax(scores)]

# --- (i) Hand-written classifier: train, predict, score on the held-out split ---
print("--- (i) Step-by-Step Implementation ---")
custom_gnb = CustomGaussianNB()
custom_gnb.fit(X_train, y_train)
y_pred_custom = custom_gnb.predict(X_test)
custom_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_custom)

print(f"Custom GNB Accuracy: {custom_accuracy:.4f}")
print("Classification Report (Custom GNB):")
print(classification_report(y_true=y_test, y_pred=y_pred_custom))


# --- (ii) scikit-learn's GaussianNB on the same split, for comparison ---

print("--- (ii) In-built scikit-learn Implementation ---")
sklearn_gnb = GaussianNB().fit(X_train, y_train)
y_pred_sklearn = sklearn_gnb.predict(X_test)
sklearn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_sklearn)

print(f"scikit-learn GNB Accuracy: {sklearn_accuracy:.4f}")
print("Classification Report (scikit-learn GNB):")
print(classification_report(y_true=y_test, y_pred=y_pred_sklearn))
print("-" * 30 + "\n")

--- (i) Step-by-Step Implementation ---
Custom GNB Accuracy: 0.9778
Classification Report (Custom GNB):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45

--- (ii) In-built scikit-learn Implementation ---
scikit-learn GNB Accuracy: 0.9778
Classification Report (scikit-learn GNB):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45

--------

# **QUES 2**

**Explore the GridSearchCV tool in scikit-learn. This tool is
often used for tuning the hyperparameters of machine learning models. Use
it to find the best value of K for a K-NN classifier on any dataset.**

In [7]:
# Step 1: the estimator to tune; only n_neighbors is varied below
knn = KNeighborsClassifier()

# Step 2: candidate values of K -- every integer from 1 through 30
param_grid = {'n_neighbors': list(range(1, 31))}

# Step 3: configure the search
#   cv=5        -> 5-fold cross-validation per candidate
#   scoring     -> candidates are ranked by accuracy
#   n_jobs=-1   -> use all available CPU cores
#   verbose=1   -> print a one-line progress summary
grid_search = GridSearchCV(
    knn,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Step 4: run the search on the training split only; the test split stays unseen
grid_search.fit(X_train, y_train)

# Step 5: report the winning K and its mean cross-validated accuracy
print("\nGridSearchCV fitting complete.")
print(f"Best K value (n_neighbors): {grid_search.best_params_['n_neighbors']}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Step 6: evaluate the best model found by the search on the held-out test split
best_knn_model = grid_search.best_estimator_
y_pred_knn = best_knn_model.predict(X_test)
knn_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)

print(f"\nAccuracy of the best K-NN model on the TEST set: {knn_test_accuracy:.4f}")
print("Classification Report (Best K-NN):")
print(classification_report(y_true=y_test, y_pred=y_pred_knn))
print("-" * 30 + "\n")

Fitting 5 folds for each of 30 candidates, totalling 150 fits

GridSearchCV fitting complete.
Best K value (n_neighbors): 1
Best cross-validation accuracy: 0.9524

Accuracy of the best K-NN model on the TEST set: 1.0000
Classification Report (Best K-NN):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

------------------------------

