# Assignment 6 - Naive Bayes & KNN Solutions
This notebook contains solutions for all questions in Lab Assignment 6.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import warnings
# Silence only deprecation-style chatter from the libraries. A blanket
# filterwarnings('ignore') would also hide numerical RuntimeWarnings (divide by
# zero, overflow, invalid value) that signal real problems in the models below.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

print("Libraries imported successfully!")

---
# Question 1: Gaussian Naive Bayes Classifier on Iris Dataset

## Load and Prepare Data

In [None]:
# Load Iris once and keep the returned object around for its feature/class names
iris = load_iris()
X, y = iris.data, iris.target

print(f"Dataset shape: {X.shape}")
print(f"Features: {iris.feature_names}")
print(f"Classes: {iris.target_names}")

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(f"\nTraining samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

## Part (i): Step-by-Step Implementation from Scratch

In [None]:
class GaussianNaiveBayesFromScratch:
    """
    Gaussian Naive Bayes classifier implemented from scratch.

    Every feature is modelled, per class, as an independent normal
    distribution. Scoring is carried out entirely in log space so that samples
    lying far from every class mean still get distinct, finite scores instead
    of underflowing to zero and collapsing into a tie.

    Parameters
    ----------
    var_epsilon : float, default=1e-6
        Constant added to every per-class feature variance before it is used.
        It keeps the density finite when a feature is constant within a class
        (variance of exactly zero).

    Attributes
    ----------
    classes : ndarray or None
        Sorted unique class labels seen by ``fit`` (``None`` before fitting).
    mean, var, priors : dict
        Per-class feature means, feature variances and prior probabilities,
        keyed by class label.
    """
    def __init__(self, var_epsilon=1e-6):
        self.var_epsilon = var_epsilon
        self.classes = None
        self.mean = {}
        self.var = {}
        self.priors = {}

    def fit(self, X, y):
        """
        Train the classifier by calculating mean, variance, and priors for each class.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,)
            Class label of each training sample.

        Returns
        -------
        self : GaussianNaiveBayesFromScratch
            The fitted classifier, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        # Start from a clean slate so statistics of classes seen in an earlier
        # fit() call cannot linger after re-fitting on different labels.
        self.mean, self.var, self.priors = {}, {}, {}
        self.classes = np.unique(y)
        n_samples = X.shape[0]

        for c in self.classes:
            X_c = X[y == c]
            self.priors[c] = X_c.shape[0] / n_samples
            self.mean[c] = np.mean(X_c, axis=0)
            self.var[c] = np.var(X_c, axis=0)

        print("Model trained successfully!")
        print(f"Classes: {self.classes}")
        print(f"Priors: {self.priors}")
        return self

    def _calculate_log_likelihood(self, x, mean, var):
        """
        Element-wise log of the Gaussian density of ``x`` under N(mean, var).

        The same variance floor (``var + var_epsilon``) is used in both the
        normalisation term and the exponent, so the result is the log of a
        properly normalised density.
        """
        smoothed_var = np.asarray(var, dtype=float) + self.var_epsilon
        squared_dev = (np.asarray(x, dtype=float) - mean) ** 2
        return -0.5 * (np.log(2.0 * np.pi * smoothed_var) + squared_dev / smoothed_var)

    def _calculate_likelihood(self, x, mean, var):
        """Calculate Gaussian likelihood (may underflow to 0; prefer the log form for scoring)."""
        return np.exp(self._calculate_log_likelihood(x, mean, var))

    def _calculate_posterior(self, x):
        """
        Calculate the unnormalised log-posterior of every class for one sample.

        Returns a dict mapping each class label to
        ``log(prior) + sum of per-feature log-likelihoods``.
        """
        x = np.asarray(x, dtype=float)
        posteriors = {}
        for c in self.classes:
            log_likelihoods = self._calculate_log_likelihood(x, self.mean[c], self.var[c])
            posteriors[c] = np.log(self.priors[c]) + np.sum(log_likelihoods)
        return posteriors

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted class label of each sample. On an exact tie the class
            that comes first in ``self.classes`` is chosen.
        """
        X = np.asarray(X, dtype=float)
        if X.shape[0] == 0:
            return np.array([], dtype=self.classes.dtype)

        # One column of joint log-likelihoods per class, computed for all samples at once.
        scores = np.column_stack([
            np.log(self.priors[c])
            + self._calculate_log_likelihood(X, self.mean[c], self.var[c]).sum(axis=1)
            for c in self.classes
        ])
        return self.classes[np.argmax(scores, axis=1)]

In [None]:
# Fit the from-scratch classifier on the training split, then label the held-out samples
print("=" * 70, "PART (i): STEP-BY-STEP IMPLEMENTATION", "=" * 70, sep="\n")

gnb_custom = GaussianNaiveBayesFromScratch()
gnb_custom.fit(X_train, y_train)
y_pred_custom = gnb_custom.predict(X_test)

In [None]:
# Score the from-scratch predictions against the held-out labels
custom_accuracy = accuracy_score(y_test, y_pred_custom)
custom_report = classification_report(y_test, y_pred_custom, target_names=iris.target_names)
custom_confusion = confusion_matrix(y_test, y_pred_custom)

print(f"\nAccuracy: {custom_accuracy:.4f}")
print("\nClassification Report:")
print(custom_report)
print("Confusion Matrix:")
print(custom_confusion)

## Part (ii): Using Built-in Sklearn Function

In [None]:
print("=" * 70, "PART (ii): USING BUILT-IN FUNCTION", "=" * 70, sep="\n")

# Reference model: scikit-learn's GaussianNB trained on the same split
gnb_sklearn = GaussianNB()
gnb_sklearn.fit(X_train, y_train)
y_pred_sklearn = gnb_sklearn.predict(X_test)

sklearn_accuracy = accuracy_score(y_test, y_pred_sklearn)
sklearn_report = classification_report(y_test, y_pred_sklearn, target_names=iris.target_names)
sklearn_confusion = confusion_matrix(y_test, y_pred_sklearn)

print(f"\nAccuracy: {sklearn_accuracy:.4f}")
print("\nClassification Report:")
print(sklearn_report)
print("Confusion Matrix:")
print(sklearn_confusion)

In [None]:
# Side-by-side test accuracy of the from-scratch and scikit-learn classifiers
print("=" * 70, "COMPARISON", "=" * 70, sep="\n")
for implementation, predictions in (("Custom", y_pred_custom), ("Sklearn", y_pred_sklearn)):
    print(f"{implementation} Implementation Accuracy: {accuracy_score(y_test, predictions):.4f}")

---
# Question 2: GridSearchCV for K-NN Classifier Hyperparameter Tuning

In [None]:
# Section banner plus a reminder of the split reused from Question 1
print("=" * 70, "GRIDSEARCHCV FOR K-NN CLASSIFIER", "=" * 70, sep="\n")
print("Dataset: Iris")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

In [None]:
# Search space for the K-NN classifier
param_grid = dict(
    n_neighbors=list(range(1, 31)),     # K values from 1 to 30
    weights=['uniform', 'distance'],    # Weight functions
    metric=['euclidean', 'manhattan'],  # Distance metrics
)

# The grid is a full cross product, so its size is the product of the option counts
n_combinations = 1
for candidates in param_grid.values():
    n_combinations *= len(candidates)

print("\nParameter Grid:")
print("  n_neighbors: 1 to 30")
print(f"  weights: {param_grid['weights']}")
print(f"  metric: {param_grid['metric']}")
print(f"\nTotal combinations: {n_combinations}")

In [None]:
# Exhaustive search over param_grid, scored by 5-fold cross-validated accuracy
print("\nPerforming GridSearchCV with 5-fold cross-validation...")

knn = KNeighborsClassifier()
search_options = dict(cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, **search_options)

grid_search.fit(X_train, y_train)

In [None]:
# Report the winning configuration and its cross-validated score
best_params = grid_search.best_params_

print("\n" + "=" * 70, "GRIDSEARCHCV RESULTS", "=" * 70, sep="\n")
print(f"\nBest Parameters: {best_params}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")
print(f"Best K value: {best_params['n_neighbors']}")

In [None]:
# Evaluate the refit best estimator on the held-out test split
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
knn_report = classification_report(y_test, y_pred, target_names=iris.target_names)
knn_confusion = confusion_matrix(y_test, y_pred)

print(f"\nTest Set Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(knn_report)
print("Confusion Matrix:")
print(knn_confusion)

In [None]:
# Ten best-scoring parameter combinations from the cross-validation results
print("\n" + "=" * 70, "TOP 10 PARAMETER COMBINATIONS", "=" * 70, sep="\n")

results_df = pd.DataFrame(grid_search.cv_results_)
summary_columns = [
    'param_n_neighbors',
    'param_weights',
    'param_metric',
    'mean_test_score',
    'std_test_score',
]
top_10 = results_df.nlargest(10, 'mean_test_score').loc[:, summary_columns]
print(top_10.to_string(index=False))

In [None]:
# Visualizations: a 2x2 dashboard summarising the grid search
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
best_k = grid_search.best_params_['n_neighbors']

# Boolean row selectors reused by several panels
is_euclidean = results_df['param_metric'] == 'euclidean'
is_manhattan = results_df['param_metric'] == 'manhattan'
is_uniform = results_df['param_weights'] == 'uniform'
is_distance = results_df['param_weights'] == 'distance'

# Plot 1: K value vs Accuracy for different weights (euclidean)
euclidean_uniform = results_df[is_euclidean & is_uniform]
euclidean_distance = results_df[is_euclidean & is_distance]

ax_weights = axes[0, 0]
for subset, marker, label in (
    (euclidean_uniform, 'o', 'Uniform'),
    (euclidean_distance, 's', 'Distance'),
):
    ax_weights.plot(subset['param_n_neighbors'], subset['mean_test_score'],
                    marker=marker, label=label, linewidth=2)
ax_weights.axvline(best_k, color='red', linestyle='--', label=f"Best K = {best_k}")
ax_weights.set_xlabel('K (Number of Neighbors)')
ax_weights.set_ylabel('Cross-Validation Accuracy')
ax_weights.set_title('K vs Accuracy (Euclidean Distance)')
ax_weights.legend()
ax_weights.grid(alpha=0.3)

# Plot 2: K value vs Accuracy for different metrics (uniform)
uniform_euclidean = results_df[is_uniform & is_euclidean]
uniform_manhattan = results_df[is_uniform & is_manhattan]

ax_metrics = axes[0, 1]
for subset, marker, label in (
    (uniform_euclidean, 'o', 'Euclidean'),
    (uniform_manhattan, 's', 'Manhattan'),
):
    ax_metrics.plot(subset['param_n_neighbors'], subset['mean_test_score'],
                    marker=marker, label=label, linewidth=2)
ax_metrics.set_xlabel('K (Number of Neighbors)')
ax_metrics.set_ylabel('Cross-Validation Accuracy')
ax_metrics.set_title('K vs Accuracy (Uniform Weights)')
ax_metrics.legend()
ax_metrics.grid(alpha=0.3)

# Plot 3: Heatmap of mean scores
ax_heat = axes[1, 0]
pivot_table = results_df[is_euclidean].pivot_table(
    values='mean_test_score',
    index='param_weights',
    columns='param_n_neighbors'
)
sns.heatmap(pivot_table, annot=False, cmap='YlGnBu', ax=ax_heat, cbar_kws={'label': 'Accuracy'})
ax_heat.set_title('Accuracy Heatmap (Euclidean Distance)')
ax_heat.set_xlabel('K (Number of Neighbors)')
ax_heat.set_ylabel('Weight Function')

# Plot 4: Confusion Matrix
ax_cm = axes[1, 1]
cm = confusion_matrix(y_test, y_pred)
class_names = iris.target_names
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax_cm)
ax_cm.set_title(f'Confusion Matrix (Best Model: K={best_k})')
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')

plt.tight_layout()
plt.savefig('gridsearch_knn_results.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nVisualizations saved!")

---
## Summary
- **Q1**: Implemented Gaussian Naive Bayes from scratch and compared with sklearn
- **Q2**: Used GridSearchCV to find optimal K-NN hyperparameters (n_neighbors, weights, metric)