Assignment 6

Question 1: Gaussian Naive Bayes on the Iris dataset (from scratch and with scikit-learn)

In [1]:
# Cell 1: Libraries
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


In [2]:
# Cell 2: Load Iris dataset
iris = load_iris()
X, y = iris.data, iris.target  # X: 4 features per sample; y: 3 classes encoded as 0, 1, 2
feature_names, target_names = iris.feature_names, iris.target_names

# Train/Test split: 20% held out for testing, stratified on the labels so class
# proportions are preserved in both parts; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Cell 3: GNB from scratch
class MyGaussianNB:
    """
    Minimal Gaussian Naive Bayes classifier.

    Model:
    - Priors: P(y=c) = count(c) / N
    - Likelihood per feature (Gaussian):
      p(x_f | y=c) = N(x_f; mean_{c,f}, var_{c,f})
    - Features are treated as independent given the class, so the per-feature
      log-likelihoods are summed.
    - Everything is computed in log-space to avoid underflow.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Constant added to every per-class, per-feature variance so that a
        zero-variance feature cannot cause a division by zero.
        NOTE: the value is added as-is. scikit-learn's ``GaussianNB`` instead
        multiplies its ``var_smoothing`` by the largest feature variance, so the
        same number means a slightly different amount of smoothing there.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray
        Sorted unique class labels seen during ``fit``.
    priors_log_ : dict
        Class label -> log prior probability.
    mean_ : dict
        Class label -> ndarray of per-feature means.
    var_ : dict
        Class label -> ndarray of per-feature (population, ddof=0) variances
        plus ``var_smoothing``.
    n_features_in_ : int
        Number of features seen during ``fit``.
    """
    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing  # small value to stabilize variance

    @staticmethod
    def _as_2d_float(X):
        """Return X as a 2-D float ndarray; raise ValueError for any other rank."""
        # np.asarray makes lists and pandas objects behave like plain arrays, so
        # later .mean/.var calls always use numpy semantics (ddof=0).
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D with shape (n_samples, n_features); "
                "got shape {}".format(X.shape)
            )
        return X

    def fit(self, X, y):
        """
        Estimate log priors and per-class feature means/variances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X is not 2-D, is empty, or X and y have different lengths.
        """
        X = self._as_2d_float(X)
        y = np.asarray(y).ravel()
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                "X and y have inconsistent lengths: {} vs {}".format(n_samples, y.shape[0])
            )

        self.classes_ = np.unique(y)
        self.n_features_in_ = X.shape[1]

        self.priors_log_ = {}
        self.mean_ = {}
        self.var_ = {}

        for c in self.classes_:
            Xc = X[y == c]  # rows belonging to class c (never empty: c comes from y)
            self.priors_log_[c] = np.log(Xc.shape[0] / n_samples)
            # per-feature mean and smoothed population variance
            self.mean_[c] = Xc.mean(axis=0)
            self.var_[c] = Xc.var(axis=0) + self.var_smoothing
        return self

    def _log_gaussian(self, x, mu, var):
        """Elementwise log of the Gaussian PDF, including the 2*pi*var normalizer."""
        return -0.5 * (np.log(2*np.pi*var) + ((x - mu)**2) / var)

    def _joint_log_likelihood(self, X):
        """
        Return log P(y=c) + sum_f log p(x_f | y=c) for every sample and class.

        The result has shape [n_samples, n_classes], with columns ordered like
        ``self.classes_``.

        Raises
        ------
        ValueError
            If X is not 2-D or its feature count differs from the training data.
        """
        X = self._as_2d_float(X)
        if X.shape[1] != self.n_features_in_:
            # Without this check a single-column X would silently broadcast
            # against the stored means and produce meaningless scores.
            raise ValueError(
                "X has {} features, but the model was fitted with {}".format(
                    X.shape[1], self.n_features_in_
                )
            )
        jll = [
            # sum over features (naive independence)
            self.priors_log_[c]
            + self._log_gaussian(X, self.mean_[c], self.var_[c]).sum(axis=1)
            for c in self.classes_
        ]
        return np.vstack(jll).T  # shape: [n_samples, n_classes]

    def predict(self, X):
        """Return, for each row of X, the class label with the highest joint log-likelihood."""
        jll = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(jll, axis=1)]

    def predict_proba(self, X):
        """
        Return class probabilities of shape [n_samples, n_classes].

        The joint log-likelihoods are normalized with a softmax; the row maximum
        is subtracted first so the exponentials cannot overflow.
        """
        jll = self._joint_log_likelihood(X)
        shifted = jll - jll.max(axis=1, keepdims=True)
        P = np.exp(shifted)
        return P / P.sum(axis=1, keepdims=True)

# Train & Evaluate our scratch model:
# fit on the training split, then predict on the held-out test split.
scratch_gnb = MyGaussianNB(var_smoothing=1e-9)
scratch_gnb.fit(X_train, y_train)
y_pred_scratch = scratch_gnb.predict(X_test)

print("Accuracy (Scratch GNB):", accuracy_score(y_test, y_pred_scratch))
print("\nConfusion Matrix (Scratch GNB):\n", confusion_matrix(y_test, y_pred_scratch))
print(
    "\nClassification Report (Scratch GNB):\n",
    classification_report(y_test, y_pred_scratch, target_names=target_names),
)


Accuracy (Scratch GNB): 0.9666666666666667

Confusion Matrix (Scratch GNB):
 [[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]

Classification Report (Scratch GNB):
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [3]:
# Cell 4: Using sklearn GaussianNB
# Same train/test split and same var_smoothing value as the scratch model above,
# so the two sets of metrics can be compared side by side.
sk_gnb = GaussianNB(var_smoothing=1e-9).fit(X_train, y_train)
y_pred_sk = sk_gnb.predict(X_test)

print("Accuracy (sklearn GaussianNB):", accuracy_score(y_test, y_pred_sk))
print("\nConfusion Matrix (sklearn GNB):\n", confusion_matrix(y_test, y_pred_sk))
print(
    "\nClassification Report (sklearn GNB):\n",
    classification_report(y_test, y_pred_sk, target_names=target_names),
)


Accuracy (sklearn GaussianNB): 0.9666666666666667

Confusion Matrix (sklearn GNB):
 [[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]

Classification Report (sklearn GNB):
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



Question 2: KNN hyperparameter tuning with GridSearchCV

In [4]:
# Cell 5: Pipeline + GridSearchCV for KNN
# KNN needs scaled features, so the pipeline is StandardScaler -> KNN
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Search space; the 'knn__' prefix routes each setting to the 'knn' pipeline step.
param_grid = dict(
    knn__n_neighbors=list(range(1, 31)),
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],  # 1=Manhattan, 2=Euclidean
)

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation.
# The search only ever sees the training split.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    return_train_score=False,
)
grid.fit(X_train, y_train)

print("Best CV Accuracy:", grid.best_score_)
print("Best Params:", grid.best_params_)

# Final evaluation on the held-out test set:
best_model = grid.best_estimator_
y_pred_knn = best_model.predict(X_test)
print("\nTest Accuracy (Best KNN):", accuracy_score(y_test, y_pred_knn))
print("\nConfusion Matrix (Best KNN):\n", confusion_matrix(y_test, y_pred_knn))
print(
    "\nClassification Report (Best KNN):\n",
    classification_report(y_test, y_pred_knn, target_names=target_names),
)


Best CV Accuracy: 0.975
Best Params: {'knn__n_neighbors': 17, 'knn__p': 2, 'knn__weights': 'distance'}

Test Accuracy (Best KNN): 0.9666666666666667

Confusion Matrix (Best KNN):
 [[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]

Classification Report (Best KNN):
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

