# QUESTION 2 K201716 KNN

In [36]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets

In [48]:
# Load the iris data set and split it into train/test parts (20% held out for testing).
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1074,  # fixed seed so the split is the same on every run
)


In [38]:
def euclidean(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The operands must support element-wise ``-`` and ``**`` (for example
    NumPy arrays of the same shape). The result is the square root of the
    sum of the squared element-wise differences.
    """
    diff = x1 - x2
    squared_total = np.sum(diff ** 2)
    return np.sqrt(squared_total)

In [57]:
from collections import Counter
class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int, default=3
        Number of closest training samples that vote on each prediction.
        If ``k`` exceeds the number of training samples, all of them vote.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer.
    """

    def __init__(self, k=3):
        # An empty neighbourhood has no majority label; reject it here instead
        # of failing later with an IndexError in the middle of prediction.
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = int(k)

    def fit(self, X, y):
        """Store the training samples and their labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples. A 1-D input is treated as one feature per sample.
        y : array-like of shape (n_samples,)
            Label of each training sample.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        # Coerce to ndarrays so lists and pandas objects are indexed by
        # position (a Series would otherwise be indexed by label).
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every sample (row) in ``X``.

        Returns
        -------
        list
            One predicted label per sample, in input order.
        """
        X = np.asarray(X, dtype=float)
        predictions = [self.helper_predict(x) for x in X]
        return predictions

    def helper_predict(self, x):
        """Return the majority label among the ``k`` training samples nearest to ``x``.

        Ties in the vote go to the label that appears first among the
        neighbours ordered by distance, i.e. the label of the closest one.
        """
        # Euclidean distance from x to every training sample in one vectorised step.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))

        # Stable sort: equidistant samples keep their training order, so the
        # chosen neighbours are deterministic.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_nearest_labels = self.y_train[k_indices]

        # Counter keeps first-seen order among equal counts, so labels listed
        # nearest-first give the tie-break described above.
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [58]:
# Train the hand-written KNN (9 neighbours) on the training split.
clf = KNN(k=9)
clf.fit(X_train, y_train)

# Predict the held-out samples and show the raw predictions.
predictions = clf.predict(X_test)
print("list of predictions:\n", predictions)

# Accuracy: share of test samples whose predicted label matches the true one.
accuracy = np.mean(predictions == y_test)
print("\nAccuracy of our model: ", accuracy)

# Confusion matrix of true labels against predicted labels.
matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
print("\nConfusion Matrix: \n", matrix)

list of predictions:
 [2, 1, 1, 0, 1, 1, 2, 2, 0, 0, 1, 1, 0, 1, 0, 2, 1, 0, 0, 0, 0, 1, 1, 1, 2, 0, 2, 0, 2, 2]

Accuracy of our model:  0.9666666666666667

Confusion Matrix: 
 [[11  0  0]
 [ 0 10  0]
 [ 0  1  8]]


In [59]:
import numpy as np

class NaiveBayes:
    """Gaussian naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the largest
    log-prior plus summed log-likelihood.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest overall feature variance that is added to
        every per-class variance. This keeps the variance strictly positive,
        so a feature that is constant within a class does not cause a
        division by zero.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,)
            Class label of each training sample.
        """
        # Coerce inputs so lists / pandas objects work with .shape and masks.
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        # Variance floor, scaled by the data's spread. Fall back to a unit
        # scale when the spread is zero (or undefined) so the floor is never 0.
        spread = X.var(axis=0).max()
        epsilon = self.var_smoothing * (spread if spread > 0 else 1.0)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0) + epsilon
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        """Return the predicted class for every sample (row) in ``X`` as an ndarray."""
        X = np.asarray(X, dtype=np.float64)
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        """Return the class with the highest log-posterior for one sample ``x``."""
        posteriors = []

        # log-posterior (up to a constant) = log prior + sum of per-feature log-likelihoods
        for idx, _ in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(self._log_pdf(idx, x)) + prior
            posteriors.append(posterior)

        # return class with the highest posterior
        return self._classes[np.argmax(posteriors)]

    def _log_pdf(self, class_idx, x):
        """Per-feature log of the Gaussian density of ``x`` under class ``class_idx``.

        Computed directly in log space: taking ``log`` of the density itself
        underflows to ``-inf`` for samples far from the class mean.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def _pdf(self, class_idx, x):
        """Per-feature Gaussian density of ``x`` under class ``class_idx``."""
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator


# Testing
if __name__ == "__main__":
    # Imports
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        """Return the number of matching positions divided by ``len(y_true)``."""
        matches = y_true == y_pred
        return np.sum(matches) / len(y_true)

    # Synthetic two-class problem: 1000 samples with 10 features, fixed seed.
    X, y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=123)

    # Hold out 20% of the samples for evaluation, fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # Fit the classifier defined above and score it on the held-out split.
    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)

    print("Naive Bayes classification accuracy", accuracy(y_test, predictions))

Naive Bayes classification accuracy 0.965
