### Machine Learning - PPG IC-UFF 2017.1
# Naive Bayes & KNN

In [5]:
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np
import math
import itertools
from collections import Counter
from operator import itemgetter
from scipy.stats import multivariate_normal as mvn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

iris = load_iris()
X = iris.data
y = iris.target

## 1 - Naive Bayes

In [6]:
class NaiveBayes():
    def __init__(self, params=None):
        'If params is None the method is initialized with default values.'
        pass
    
    def predict(self,X):
        classes_gaussians = len(gaussians)
        examples, features_number = X.shape
        probabilities = np.zeros((examples,classes_gaussians))

        for cls,gaussian in gaussians.items():
            mean, var = gaussian['mean'], gaussian['var']
            probabilities[:,cls] = mvn.pdf(X, mean=mean, cov=var) * classes_prior_probs[cls]
        return np.argmax(probabilities, axis=1)
    
    def learn(self,X,y):
        classes = set(y)
        self.gaussians = {}
        self.classes_prior_probs = {}
        for label in classes:
            evidence = X[y == label]
            self.gaussians[label] = {
                'mean': evidence.mean(axis=0),
                'var': evidence.var(axis=0)
            }
            classes_prior_probs[label] = (len(y[y==label])/len(y))


In [7]:
nb = NaiveBayes()

skf = StratifiedKFold(n_splits=10)
acc_scores = []
recall_scores = []
f1_scores = []
precision_scores = []
for train, test in skf.split(X, y):
    nb.learn(X[train], y[train])
    y_pred = nb.predict(X[test])
    acc_scores.append(accuracy_score(y[test], y_pred))
    recall_scores.append(recall_score(y[test], y_pred, average='micro'))
    f1_scores.append(f1_score(y[test], y_pred, average='micro'))
    precision_scores.append(precision_score(y[test], y_pred, average='micro'))

print("Cross-Validation\n")
print("Mean Accuracy: %s" % (np.mean(acc_scores) * 100))
print("Standard deviation accuracy: %s" %  np.array(acc_scores).std())
print("Mean Recall: %s" % (np.mean(recall_scores) * 100))
print("Standard deviation accuracy: %s" %  np.array(recall_scores).std())
print("Mean F1-Score: %s" % (np.mean(f1_scores) * 100))
print("Standard deviation accuracy: %s" %  np.array(f1_scores).std())
print("Mean Precision: %s" % (np.mean(precision_scores) * 100))
print("Standard deviation accuracy: %s" %  np.array(precision_scores).std())


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
nb.learn(X_train,y_train)
y_pred = nb.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_pred)


plt.figure()

plt.imshow(cnf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.colorbar()
tick_marks = np.arange(len(set(y)))
plt.xticks(tick_marks, set(y), rotation=45)
plt.yticks(tick_marks, set(y))

thresh = cnf_matrix.max() / 2.
for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
    plt.text(j, i, cnf_matrix[i, j],
             horizontalalignment="center",
             color="white" if cnf_matrix[i, j] > thresh else "black")

plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

NameError: name 'classes_prior_probs' is not defined

## 2 - KNN

In [10]:
class KNN():
    def __init__(self, params=None):
        self.K = params['k']
        'If params is None the method is initialized with default values.'
        pass
    
    def predict(self,X):
        if isinstance(X[0],np.ndarray):
            resp = list()
            for x in X:
                resp.append(predict_instance(x))
            return resp
        else:
            predict_instance(X)
    
    def predict_instance(self,x):
        neighbours = getNeighbors(self.training_set,X,self.K)
        classes = [neighbour[0][1] for neighbour in neighbours]
        count = Counter(classes)
        return count.most_common()[0][0]
    
    
    def learn(self,X,y):
        self.training_set = np.array(list(zip(X,y)))
    
    def euclidean_distance(instance1, instance2, length = 2):
        distance = 0
        for x in range(length):
            distance += pow((instance1[x] - instance2[x]), length)
        return math.sqrt(distance)
    
    def _get_tuple_distance(training_instance, test_instance):
        return (training_instance, euclidean_distance(test_instance, training_instance[0],2))

    def getNeighbors(training_set, test_instance, k):
        distances = [_get_tuple_distance(training_instance, test_instance) for training_instance in training_set]
        distances = sorted(distances,key=itemgetter(1))
        return distances[:k]

In [11]:
knn = KNN(params={'k':5})
knn.learn(X,y)
knn.predict(X[0])

type(X)
isinstance(X[0],np.ndarray)

NameError: name 'predict_instance' is not defined