In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
"""
Load UCI ML Iris data
Return: features (shape = (150, 4)) and labels (shape = (150, 1)) as rank-2 numpy arrays
"""
def load_data(classes):
    """Read ``Iris.csv`` and return numeric feature and label arrays.

    The first CSV column is used as the row index and therefore dropped.
    Of the remaining columns, the last one is the class label and every
    column before it is a feature.

    Args:
        classes: iterable of ``(label_value, class_id)`` pairs. Every label
            cell equal to ``label_value`` is replaced by ``class_id``.

    Returns:
        Tuple ``(features, labels)`` of ``np.float32`` arrays. ``labels`` is
        reshaped to a single column, i.e. shape ``(n_rows, 1)``.

    Note:
        Labels that are not listed in ``classes`` are left untouched, so the
        final conversion to float fails for any that are not numeric.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    data = pd.read_csv('Iris.csv', index_col=0).to_numpy()
    features = data[:, :-1]
    # Copy so the relabelling below never writes through a view of ``data``.
    labels = data[:, -1].reshape(-1, 1).copy()
    for class_entry in classes:
        label_value, class_id = class_entry[0], class_entry[1]
        labels[labels == label_value] = class_id
    return features.astype(np.float32), labels.astype(np.float32)

In [3]:
"""
Perform Data Shuffling
Return: shuffled features and labels
"""
def data_shuffling(features, labels):
    """Reorder features and labels with one shared random permutation.

    A single permutation of the row indices is drawn from NumPy's global
    random state and applied to both arrays, so each feature row stays
    paired with its label.

    Args:
        features: array indexed by sample along its first axis.
        labels: array with one entry per sample along its first axis.

    Returns:
        Tuple ``(shuffled_features, shuffled_labels)``.
    """
    row_order = np.random.permutation(features.shape[0])
    return features[row_order], labels[row_order]

In [4]:
"""
Splitting the data into train and test set
Variable: ratio = fraction (between 0 and 1) of the data used for the train set
Return: train set and test set
"""
def data_split(features, labels, ratio=0.7):
    """Split samples into a leading train part and a trailing test part.

    The rows are taken in their current order: the first
    ``ceil(n_rows * ratio)`` rows form the train set and the rest form the
    test set. No shuffling happens here.

    Args:
        features: array indexed by sample along its first axis.
        labels: array with one entry per sample along its first axis.
        ratio: fraction in ``[0, 1]`` of the rows assigned to the train set.

    Returns:
        Tuple ``(train_features, train_labels, test_features, test_labels)``.

    Raises:
        ValueError: if ``ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= ratio <= 1.0:
        raise ValueError('ratio must be between 0 and 1, got {}'.format(ratio))
    # Use the caller's ratio; a hard-coded 0.7 here used to ignore the argument.
    train_end_index = int(np.ceil(features.shape[0] * ratio))
    train_features = features[:train_end_index]
    train_labels = labels[:train_end_index]
    test_features = features[train_end_index:]
    test_labels = labels[train_end_index:]
    return train_features, train_labels, test_features, test_labels

In [5]:
"""
Compute the probability density using gaussian distribution
"""
def get_prob_den(x, sample_var, mean):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: point (or array of points) at which to evaluate the density.
        sample_var: variance of the Gaussian.
        mean: mean of the Gaussian.

    Returns:
        ``exp(-(x - mean)^2 / (2 * sample_var)) / sqrt(2 * pi * sample_var)``.
    """
    normaliser = 1 / np.sqrt(2 * np.pi * sample_var)
    exponent = -np.square(x - mean) / (2 * sample_var)
    return normaliser * np.exp(exponent)

In [6]:
"""
Build a Naive Bayes classifier and perform inference
Return: predictions to test set
"""
def naive_bayes_classifier(train_features, train_labels, test_features, num_classes=3):
    """Fit a Gaussian Naive Bayes model and classify the test samples.

    For every class id ``0 .. num_classes - 1`` the class prior and the
    per-feature mean and unbiased sample variance are estimated from the
    training rows of that class. Each test row is then assigned the class
    with the largest joint log-probability
    ``log P(class) + sum_j log N(x_j | mean_cj, var_cj)``.

    Args:
        train_features: array of shape ``(n_train, n_features)``.
        train_labels: class ids, either shape ``(n_train, 1)`` or ``(n_train,)``.
        test_features: array of shape ``(n_test, n_features)``.
        num_classes: number of class ids to consider (ids ``0 .. num_classes - 1``).

    Returns:
        Integer array of shape ``(n_test, 1)`` holding the predicted class ids.

    Notes:
        * A class with fewer than two training rows has no sample variance
          and is never predicted.
        * Variances are not smoothed, so a feature that is constant within a
          class yields a zero variance and non-finite scores for that class.
    """
    train_features = np.asarray(train_features, dtype=np.float64)
    test_features = np.asarray(test_features, dtype=np.float64)
    class_ids = np.asarray(train_labels).reshape(-1)
    n_train = train_features.shape[0]
    n_test = test_features.shape[0]

    # One row of scores per class; -inf means "this class cannot be chosen".
    log_joint = np.full((num_classes, n_test), -np.inf)
    for class_id in range(num_classes):
        class_rows = train_features[class_ids == class_id]
        n_class = class_rows.shape[0]
        if n_class < 2:
            continue
        log_prior = np.log(n_class / n_train)
        mean = class_rows.mean(axis=0)
        # ddof=1 divides by (n_class - 1): the unbiased per-class sample variance.
        var = class_rows.var(axis=0, ddof=1)
        # Summing log-densities avoids the underflow a product of densities can hit.
        log_density = -0.5 * (np.log(2 * np.pi * var) + np.square(test_features - mean) / var)
        log_joint[class_id] = log_prior + log_density.sum(axis=1)

    return np.argmax(log_joint, axis=0).reshape(-1, 1)

In [7]:
"""
Settings
"""
# Each pair is (label value to look for in the CSV label column, integer class id that replaces it).
classes = [('Iris-setosa', 0), ('Iris-versicolor', 1), ('Iris-virginica', 2)]
# Number of shuffle / split / evaluate rounds whose accuracies are averaged.
cross_val_times = 1000

In [8]:
# Load the data set once; every round below reshuffles these same arrays.
features, labels = load_data(classes)
accuracy = 0.
# NOTE(review): no random seed is set in the visible cells, so the printed accuracy varies between runs.
for i in range(cross_val_times):
    # Draw a fresh random train/test partition for this round.
    features, labels = data_shuffling(features, labels)
    train_features, train_labels, test_features, test_labels = data_split(features, labels)
    predictions = naive_bayes_classifier(train_features, train_labels, test_features)
    # Adding each round's accuracy divided by the round count accumulates the mean over all rounds.
    accuracy += np.mean((predictions == test_labels).astype(np.float32)) / cross_val_times
print('Accuracy ({} times average): {}'.format(cross_val_times, accuracy))

Accuracy (1000 times average): 0.9552444471120919
