# IT-542 PRML Assignment - 4

In [16]:
# Import and load iris dataset
from math import sqrt, exp, pi
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Load the iris dataset once; the run cell below reads `iris.data` (feature
# rows) and `iris.target` (class labels) from this object.
iris = load_iris()
# Dump the full returned object for inspection (the output is long).
print(iris)

{'data': array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
     

In [17]:
# Evaluate Naive Bayes algorithm
def evaluate_algorithm(dataset, labels):
    """Score the hand-written Gaussian NB and sklearn's GaussianNB on one split.

    A fresh stratified split is drawn on every call, holding out 20% of the
    rows for testing. Both classifiers are trained on the same training part
    and evaluated on the same held-out part.

    Args:
        dataset: Feature rows, one data point per row.
        labels: Class label for each row of ``dataset``.

    Returns:
        Tuple ``(predicted, sk_predicted, accuracy, sk_accuracy)``: the custom
        predictions, the sklearn predictions, and the accuracy of each on the
        held-out labels.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        dataset, labels, test_size=0.2, stratify=labels
    )

    # Hand-written classifier.
    predicted = naive_bayes(x_train, y_train, x_test)

    # Reference classifier from scikit-learn.
    sk_predicted = GaussianNB().fit(x_train, y_train).predict(x_test)

    return (
        predicted,
        sk_predicted,
        accuracy_score(y_test, predicted),
        accuracy_score(y_test, sk_predicted),
    )

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset, labels):
    """Group data points by their class label.

    Args:
        dataset: Iterable of data points (e.g. a list of rows or a 2-D array).
        labels: Indexable sequence where ``labels[i]`` is the class of the
            i-th data point. Label values must be hashable.

    Returns:
        dict mapping each class value to the list of its data points, kept in
        their original order. Classes appear in order of first occurrence.

    Raises:
        IndexError: If ``labels`` (a list or array) has fewer entries than
            ``dataset``.
    """
    separated = {}
    for i, data_point in enumerate(dataset):
        # Append in place. Rebuilding the list with `get(...) + [point]`
        # copied the whole per-class list for every row (quadratic work).
        separated.setdefault(labels[i], []).append(data_point)
    return separated

# Calculate the mean, stdev and count for each column of iris dataset
def get_stats_dataset(data):
    """Compute per-feature mean, standard deviation and row count.

    Works for any number of feature columns; the count column was previously
    built from exactly four hard-coded column indices.

    Args:
        data: 2-D array-like of shape (n_rows, n_features).

    Returns:
        Array of shape (n_features, 3) where row ``i`` holds
        ``[mean, stdev, n_rows]`` for feature ``i``. The standard deviation
        is NumPy's default (population, ``ddof=0``).

    Raises:
        IndexError: If ``data`` has no rows.
    """
    data = np.asarray(data)
    n_rows, n_features = len(data), len(data[0])
    mean_features = np.mean(data, axis=0)
    stdev_features = np.std(data, axis=0)
    # Every feature column has the same number of observations.
    len_features = np.full(n_features, n_rows)
    return np.column_stack([mean_features, stdev_features, len_features])

# Split dataset by class then calculate statistics for each feature (column)
def get_stats_by_class(dataset, labels):
    """Build the per-class feature statistics table.

    The data points are grouped by label, and each group is converted to an
    array and summarised with ``get_stats_dataset``.

    Args:
        dataset: Feature rows, one data point per row.
        labels: Class label for each row of ``dataset``.

    Returns:
        dict mapping each class value to that class's statistics array.
    """
    return {
        label: get_stats_dataset(np.array(rows))
        for label, rows in separate_by_class(dataset, labels).items()
    }

# Calculate the Gaussian probability distribution function for x
def calc_gaussian_prob(x, mu, sigma):
    """Evaluate the normal density N(mu, sigma^2) at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution; a zero value is not
            handled here and leads to a division by zero.

    Returns:
        The density value ``1 / (sqrt(2*pi) * sigma) * exp(-(x-mu)^2 / (2*sigma^2))``.
    """
    squared_deviation = (x - mu) ** 2
    decay = exp(-(squared_deviation / (2 * sigma ** 2)))
    normaliser = 1 / (sqrt(2 * pi) * sigma)
    return normaliser * decay

# Calculate class probabilities for a single data point
def calc_class_probs(stats, data_point):
    """Score a data point against every class.

    For each class the score starts at the class prior (the class row count
    divided by the total row count over all classes) and is then multiplied
    by the Gaussian density of each feature value under that class's
    mean and standard deviation. The scores are not normalised to sum to 1.

    Args:
        stats: dict mapping class value to an array whose rows are
            ``(mean, stdev, count)``, one row per feature.
        data_point: Indexable feature values, in the same feature order as
            the rows of each statistics array.

    Returns:
        dict mapping each class value to its unnormalised score.
    """
    # The count is the same in every row, so row 0 carries the class size.
    sample_total = sum(per_class[0][2] for per_class in stats.values())
    scores = {}
    for label, per_class in stats.items():
        score = per_class[0][2] / float(sample_total)
        for idx, (mean, spread, _count) in enumerate(per_class):
            score *= calc_gaussian_prob(data_point[idx], mean, spread)
        scores[label] = score
    return scores

# Predict class for a single data point
def predict(stats, data_point):
    """Return the class whose score for ``data_point`` is highest.

    On ties the class encountered first wins, because a later class must
    score strictly higher to replace the current best.

    Args:
        stats: Per-class statistics as consumed by ``calc_class_probs``.
        data_point: Feature values of the point to classify.

    Returns:
        The winning class value, or ``None`` when ``stats`` has no classes.
    """
    winner, top_score = None, -1
    for label, score in calc_class_probs(stats, data_point).items():
        # Keep the current winner unless this class beats it outright.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

# Naive Bayes Algorithm
def naive_bayes(train, train_labels, test):
    """Fit Gaussian Naive Bayes on the training data and label the test rows.

    Args:
        train: Training feature rows.
        train_labels: Class label for each training row.
        test: Iterable of feature rows to classify.

    Returns:
        list with one predicted class value per row of ``test``, in order.
    """
    model = get_stats_by_class(train, train_labels)
    return [predict(model, sample) for sample in test]

In [18]:
# Run Naive Bayes algorithm
# Each iteration calls evaluate_algorithm, which draws a new train/test split,
# so `epochs` is the number of repeated evaluations.
epochs = 10
scores, sk_scores = [], []
for i in range(epochs):
    print(f"Iteration {i+1}:")
    pred, sk_pred, acc, sk_acc = evaluate_algorithm(iris.data, iris.target)
    # Round both accuracies to three decimals before storing them; the
    # averages below are therefore taken over the rounded values.
    acc, sk_acc = float('%.3f' % acc), float('%.3f' % sk_acc)
    scores.append(acc)
    sk_scores.append(sk_acc)

    print(f"Custom predictions: {pred}")
    print(f"sklearn predictions: {sk_pred}")
    print(f"Accuracy: {acc * 100.0}%, sklearn Accuracy: {sk_acc * 100.0}%")
    print()

# Mean accuracy of each implementation across all iterations.
avg_accuracy = (sum(scores) / float(len(scores)))
sk_avg_accuracy = (sum(sk_scores) / float(len(sk_scores)))
print()
print(f"Custom Gaussian NB Average Accuracy: {avg_accuracy * 100.0:.3f}%")
print(f"sklearn Gaussian NB Average Accuracy: {sk_avg_accuracy * 100.0:.3f}%")
print("----------------------------------------------------------------------")
print("\n")

Iteration 1:
Custom predictions: [1, 2, 1, 0, 2, 1, 0, 2, 2, 2, 0, 2, 0, 2, 1, 2, 1, 1, 2, 0, 1, 0, 0, 0, 0, 1, 0, 2, 2, 1]
sklearn predictions: [1 2 1 0 2 1 0 2 2 2 0 2 0 2 1 2 1 1 2 0 1 0 0 0 0 1 0 2 2 1]
Accuracy: 96.7%, sklearn Accuracy: 96.7%

Iteration 2:
Custom predictions: [1, 2, 2, 1, 1, 2, 0, 2, 0, 0, 1, 2, 2, 0, 2, 2, 1, 0, 0, 2, 1, 2, 0, 2, 1, 1, 0, 1, 0, 0]
sklearn predictions: [1 2 2 1 1 2 0 2 0 0 1 2 2 0 2 2 1 0 0 2 1 2 0 2 1 1 0 1 0 0]
Accuracy: 96.7%, sklearn Accuracy: 96.7%

Iteration 3:
Custom predictions: [1, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 1, 1, 0, 1, 0, 1, 2, 2, 1, 0, 0, 0, 1]
sklearn predictions: [1 2 0 2 2 2 2 0 0 0 2 1 0 2 2 2 1 1 1 0 1 0 1 2 2 1 0 0 0 1]
Accuracy: 96.7%, sklearn Accuracy: 96.7%

Iteration 4:
Custom predictions: [2, 2, 2, 2, 1, 2, 0, 0, 0, 1, 2, 1, 2, 2, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 0, 1, 1, 0, 1]
sklearn predictions: [2 2 2 2 1 2 0 0 0 1 2 1 2 2 2 0 1 0 1 0 1 0 1 0 2 0 1 1 0 1]
Accuracy: 100.0%, sklearn Accuracy: 100.0%

It