# Cell 1: Importing Required Libraries
# LAB 06 (Naive Bayes Classifier & Bayesian Network)
Classifier Models: Naïve Bayes

OBJECTIVE:

To be able to understand the programmatic implementation of Naive Bayes.
To be able to use Python Libraries for Classifier Models.
To be able to use Classifier Models for supervised machine learning tasks of classification.

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math


# Cell 2: Pre-processing Function

In [8]:
def pre_processing(df):
    """Split a DataFrame into a feature frame and a target column.

    The last column of ``df`` is treated as the target; every other
    column is kept as a feature.

    Returns:
        A ``(X, y)`` tuple: ``X`` is ``df`` without its last column and
        ``y`` is that last column.
    """
    target_col = df.columns[-1]
    features = df.drop(columns=[target_col])
    target = df[target_col]
    return features, target


# Cell 3: Accuracy Function

In [9]:
def accuracy_score(y_true, y_pred):
    """Return the percentage of predictions that match the true labels.

    Accuracy = correct predictions / total predictions, expressed as a
    percentage rounded to two decimal places.

    Args:
        y_true: Ground-truth labels (list, NumPy array or pandas Series).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        The accuracy as a ``float`` in the range 0-100.

    Raises:
        ValueError: If the inputs differ in shape or are empty.
    """
    # Convert to arrays so the comparison is element-wise and positional:
    # plain lists would otherwise compare to a single bool, and two pandas
    # Series would be compared by index label rather than by position.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                true_arr.shape, pred_arr.shape
            )
        )
    if true_arr.size == 0:
        raise ValueError("cannot compute accuracy for empty input")

    correct = int(np.sum(true_arr == pred_arr))
    return round(correct / true_arr.size * 100, 2)


# Cell 4: Naive Bayes Classifier

In [10]:
class NaiveBayes:
    """Naive Bayes classifier for categorical features with add-one smoothing.

    Bayes Theorem:
        P(c|x) = (P(x|c) * P(c)) / P(x)

    Attributes set by ``fit``:
        features: list of feature (column) names.
        likelihoods: ``{feature: {"<value>_<class>": P(value | class)}}``.
        class_priors: ``{class: P(class)}``.
        pred_priors: ``{feature: {value: P(value)}}``.
        X_train, y_train: the training data as passed to ``fit``.
        train_size, num_feats: number of training rows / feature columns.
    """

    def __init__(self):
        # Use real "empty" values rather than type objects as placeholders,
        # so an unfitted model can be detected reliably.
        self.features = []
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}
        self.X_train = None
        self.y_train = None
        self.train_size = 0
        self.num_feats = 0
        # {feature: {class: smoothed likelihood of a value never seen in training}}
        self._unseen_likelihoods = {}

    @staticmethod
    def _likelihood_key(feat_val, outcome):
        """Build the ``"<value>_<class>"`` key used in ``likelihoods``.

        Formatting (instead of ``+``) lets non-string values be used too.
        """
        return "{}_{}".format(feat_val, outcome)

    def fit(self, X, y):
        """Estimate priors and smoothed likelihoods from the training data.

        Args:
            X: pandas DataFrame of categorical features, one row per sample.
            y: pandas Series of class labels, positionally matching ``X``.
        """
        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        # Start from a clean slate so re-fitting on different data does not
        # keep entries that belong to a previous fit.
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}
        self._unseen_likelihoods = {}

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()

    def _calc_class_prior(self):
        """ P(c) - Prior Class Probability """
        outcomes, counts = np.unique(np.asarray(self.y_train), return_counts=True)
        self.class_priors = {
            outcome: int(count) / self.train_size
            for outcome, count in zip(outcomes, counts)
        }

    def _calc_likelihoods(self):
        """ P(x|c) - Likelihood, with Laplace (add-one) smoothing """
        # Work on plain arrays so masks are combined by position, not by
        # pandas index label.
        labels = np.asarray(self.y_train)
        outcomes = np.unique(labels)
        class_masks = [(outcome, labels == outcome) for outcome in outcomes]
        class_counts = {outcome: int(mask.sum()) for outcome, mask in class_masks}

        for feature in self.features:
            column = np.asarray(self.X_train[feature])
            feat_vals = np.unique(column)
            n_vals = len(feat_vals)

            feature_likelihoods = {}
            for feat_val in feat_vals:
                value_mask = column == feat_val
                for outcome, class_mask in class_masks:
                    # count of feature value given class
                    count_feat_val_class = int(np.sum(value_mask & class_mask))
                    feature_likelihoods[self._likelihood_key(feat_val, outcome)] = (
                        (count_feat_val_class + 1) / (class_counts[outcome] + n_vals)
                    )
            self.likelihoods[feature] = feature_likelihoods

            # What the smoothing formula yields for a zero count; used by
            # predict() for values that never occurred in training.
            self._unseen_likelihoods[feature] = {
                outcome: 1 / (class_counts[outcome] + n_vals)
                for outcome in outcomes
            }

    def _calc_predictor_prior(self):
        """ P(x) - Evidence """
        for feature in self.features:
            feat_vals, counts = np.unique(
                np.asarray(self.X_train[feature]), return_counts=True
            )
            self.pred_priors[feature] = {
                feat_val: int(count) / self.train_size
                for feat_val, count in zip(feat_vals, counts)
            }

    def predict(self, X):
        """Predict the most probable class for each row of ``X``.

        Classes are ranked by log(P(c)) + sum(log(P(x_i|c))). The evidence
        P(x) is the same for every class, so it cannot change the ranking
        and is left out; working in log space avoids floating-point
        underflow when there are many features.

        Args:
            X: 2-D array-like of queries whose columns follow the order of
                ``self.features``.

        Returns:
            NumPy array with one predicted class label per query row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.X_train is None:
            raise RuntimeError("NaiveBayes.predict() called before fit()")

        queries = np.array(X)
        outcomes = np.unique(self.y_train)
        log_priors = {
            outcome: np.log(self.class_priors[outcome]) for outcome in outcomes
        }

        results = []
        for query in queries:
            log_scores = {}
            for outcome in outcomes:
                score = log_priors[outcome]
                for feat, feat_val in zip(self.features, query):
                    prob = self.likelihoods[feat].get(
                        self._likelihood_key(feat_val, outcome)
                    )
                    if prob is None:
                        # Value never seen in training: fall back to the
                        # smoothed zero-count estimate instead of failing.
                        prob = self._unseen_likelihoods[feat][outcome]
                    score += np.log(prob)
                log_scores[outcome] = score

            # max() keeps the first class on ties, i.e. np.unique order.
            results.append(max(log_scores, key=log_scores.get))

        return np.array(results)


# Cell 5: Main Code Block – Load Dataset, Train Model, Test Queries

In [11]:
# Load the tab-delimited dataset from "weather.txt" in the working directory.
df = pd.read_csv("weather.txt", delimiter="\t")

# Preprocess: the last column becomes the target y, the remaining columns
# become the feature frame X.
X, y = pre_processing(df)

# Train model
nb_clf = NaiveBayes()
nb_clf.fit(X, y)

# Evaluate on the same rows the model was trained on, so this is training
# accuracy and not an estimate of performance on unseen data.
print("Train Accuracy: {}%".format(accuracy_score(y, nb_clf.predict(X))))

# Query 1: a single-row 2-D array; values are matched to the feature
# columns by position.
# NOTE(review): assumes the dataset has exactly four feature columns in
# this order - confirm against weather.txt.
query = np.array([['Rainy','Mild', 'Normal', 't']])
print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))

# Query 2 (same layout as Query 1; `query` is reused)
query = np.array([['Overcast','Cool', 'Normal', 't']])
print("Query 2:- {} ---> {}".format(query, nb_clf.predict(query)))

# Query 3 (same layout as Query 1; `query` is reused)
query = np.array([['Sunny','Hot', 'High', 't']])
print("Query 3:- {} ---> {}".format(query, nb_clf.predict(query)))


Train Accuracy: 92.86%
Query 1:- [['Rainy' 'Mild' 'Normal' 't']] ---> ['yes']
Query 2:- [['Overcast' 'Cool' 'Normal' 't']] ---> ['yes']
Query 3:- [['Sunny' 'Hot' 'High' 't']] ---> ['no']
