In [1]:
import numpy as np
import pandas as pd

# Naive Bayes Classifier 
It is a conditional probability model, with formula: <br>
$ P(C| x_1, x_2, x_3, ...) = \frac{P(C)P(X|C)}{P(X)}$ <br>
It is called *naive* because it makes the simplifying assumption that the features are conditionally independent of one another given the class C.<br>
Since the denominator $P(X)$ is the same for every class, it can be dropped, and the formula can be rewritten as: <br>
$ P(C| x_1, x_2, x_3, ...) \propto P(C)P(x_1|C)P(x_2|C)... = P(C)\prod^{n}_{i=1} P(x_i|C)$

In [2]:
class Naive_Bayes():
    """
    Naive Bayes classifier for discrete (categorical) features.

    Both the prior and the likelihoods use add-one (Laplace) smoothing, so
    every stored distribution sums to one.

    Attributes:
        model_name: human-readable name of the model.
        prior: dict mapping 'Y = <label>' to the smoothed P(Y = label).
        likelihood: dict mapping 'X<j> = <value> | Y = <label>' to the
            smoothed P(X_j = value | Y = label), for every combination of
            feature value and label that occurs together in the training data.
        class_count: dict mapping 'Y = <label>' to the number of training
            instances carrying that label.
        labels: dict mapping 'Y = <label>' back to the original label object,
            so predictions are returned in the caller's own label type.
        n_feature_values: dict mapping feature index j to the number of
            distinct values of X_j observed in the training data.
    """

    def __init__(self):
        """Create an unfitted model; call `fit` before predicting."""
        self.model_name = 'Naive Bayes'

    def fit(self, X_train, y_train):
        """
        Fit the Naive Bayes model on the training data.

        All features are assumed to be discrete.

        Args:
            X_train: matrix or 2-D array-like of training instances; each row
                is one feature vector.
            y_train: iterable with the corresponding labels. There may be
                more than two classes.

        Raises:
            ValueError: if X_train and y_train have different lengths.

        Note:
            The evidence P(X_1 = x_1, ..., X_d = x_d) is never computed: it is
            the same for every class, so it cannot change which class has the
            highest posterior.
        """
        X_train = np.asarray(X_train)
        y_train = list(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                f'X_train has {len(X_train)} rows but y_train has {len(y_train)} labels'
            )

        # ---- Prior P(Y) ---------------------------------------------------
        self.class_count = dict()
        self.labels = dict()
        for y in y_train:
            class_key = f'Y = {y}'
            self.class_count[class_key] = self.class_count.get(class_key, 0) + 1
            self.labels.setdefault(class_key, y)

        n_samples = len(y_train)
        n_classes = len(self.class_count)
        # Add-one smoothing: (count + 1) / (N + K) sums to one over the K classes.
        self.prior = {
            class_key: (count + 1) / (n_samples + n_classes)
            for class_key, count in self.class_count.items()
        }

        # ---- Likelihood P(X_j | Y) ------------------------------------------
        pair_count = dict()    # (j, value, class_key) -> raw co-occurrence count
        seen_values = dict()   # j -> set of distinct (formatted) values of X_j
        for x, y in zip(X_train, y_train):
            class_key = f'Y = {y}'
            for j in range(len(x)):
                value = f'{x[j]}'
                seen_values.setdefault(j, set()).add(value)
                triple = (j, value, class_key)
                pair_count[triple] = pair_count.get(triple, 0) + 1

        self.n_feature_values = {j: len(values) for j, values in seen_values.items()}

        # Add-one smoothing per feature: the denominator adds the number of
        # distinct values of X_j (not the number of classes), which is what
        # makes P(X_j | Y) a proper distribution over the values of X_j.
        self.likelihood = dict()
        for (j, value, class_key), count in pair_count.items():
            denominator = self.class_count[class_key] + self.n_feature_values[j]
            self.likelihood[f'X{j} = {value} | {class_key}'] = (count + 1) / denominator

    def ind_predict(self, x: list):
        """
        Predict the most likely class label of one instance.

        Scores are accumulated as log-probabilities (log prior plus the sum
        of log likelihoods) so that products of many small probabilities do
        not underflow for high-dimensional feature vectors.

        A feature value that was not observed together with a class receives
        the smoothing floor 1 / (class_count + number of values of X_j); the
        same floor is applied to values never seen during training at all.

        Args:
            x: feature vector of a single instance.

        Returns:
            The original label object of the highest-scoring class (ties go
            to the class encountered first during training), or None if the
            model was fitted without any classes.
        """
        ret, max_prob = None, float('-inf')
        for y in self.prior:
            prob = np.log(self.prior[y])
            for j in range(len(x)):
                key = f'X{j} = {x[j]} | {y}'
                if key in self.likelihood:
                    prob += np.log(self.likelihood[key])
                else:
                    # Zero co-occurrences: smoothed numerator is 0 + 1.
                    denominator = self.class_count[y] + self.n_feature_values.get(j, 0)
                    prob += np.log(1 / denominator)

            if prob > max_prob:
                max_prob = prob
                # Return the real label rather than slicing it out of the key
                # string, which only worked for single-character labels.
                ret = self.labels[y]
        return ret

    def predict(self, X):
        """
        Predict the class label of every instance in X.

        Args:
            X: matrix or 2-D array-like of instances; each row is one
                feature vector.

        Returns:
            A list with one predicted label per row of X.
        """
        X = np.asarray(X)
        return [self.ind_predict(x) for x in X]

In [3]:
# Balance-scale data set from the UCI repository. Column names are supplied
# explicitly via `names`, so every line of the file is read as a data row.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
col = [
    'class_name',
    'left_weight',
    'left_distance',
    'right_weight',
    'right_distance',
]
data = pd.read_csv(url, sep=',', names=col)

In [4]:
# Preview the loaded frame (the notebook display truncates long frames to the first and last rows).
data

Unnamed: 0,class_name,left_weight,left_distance,right_weight,right_distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [5]:
# Number of instances per class label.
data['class_name'].value_counts()

class_name
R    288
L    288
B     49
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column after `class_name`, as a plain 2-D ndarray
# (np.matrix is no longer recommended by NumPy).
X = data.iloc[:, 1:].to_numpy()
# Labels: the `class_name` column.
y = data.class_name

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=88)

In [7]:
# Labels as a NumPy array so they compare elementwise with the predictions.
y_test = np.array(y_test)

# Train the classifier on the training split, then label every held-out instance.
clf = Naive_Bayes()
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)

In [8]:
# Overall accuracy: fraction of test instances whose prediction matches the
# true label. The mean is taken over the actual test set, so the cell stays
# correct if the split size changes.

np.mean(y_hat == y_test)  # you should get something like 0.88

np.float64(0.8840579710144928)