In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
class Naive_Bayes():
    """
    Naive Bayes classifier for discrete (categorical) features.

    Attributes (all populated by ``fit``):
        model_name: display name of the model.
        classes: sorted array of the distinct training labels.
        num_classes: number of distinct labels.
        num_features: number of feature columns.
        total: number of training instances.
        prior: dict mapping 'Y = <label>' to P(Y = label).
        likelihood: dict mapping 'X<j> = <value> | Y = <label>' to the
            relative frequency P(X_j = value | Y = label) seen in training.
    """

    def __init__(self):
        """Only the model name is set here; parameters are learned in ``fit``."""
        self.model_name = 'Naive Bayes'

    def fit(self, X_train, y_train):
        """
        Estimate the class prior and the per-feature likelihoods.

        All features are treated as **discrete**: every distinct value of a
        column is its own category.

        Args:
            X_train: 2-D collection of training instances (np.matrix,
                2-D ndarray, DataFrame or list of rows), one row per instance.
            y_train: labels aligned with the rows of X_train. More than two
                classes are supported.

        Returns:
            The prior distribution, a dict 'Y = <label>' -> probability.

        Raises:
            ValueError: if X_train is not 2-D, if the number of rows and
                labels differ, or if the training set is empty.
        """
        # np.asarray turns an np.matrix into a plain ndarray, so each row is
        # a 1-D vector regardless of which input type the caller used.
        features = np.asarray(X_train)
        labels = np.asarray(y_train).ravel()
        if features.ndim != 2:
            raise ValueError('X_train must be 2-dimensional (instances x features)')
        if features.shape[0] != labels.shape[0]:
            raise ValueError('X_train and y_train must contain the same number of instances')
        if labels.shape[0] == 0:
            raise ValueError('cannot fit Naive Bayes on an empty training set')

        self.classes = np.unique(labels)
        self.num_classes = len(self.classes)
        self.num_features = features.shape[1]
        self.total = int(labels.shape[0])

        # Prior P(Y): class frequency divided by the number of instances.
        # Integer class counts are kept for normalisation and smoothing.
        self._class_count = {}
        self.prior = {}
        for label in self.classes:
            class_key = f'Y = {label}'
            n_label = int((labels == label).sum())
            self._class_count[class_key] = n_label
            self.prior[class_key] = n_label / self.total

        # Count (feature, value) occurrences separately for each class and
        # remember which values each feature takes anywhere in the data.
        per_class = {class_key: {} for class_key in self.prior}
        seen_values = [set() for _ in range(self.num_features)]
        for row, label in zip(features, labels):
            bucket = per_class[f'Y = {label}']
            for j, value in enumerate(row):
                feature_key = f'X{j} = {value}'
                bucket[feature_key] = bucket.get(feature_key, 0) + 1
                seen_values[j].add(f'{value}')
        self._num_values = [len(values) for values in seen_values]

        # Likelihood P(X_j | Y): divide each count by its class size, so for
        # a fixed feature and class the stored values sum to 1.
        self.likelihood = {}
        for class_key, bucket in per_class.items():
            n_label = self._class_count[class_key]
            for feature_key, count in bucket.items():
                self.likelihood[f'{feature_key} | {class_key}'] = count / n_label

        return self.prior

    def ind_predict(self, x: list):
        """
        Predict the most likely class label for one instance.

        Scores are accumulated in log space so that products of many small
        probabilities do not underflow. The evidence P(X_1, ..., X_d) is not
        computed: it is the same for every class and therefore cannot change
        which class has the highest score.

        Args:
            x: feature vector of one instance (list, 1-D array or 1 x d
                matrix).

        Returns:
            The label, taken from ``self.classes``, with the highest
            posterior score. Ties go to the label that sorts first.

        Raises:
            RuntimeError: if the model has not been fitted.
            ValueError: if x does not have ``num_features`` values.
        """
        if not hasattr(self, '_class_count'):
            raise RuntimeError('fit must be called before predicting')
        values = np.asarray(x).ravel()
        if values.shape[0] != self.num_features:
            raise ValueError(
                f'expected {self.num_features} feature values, got {values.shape[0]}'
            )

        best_label, best_score = None, -float('inf')
        for label in self.classes:
            class_key = f'Y = {label}'
            n_label = self._class_count[class_key]
            score = np.log(self.prior[class_key])
            for j, value in enumerate(values):
                p = self.likelihood.get(f'X{j} = {value} | {class_key}')
                if p is None:
                    # Value never seen with this class: use the add-one
                    # estimate (0 + 1) / (N_y + V_j). It must be *added* in
                    # log space; scaling the running log-score would push it
                    # toward 0 and make the class look more likely.
                    p = 1.0 / (n_label + self._num_values[j])
                score += np.log(p)
            if score > best_score:
                best_label, best_score = label, score
        return best_label

    def predict(self, X):
        """
        Predict a label for every instance in X.

        Args:
            X: 2-D collection of test instances, one row per instance.

        Returns:
            A list with one predicted label per row of X.
        """
        return [self.ind_predict(row) for row in np.asarray(X)]


In [3]:
# UCI balance-scale data: the file has no header row, so column names are
# supplied explicitly (class label first, then the four features).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
col = [
    'class_name',
    'left_weight',
    'left_distance',
    'right_weight',
    'right_distance',
]
data = pd.read_csv(url, sep=',', names=col)

In [4]:
# Number of rows per class label in the full data set.
data['class_name'].value_counts()

R    288
L    288
B     49
Name: class_name, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# The first column is the label; every column after it is a feature.
X = np.matrix(data.iloc[:, 1:])
y = data['class_name']

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=88,
)

In [6]:
# Number of training rows per class label after the split.
y_train.value_counts()

L    202
R    184
B     32
Name: class_name, dtype: int64

In [7]:
# Distinct class labels present in the training split.
classes = np.unique(y_train)

In [9]:
# Train the classifier. fit() returns the learned class prior, which is
# shown as this cell's output because the call is the last expression.
clf = Naive_Bayes()
clf.fit(X_train, y_train)

{'Y = B': 0.07655502392344497,
 'Y = L': 0.48325358851674644,
 'Y = R': 0.44019138755980863}

In [10]:
# Convert the held-out labels to a NumPy array, then predict one label per
# test instance; y_hat is a list aligned with the rows of X_test.
y_test = np.array(y_test)
y_hat = clf.predict(X_test)

In [11]:
# Display the full list of predicted labels.
y_hat

['R',
 'L',
 'L',
 'R',
 'L',
 'R',
 'R',
 'R',
 'R',
 'R',
 'R',
 'R',
 'R',
 'L',
 'R',
 'R',
 'R',
 'L',
 'L',
 'R',
 'L',
 'L',
 'R',
 'L',
 'R',
 'R',
 'L',
 'R',
 'R',
 'R',
 'L',
 'R',
 'R',
 'R',
 'L',
 'L',
 'R',
 'R',
 'L',
 'L',
 'L',
 'L',
 'L',
 'R',
 'L',
 'R',
 'L',
 'L',
 'L',
 'R',
 'R',
 'L',
 'R',
 'L',
 'R',
 'L',
 'R',
 'R',
 'R',
 'R',
 'R',
 'R',
 'L',
 'R',
 'L',
 'R',
 'R',
 'R',
 'R',
 'L',
 'R',
 'L',
 'R',
 'R',
 'R',
 'R',
 'R',
 'L',
 'L',
 'R',
 'L',
 'R',
 'L',
 'L',
 'L',
 'L',
 'L',
 'R',
 'L',
 'R',
 'R',
 'R',
 'L',
 'L',
 'L',
 'R',
 'L',
 'R',
 'R',
 'L',
 'R',
 'L',
 'R',
 'R',
 'L',
 'R',
 'L',
 'R',
 'R',
 'R',
 'L',
 'L',
 'L',
 'R',
 'R',
 'R',
 'R',
 'R',
 'R',
 'L',
 'L',
 'R',
 'L',
 'R',
 'R',
 'L',
 'L',
 'L',
 'R',
 'L',
 'R',
 'L',
 'L',
 'L',
 'L',
 'L',
 'L',
 'R',
 'L',
 'L',
 'R',
 'L',
 'R',
 'R',
 'L',
 'R',
 'L',
 'R',
 'L',
 'L',
 'R',
 'L',
 'L',
 'R',
 'R',
 'L',
 'R',
 'R',
 'L',
 'R',
 'R',
 'L',
 'R',
 'R',
 'R',
 'L',
 'L'

Overall Accuracy

In [12]:
# Fraction of test instances whose predicted label equals the true label.
# Taking the mean of the element-wise match avoids hard-coding the size of
# the test set, so the cell stays correct if the split changes.
(np.asarray(y_hat) == np.asarray(y_test)).mean()  # you should get something like 0.88

0.8840579710144928

In [13]:
# Sanity check: number of held-out labels.
len(y_test)

207

In [14]:
# Sanity check: number of predictions.
len(y_hat)

207