# Naive Bayes Classifier

# Importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
    #to split the dataset into training set and testing set
from sklearn import metrics
    #to calculate accuracy and precision of classification
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset

We used the `cars` dataset available from the UCI Machine Learning Repository, [link](https://archive.ics.uci.edu/ml/datasets/Car+Evaluation). We took the `car.data` file (a CSV), added the column names as its first row so that the data is easy to parse, and renamed it `car.csv`.
The dataset consists of some categorical variables describing used cars, and `label` indicates their current condition. More details of the attributes are available on the dataset page.

We read in the data below and get a general idea of the dataset.

In [None]:
df = pd.read_csv("car.csv")
df.head(3) # some sample rows

In [None]:
df.shape # 1728 rows, with 6 attributes and 1 label each

In [None]:
df.columns    # attributes

`label` is our target field. Let's have a look at the kinds of labels.

`unacc` -> unacceptable  |  `acc`   -> acceptable  |  `vgood` -> very good  |  `good`  -> good

In [None]:
df['label'].unique()

In [None]:
df['label'].value_counts()

# Data Pre-processing

Since we want to use the naive Bayes classifier for binary classification, and since the number of `unacc` values is very large, we choose to bin `acc`, `good` and `vgood` into a single category.

Next, for convenience, we will encode the label `unacc` as `0` and `acc`, `good`, `vgood` as `1`.

In [None]:
# encode the target as binary: 'unacc' -> 0, every other label -> 1
# (a 0/1 target makes accuracy and precision easy to calculate)
encoded = [0 if value == 'unacc' else 1 for value in df['label']]

labels = pd.DataFrame(encoded, columns=['label']) # wrap the list of codes in a single-column dataframe

Let's have a look at the new target label distribution

In [None]:
labels['label'].value_counts()

In [None]:
features = df.drop(["label"], axis=1) #dropping label column to make input argument for the splitting function
features.head(3)

# Train-Test Split

We opt for a standard 80-20 train-test split. Since our data is skewed, we do a stratified split so that the train and test sets keep the same proportion of +ve and -ve classes.

In [None]:
# stratified 80-20 split; random_state is fixed so the split is repeatable
split_parts = train_test_split(features, labels, test_size=0.2, random_state=0, stratify=labels)
train_features, test_features, train_labels, test_labels = split_parts

# renumber every part from 0 upwards, dropping the shuffled
# row numbers inherited from the full dataset
for part in split_parts:
    part.reset_index(drop=True, inplace=True)

In [None]:
train_features.shape

In [None]:
test_features.shape

Checking that the +ve and -ve classes are distributed the same way in both splits: in both the train and test sets, we have an approx. 2:1 ratio of -ve to +ve classes.

In [None]:
train_labels['label'].value_counts()

In [None]:
test_labels['label'].value_counts()

We join the labels back to our train set, so as to make our algorithm implementation easier.

In [None]:
train = pd.concat([train_features, train_labels], axis=1)
train.head(3)

# Naive Bayes Classifier Implementation

In [None]:
class Naive_Bayes():
    """Naive Bayes classifier for categorical attributes and a binary target.

    The model is fitted when the object is constructed: the training rows
    are split by class, the class priors are computed, and the number of
    occurrences of every attribute value is counted per class.
    """

    # substituted for a probability of exactly 0, so that one unseen value
    # does not wipe out the whole product of probabilities
    _MIN_PROB = 1e-9

    def __init__(self, data):
        """Fit the classifier on ``data``.

        data: DataFrame whose last column is the target, encoded as 0 or 1;
              all other columns are treated as attributes.
        """
        target = data.iloc[:, -1]
        # we store the -ve and +ve instances separately: (class 0, class 1)
        self.data = (data[target == 0], data[target == 1])
        # calculate and store the priors
        self.priors = self._calc_priors()
        # counts of the different values of each attribute, separately for
        # -ve and +ve; one dict (column name -> value counts) per class
        self.counts = self._calc_counts()

    # calculates the prior probabilities ie P(0) and P(1) from the whole dataset
    def _calc_priors(self):
        """Return ``(neg_prior, pos_prior)``: the fraction of rows in each class."""
        total = len(self.data[0]) + len(self.data[1])
        pos_prior = len(self.data[1])/total
        neg_prior = len(self.data[0])/total
        return neg_prior, pos_prior

    # calculates count of occurrences, of unique values, of each attribute
    # for each class separately
    def _calc_counts(self):
        """Return a tuple ``(neg_counts, pos_counts)``.

        Each element is a dict mapping a column name to the
        ``value_counts()`` Series of that column within the class.
        """
        # DataFrame.items() instead of iteritems(): the latter was removed
        # in pandas 2.0
        return tuple(
            {name: col.value_counts() for name, col in subset.items()}
            for subset in self.data
        )

    # calculates a score for both classes, for each row in data
    def get_probs(self, data):
        """Return one ``[neg_score, pos_score]`` pair per row of ``data``.

        data: DataFrame containing (at least) the attribute columns the
              classifier was fitted on.

        Each score is prior * product of P(value | class), divided by the
        product of the marginal value frequencies. Because the divisor uses
        the independence assumption too, and zero probabilities are replaced
        by a small floor, the two scores of a row are not guaranteed to sum
        to 1; the divisor is shared by both classes, so it does not affect
        which class scores higher.
        """
        class_sizes = [subset.shape[0] for subset in self.data]
        total = class_sizes[0] + class_sizes[1]
        # iterate over all columns except the label
        attrs = self.data[0].columns[:-1]

        # for storing the scores of each class, for each data point
        result = []
        for _, row in data.iterrows():
            # for the denominator of the bayes theorem
            denom = 1
            # will eventually store the actual scores
            probs = [1, 1]
            for attr in attrs:
                value = row[attr]
                value_counts = [0, 0]
                # for each class, calculate conditional prob of attr
                for class_index in range(2):
                    attr_counts = self.counts[class_index][attr]
                    # look the value up as-is (no str() conversion), so
                    # numeric attribute values are matched as well
                    if value in attr_counts.index:
                        value_counts[class_index] = attr_counts.loc[value]
                    if class_sizes[class_index]:
                        cond_prob = value_counts[class_index] / class_sizes[class_index]
                    else:
                        # class has no training rows: nothing to divide by
                        cond_prob = 0
                    if cond_prob == 0:
                        cond_prob = self._MIN_PROB
                    probs[class_index] *= cond_prob
                # multiplying, naive assumption, all are independent events
                denom *= (value_counts[0] + value_counts[1]) / total
                if denom == 0:
                    denom = self._MIN_PROB

            for class_index in range(2):
                # bayes theorem
                probs[class_index] *= (self.priors[class_index]/denom)

            result.append(probs)
        return result

    # used for prediction, offloads calculation to get_probs
    # then checks which class has the higher score and returns that label
    def predict(self, data):
        """Return a float numpy array with the predicted label (0 or 1) per row.

        Label 0 is chosen only when its score is strictly higher; ties go
        to label 1.
        """
        probs = self.get_probs(data)
        return np.array([0 if neg > pos else 1 for neg, pos in probs], dtype=float)

# Training

We'll train our classifier on the train data that we separated earlier.

In [None]:
train.shape

In [None]:
naive_bayes = Naive_Bayes(train)  #initialize class

# Testing

We'll test the model by obtaining predictions on our previously made test set and store the labels in `pred_labels`

In [None]:
test_features.shape

In [None]:
pred_labels = naive_bayes.predict(test_features)  # predict a 0/1 label for every row of the test features

# Results and Metrics

Our predicted values are skewed, but that is expected as our test data is skewed, similar to the dataset

In [None]:
test_labels['label'].value_counts()

In [None]:
np.unique(pred_labels, return_counts=True)

To get a better idea of which cases our model is getting wrong, we'll generate the confusion matrix

In [None]:
conf_mat = metrics.confusion_matrix(test_labels, pred_labels)

In [None]:
# pretty-plots a 2x2 confusion matrix as an annotated heatmap
def plot_conf_matrix(cf):
    """Show ``cf`` as a heatmap; each cell is annotated with its name,
    its count, and its share of the sum of all cells."""
    cell_titles = ('True Neg', 'False Pos', 'False Neg', 'True Pos')
    flat_counts = cf.flatten()
    flat_shares = flat_counts / np.sum(cf)
    # one three-line annotation per cell: name, count, percentage
    annotations = np.asarray([
        f"{title}\n{count:0.0f}\n{share:.2%}"
        for title, count, share in zip(cell_titles, flat_counts, flat_shares)
    ]).reshape(2, 2)
    sns.heatmap(cf, annot=annotations, fmt='', cmap='viridis')
    plt.show()

Interestingly, our simple model is getting most of the test points correct. __Note__: The percentages are based on the total number of test points.

In [None]:
plot_conf_matrix(conf_mat)

The accuracy of our model is very good considering it's a simple naive Bayes classifier.

In [None]:
acc = metrics.accuracy_score(test_labels, pred_labels)
acc   #accuracy measure = true cases/total cases

In [None]:
prec = metrics.precision_score(test_labels, pred_labels)
prec   # precision = true positives / (true positives + false positives), i.e. over all predicted positives

In [None]:
rec = metrics.recall_score(test_labels, pred_labels)
rec   #recall measure = true positive/(true positive + false negative) 