In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
raw_dataset = pd.read_csv("/data/toy_dataset.csv")

In [7]:
raw_dataset.head(10)

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No
5,6,Dallas,Female,36,50786.0,No
6,7,Dallas,Female,32,33155.0,No
7,8,Dallas,Male,39,30914.0,No
8,9,Dallas,Male,51,68667.0,No
9,10,Dallas,Female,30,50082.0,No


In [8]:
raw_dataset.describe()

Unnamed: 0,Number,Age,Income
count,150000.0,150000.0,150000.0
mean,75000.5,44.9502,91252.798273
std,43301.414527,11.572486,24989.500948
min,1.0,25.0,-654.0
25%,37500.75,35.0,80867.75
50%,75000.5,45.0,93655.0
75%,112500.25,55.0,104519.0
max,150000.0,65.0,177157.0


In [9]:
raw_dataset.dropna(inplace=True)
len(raw_dataset)

150000

In [10]:
raw_dataset.drop(columns=['Number'], inplace=True)
raw_dataset.columns

Index([&#39;City&#39;, &#39;Gender&#39;, &#39;Age&#39;, &#39;Income&#39;, &#39;Illness&#39;], dtype=&#39;object&#39;)

In [11]:
# categorization

def discretization(attr_list):

    sorted_list = sorted(attr_list)

    '''
    Our Discretization Method:

    Q 0            Q 1            Q 2            Q 3             Q 4
    0 % --------- 25 % --------- 50 % --------- 75 % --------- 100 % 
    
    score - 0             1               2              3

    '''

    attr_map = dict()
    multiple = len(attr_list) / 4
    next = multiple

    for index in range(4):
        attr_map[index] = sorted_list[int(next-1)] 
        next += multiple

    result = []

    for val in attr_list:
        for i in range(4):
            if val <= attr_map[i]:
                result.append(i)
                break
    
    return result


def categoric_encoder(attr_list, attr_map = {}):

    i = 0

    if len(attr_map) == 0:
        attr_set = set(attr_list)
        for a in attr_set:
            attr_map[a] = i
            i += 1
        
    result = []
    
    for a in attr_list:
        try:
            result.append(attr_map[a])
        except:
            print("'" + a + "'")
    
    return result

In [12]:
'''
Index(['City', 'Gender', 'Age', 'Income', 'Illness'], dtype='object')
We need to discretize age & income as they are continuous attribute 
encode rest of the categorical attributes
'''
dataset = raw_dataset.copy()

dataset['City'] = categoric_encoder(dataset['City'])
dataset['Gender'] = categoric_encoder(dataset['Gender'], {'Female': 0, 'Male': 1})
dataset['Age'] = discretization(dataset['Age'])
dataset['Income'] = discretization(dataset['Income'])
dataset['Illness'] = categoric_encoder(dataset['Illness'], {'Yes': 1, 'No': 0})

In [13]:
dataset.head()

Unnamed: 0,City,Gender,Age,Income,Illness
0,1,1,1,0,0
1,1,1,2,0,0
2,1,1,1,0,0
3,1,1,1,0,0
4,1,1,2,0,0


## Naive Bayes Algorithm

In [14]:
'''
P(A & B) = P(A | B) X P(B)  OR  P(B | A) X P(A)

Hence
    P(Class) P(Attributes | Class) = P(Class | Attributes) * P(Attributes)

    Therefore,
    
    P(Class) = [ P(Class | Attributes) * P(Attributes) ] / P(Attributes | Class)

    P(C) = [ P(C | A1, A2, ..,An) * P(A1, A2, ..,An)] / P(A1, A2, ..,An | C)

    P(C) = P(C | A1) * P(C | A2) * ... * P(C | An) * P(A1) * P(A2) * ... * P(An)
            __________________________________________________________________
                      
                      P(A1 | C) * P(A2 | C) * ... * P(An | C)

'''

class NaiveBayesAlgorithm:

    def __init__(self):
        self.count = {}
        self.P = {}
        self.classes = None

    def initialize_count(self, X, Y):

        for x, y in zip(X, Y):
            '''
            x = [a1, a2, ..., an] Y = [C]
            '''
            for i, a in enumerate(x):

                self.count['C' + str(y) + '|' + 'A' + str(i) + str(a)] = self.count.get('C' + str(y) + '|' + 'A' + str(i) + str(a), 0) + 1

                self.count['A' + str(i) + str(a) + '|' + 'C' + str(y)] = self.count.get('A' + str(i) + str(a) + '|' + 'C' + str(y), 0) + 1
                
                self.count['A' + str(i) + str(a) + '_'] = self.count.get('A' + str(i) + str(a) + '_', 0) + 1

            self.count['C' + str(y) + '_'] = self.count.get('C' + str(y) + '_', 0) + 1

        return

    def initialize_probability(self, n):

        for k in self.count.keys():
            
            if k[-1] != '_':
                self.P[k] = self.count[k] / self.count[k[k.find('|') + 1:] + '_']
            
            else:
                self.P[k] = self.count[k] / n
        
        return

            

    def fit(self, X, Y):

        self.classes = sorted( list( set( list( Y.flatten() ))))
        
        self.initialize_count(X, Y)

        self.initialize_probability(X.shape[0])

        return
    
    
    def numerator(self, c, x):

        result = 1

        for i, a in enumerate(x):
            result *= self.P['A' + str(i) + str(a) + '_']
            result *= self.P['C' + str(c) + '|' + 'A' + str(i) + str(a)]
        
        return result
        
    
    def denominator(self, c, x):

        result = 1

        for i, a in enumerate(x):
            result *= self.P['A' + str(i) + str(a) + '|' + 'C' + str(c)]
        
        return result
        

    def predict(self, x):

        best_class = self.classes[0]
        best_score = 0

        for c in self.classes:

            n = self.numerator(c, x)
            d = self.denominator(c, x)
            score = n/d

            if (score > best_score):
                best_class = c
                best_score = score
        
        return best_class, best_score
        

    def evaluate(self, X, Y):
        correct = 0
        total = X.shape[0]

        for x, y in zip(X, Y):

            c, _ = self.predict(x)

            if(c == y):
                correct += 1
        
        accuracy = correct / total
        return accuracy



In [15]:
data = dataset.to_numpy()
X = data[:, :-1]
Y = data[:,  -1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [16]:
model = NaiveBayesAlgorithm()
model.fit(X_train, Y_train)

In [17]:
'''
Evaluating on the data it is trained
'''
model.evaluate(X_train, Y_train)

0.91865

In [18]:
'''
Evaluating on testing data
'''
model.evaluate(X_test, Y_test)

0.9207666666666666