In [1]:
import pandas as pd
import numpy as np

In [2]:
from math import e, pi, sqrt


def matrix_measures(df):
    '''Creates a matrix of measures, each element in the matrix is an array of two elements:
    the first one is the mean and the second is the standart deviation.
    We have an array for each column in the data-frame
    '''
    matrix = []
    for i, column in enumerate(df.columns):
        if i == len(df.columns) - 1:
            return matrix
        matrix.append([ df[column].mean(), df[column].std() ])
    
    
def normal_distribution(c, mean, std):
    '''Normal distribution equation. '''
    return (1 / sqrt(2 * pi * std)) * (e ** (-0.5 * (((c - mean) / std)**2)))

def naive_bayes_training(df):
    '''
        df: dataframe,
        x: sample vector to clasify
    '''
    n_columns = df.shape[1] # get the number of columns (rows x columns)
    classes = df.iloc[:, n_columns - 1] # select the last column (we assume it has the labels of the classes)
    classes = np.unique(classes) # get the classes, that is the different values
    df = np.array_split(df, len(classes)) # split the data frame in the number of classes
    
    
    # hash-table, the keys are the classes and the values are the matrix_measures (means and stds)
    measures = {}
    for i in range(len(df)):
        measures[i] = matrix_measures(df[i])
    return measures
    

def naive_bayes(measures, x):
    # array of probabilities, each per class 
    probabilities = []
    for k, v in measures.items():
        p = 1
        for i, c in enumerate(x):
            p *= normal_distribution(c, measures[k][i][0], measures[k][i][1])
        probabilities.append(p)
    
    max_p = probabilities[0]
    result = 0 # index of the class
    
    # find the max probabilitie
    for i, p in enumerate(probabilities):
        if p > max_p:
            max_p = p
            result = i
    return result

In [5]:
df = pd.read_csv('./iris.data', header=None)

In [6]:
# training dataset
setosa = df[:45]
versicolor = df[50:95]
virginica = df[100:145]
frames = [setosa, versicolor, virginica]
training = pd.concat(frames)

# testing dataset
setosa = df[45:50]
versicolor = df[95:100]
virginica = df[145:150]
frames = [setosa, versicolor, virginica]
testing = pd.concat(frames)

In [10]:
measures = naive_bayes_training(training)

corrects = 0
# testing 
for i in range(15):
    sample = testing.iloc[i, :4]
    result = naive_bayes(measures, sample)
    if result == 0 and testing.iloc[i, 4] == 'Iris-setosa':
        corrects += 1 
    if result == 1 and testing.iloc[i, 4] == 'Iris-versicolor':
        corrects += 1 
    if result == 2 and testing.iloc[i, 4] == 'Iris-virginica':
        corrects += 1 

corrects

15

In [18]:
measures = naive_bayes_training(training)
print(len(training))

training = df
corrects = 0
# testing 
for i in range(len(training)):
    sample = training.iloc[i, :4]
    result = naive_bayes(measures, sample)
    if result == 0 and training.iloc[i, 4] == 'Iris-setosa':
        corrects += 1 
    if result == 1 and training.iloc[i, 4] == 'Iris-versicolor':
        corrects += 1 
    if result == 2 and training.iloc[i, 4] == 'Iris-virginica':
        corrects += 1 

corrects, corrects/len(training) * 100

150


(144, 96.0)