In [4]:
import pandas as pd
import numpy as np

In [5]:
def matrix_measures(df):
    '''Build the per-feature statistics of a data-frame.

    Every column except the last one is treated as a feature (the last
    column is assumed to hold the class labels and is skipped).

    Parameters:
        df: pandas DataFrame whose last column is the label column.

    Returns:
        A list with one ``[mean, standard deviation]`` pair per feature
        column, in column order. The list is empty when the frame has
        no feature columns (zero or one column in total).
    '''
    # Slicing off the last column up front guarantees a list is always
    # returned, even for a frame without any columns.
    feature_columns = df.columns[:-1]
    return [[df[column].mean(), df[column].std()] for column in feature_columns]
    
    
def normal_distribution(c, mean, std):
    '''Probability density of the normal (Gaussian) distribution.

    Evaluates
        1 / (std * sqrt(2 * pi)) * exp(-0.5 * ((c - mean) / std) ** 2)

    Parameters:
        c: point at which the density is evaluated.
        mean: mean of the distribution.
        std: standard deviation of the distribution (must be non-zero).

    Returns:
        The density value at ``c``.
    '''
    from math import e, pi, sqrt
    # The normalising constant is 1 / sqrt(2 * pi * std**2), i.e. the
    # standard deviation enters linearly, not under the square root.
    z = (c - mean) / std
    return (1 / (std * sqrt(2 * pi))) * (e ** (-0.5 * (z ** 2)))


def naive_bayes(df, x):
    '''Classify one sample with a Gaussian naive Bayes model fitted on df.

    For every class the score is the class prior (its share of the rows)
    multiplied by the normal density of each feature value of ``x``,
    using that class's per-feature mean and standard deviation. The class
    with the highest score wins (the first one on ties).

    Parameters:
        df: dataframe whose last column holds the class labels and whose
            other columns hold the numeric features.
        x: sample vector to classify, one value per feature column, in
            column order.

    Returns:
        The label of the most probable class. The label is also printed.

    Raises:
        ValueError: if df has no rows, so there is no class to learn from.
    '''
    labels = df.iloc[:, -1]  # the last column is assumed to hold the class labels
    classes = np.unique(labels)  # the distinct class labels, sorted
    if len(classes) == 0:
        raise ValueError('df has no rows, cannot fit any class')

    total_rows = len(df)

    # one score per class, in the same order as `classes`
    probabilities = []
    for label in classes:
        # Select rows by label value rather than by position, so the
        # statistics are right even when rows are unsorted or the classes
        # have different sizes.
        class_rows = df[labels == label]
        # per-feature [mean, std] pairs for this class
        measures = matrix_measures(class_rows)

        p = len(class_rows) / total_rows  # class prior
        for i, c in enumerate(x):
            p *= normal_distribution(c, measures[i][0], measures[i][1])
        probabilities.append(p)

    # index of the highest score; max() keeps the first one on ties
    best = max(range(len(probabilities)), key=probabilities.__getitem__)
    predicted = classes[best]
    print(predicted)
    return predicted

In [4]:
# TODO: compute the per-class measures matrix only once when a whole set of samples
# has to be classified, instead of recomputing it on every naive_bayes call.

# Load the data set. header=None: the first line of the file is read as data and the
# columns get integer names.
df = pd.read_csv('./iris.data', header=None, encoding='utf-8')
# Sample to classify: row 108, first four columns only.
x = df.iloc[108, :4]

In [14]:
# Classify the selected sample; naive_bayes prints the predicted class label.
naive_bayes(df, x)

Iris-virginica


In [5]:
# Preview the first 30 rows of the loaded data.
df.head(30)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa
