In [23]:
import numpy as np
import pandas as pd

In [248]:
def matrix_measures(df):
    '''Creates a matrix of measures, each element in the matrix is an array of two elements:
    the first one is the mean and the second is the standart deviation.
    We have an array for each column in the data-frame
    '''
    matrix = []
    for i, column in enumerate(df.columns):
        if i == len(df.columns) - 1:
            return matrix
        matrix.append([ df[column].mean(), df[column].std()])
        
        
def normal_distribution(c, mean, std):
    '''Normal distribution equation. '''
    from math import e, pi, sqrt
    if std == 0:
        return 1
    return (1 / sqrt(2 * pi * std)) * (e ** (-0.5 * (((c - mean) / std)**2)))


def naive_bayes(df, x):
    '''
        df: dataframe,
        x: sample vector to clasify
    '''
    n_columns = df.shape[1] # get the number of columns (rows x columns)
    classes = df.iloc[:, n_columns - 1] # select the last column (we assume it has the labels of the classes)
    classes = np.unique(classes) # get the classes, that is the different values

    df = np.array_split(df, len(classes)) # split the data frame in the number of classes
    
    
    # hash-table, the keys are the classes and the values are the matrix_measures (means and stds)
    measures = {}
    for i in range(len(df)):
        measures[i] = matrix_measures(df[i])
    
    
    # array of probabilities, each per class 
    probabilities = []
    for k, v in measures.items():
        p = 1
        for i, c in enumerate(x):
            nd_result = normal_distribution(c, measures[k][i][0], measures[k][i][1])
            if (nd_result * p) < 0.00000000000001:
                p *= 1
            else:
                p *= nd_result
        probabilities.append(p)
    
    #print(probabilities)
    max_p = probabilities[0]
    result = 0 # index of the class
    
    # find the max probabilitie
    for i, p in enumerate(probabilities):
        if p > max_p:
            max_p = p
            result = i
            
    print(probabilities)
    # print the class
    if classes[result] == 1.0:
        print('No spam')
    else:
        print('Spam')

In [249]:
training = pd.read_csv('./tr_ems_countvectorizer.csv', header=None)
test = pd.read_csv('./te_ems_countvectorizer.csv', header=None)

In [250]:
a = test.iloc[0, :n_columns-1] # sample to classify. No Spam
b = test.iloc[7, :n_columns-1] # sample to classify
c = test.iloc[13, :n_columns-1] # sample to classify
d = test.iloc[18, :n_columns-1] # sample to classify

naive_bayes(training, a), naive_bayes(training, b), naive_bayes(training, c), naive_bayes(training, d)  

[1.1180256078492951e-14, 1.0424909159211295e-14]
No spam
[1.0702028959107658e-14, 1.0414806384112498e-14]
No spam
[1.0853933724511517e-14, 1.1145978865083568e-14]
Spam
[1.0901846223285753e-14, 1.0335365362690083e-14]
No spam


(None, None, None, None)

In [217]:
for i in range(20):
    sample = test.iloc[i, n_columns-1]
    print(sample)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
