In [1]:
import numpy as np
import pandas as pd

In [11]:
from math import e, pi, sqrt

def matrix_measures(df):
    '''Creates a matrix of measures, each element in the matrix is an array of two elements:
    the first one is the mean and the second is the standart deviation.
    We have an array for each column in the data-frame
    '''
    matrix = []
    for i, column in enumerate(df.columns):
        if i == len(df.columns) - 1:
            return matrix
        matrix.append([ df[column].mean(), df[column].std() ])
        
        
def normal_distribution(c, mean, std):
    '''Normal distribution equation. '''
    if std == 0:
        return 0.000001
    return (1 / sqrt(2 * pi * std)) * (e ** (-0.5 * (((c - mean) / std)**2)))


def naive_bayes_training(df):
    '''
        df: dataframe of training
    '''
    n_columns = df.shape[1] # get the number of columns (rows x columns)
    classes = df.iloc[:, n_columns - 1] # select the last column (we assume it has the labels of the classes)
    classes = np.unique(classes) # get the classes, that is the different values
    
    df = np.array_split(df, len(classes)) # split the data frame in the number of classes
    
    # hash-table, the keys are the classes and the values are the matrix_measures (means and stds)
    measures = {}
    for i in range(len(df)):
        measures[i] = matrix_measures(df[i])
    
    return measures


def naive_bayes(measures, x):
    # array of probabilities, each per class 
    probabilities = []
    for k, v in measures.items():
        p = 1
        for i, c in enumerate(x):
            nd_result = normal_distribution(c, measures[k][i][0], measures[k][i][1])
            if (nd_result * p) < 0.01:
                p *= 1
            else:
                p *= nd_result
        probabilities.append(p)
            
    if probabilities[0] > probabilities[1]:
        return 1.0 # No spam
    else:
        return 2.0 # Spam

In [12]:
# set 1
training = pd.read_csv('./1/tr_ems_countvectorizer.csv', header=None)
testing = pd.read_csv('./1/te_ems_countvectorizer.csv', header=None)
n_columns = testing.shape[1]

In [13]:
measures = naive_bayes_training(training)

corrects = 0
for i in range(20):
    sample = testing.iloc[i, :n_columns - 1]
    result = naive_bayes(measures, sample)
    if result == testing.iloc[i, n_columns - 1]:
        corrects += 1
        
corrects, corrects/20

(5, 0.25)

In [14]:
corrects = 0
for i in range(len(training)):
    sample = training.iloc[i, :n_columns - 1]
    result = naive_bayes(measures, sample)
    if result == training.iloc[i, n_columns - 1]:
        corrects += 1
        
corrects, corrects/len(training)

(49, 0.49)

In [5]:
training.shape

(100, 2220)

In [6]:
len(training)

100

In [17]:
training.head(20).c

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2210,2211,2212,2213,2214,2215,2216,2217,2218,2219
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
