In [27]:
import numpy as np

In [28]:
def fit(xtrain,ytrain):
    """Build the count tables of a categorical naive Bayes model.

    Parameters
    ----------
    xtrain : array-like, 2-D (rows are samples, columns are features)
        Discrete feature values.
    ytrain : array-like, 1-D
        Class label of each row of ``xtrain``.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training rows. For every
        class label ``c``, ``result[c]["total_count"]`` is the number of rows
        of that class and ``result[c][j][v]`` is the number of rows of class
        ``c`` whose feature ``j`` (1-based index) equals ``v``. Every value
        that occurs anywhere in feature ``j`` gets an entry for every class,
        with a count of 0 where it never occurs for that class.
    """
    xtrain = np.asarray(xtrain)
    ytrain = np.asarray(ytrain)
    num_features = xtrain.shape[1]

    # Set unconditionally so the key exists even when there are no classes.
    result = {"total_data": len(ytrain)}

    # The distinct values of each feature do not depend on the class, so
    # compute them once instead of once per class.
    feature_values = [np.unique(xtrain[:, j]) for j in range(num_features)]

    # np.unique yields the labels in sorted order, which makes the insertion
    # order of the classes (and therefore tie-breaking downstream) stable.
    for current_class in np.unique(ytrain):
        xtrain_current = xtrain[ytrain == current_class]
        class_counts = {"total_count": len(xtrain_current)}
        for j, values in enumerate(feature_values, start=1):
            column = xtrain_current[:, j - 1]
            class_counts[j] = {value: (column == value).sum() for value in values}
        result[current_class] = class_counts
    return result

In [38]:
def probability(dictionary,x,current_class):
    """Return the unnormalised log-probability of ``current_class`` given ``x``.

    The result is ``log(prior) + sum_j log(P(x_j | class))`` where each
    conditional is Laplace-smoothed: one is added to the value count and the
    number of distinct values recorded for that feature is added to the class
    count.

    Parameters
    ----------
    dictionary : dict
        Count tables with a ``"total_data"`` entry and, per class, a
        ``"total_count"`` entry plus one value-count dict per 1-based feature
        index.
    x : sequence
        Feature values of one sample; ``x[j-1]`` is the value of feature ``j``.
    current_class : hashable
        Class label to score; must be a key of ``dictionary``.

    Returns
    -------
    float
        Log-probability up to an additive constant shared by all classes.
    """
    class_counts = dictionary[current_class]
    class_total = class_counts["total_count"]
    output = np.log(class_total) - np.log(dictionary["total_data"])
    # Every key except "total_count" is a feature index.
    num_features = len(class_counts.keys()) - 1
    for j in range(1, num_features + 1):
        value_counts = class_counts[j]
        xj = x[j - 1]
        # A value that never appeared in training has no entry; treat it as a
        # zero count so smoothing handles it instead of raising KeyError.
        count_current_class_with_value_xj = value_counts.get(xj, 0) + 1
        count_current_class = class_total + len(value_counts)
        current_xj_probability = np.log(count_current_class_with_value_xj) - np.log(count_current_class)
        output = output + current_xj_probability
    return output

In [39]:
def predictSinglePoint(dictionary,x):
    """Return the class label with the highest score for a single sample.

    Every key of ``dictionary`` other than ``"total_data"`` is treated as a
    class label and scored with ``probability``. On a tie the label that is
    iterated first wins. When there is no class to score, ``-1`` is returned.
    """
    best_class = -1
    best_p = None  # stays None until the first class has been scored
    for label in dictionary:
        if label == "total_data":
            continue
        score = probability(dictionary, x, label)
        if best_p is None or score > best_p:
            best_p, best_class = score, label
    return best_class

In [40]:
def predict(dictionary,xtest):
    """Return a list holding the predicted class of every sample in ``xtest``.

    Samples are classified one at a time with ``predictSinglePoint`` and the
    results are returned in the same order as the input.
    """
    return [predictSinglePoint(dictionary, sample) for sample in xtest]

In [41]:
def makelabelled(column):
    """Discretise a numeric column in place into the labels 0, 1, 2 and 3.

    The cut points are derived from the mean ``m`` of the column as it is on
    entry: ``m/2``, ``m`` and ``1.5*m``. Each element gets the label of the
    first test it passes, in this order: ``< m/2`` -> 0, ``< m`` -> 1,
    ``< 1.5*m`` -> 2, otherwise 3.

    Parameters
    ----------
    column : numpy.ndarray
        1-D array (a view such as ``X[:, i]`` works); it is overwritten.

    Returns
    -------
    numpy.ndarray
        The same ``column`` object, now holding the labels.
    """
    second_limit = column.mean()
    first_limit = second_limit/2
    third_limit = 1.5*second_limit
    # np.select uses the first matching condition, which mirrors the original
    # if/elif chain. All comparisons are evaluated on the untouched values
    # before anything is written back.
    labels = np.select(
        [column < first_limit, column < second_limit, column < third_limit],
        [0, 1, 2],
        default=3,
    )
    # Write through the slice so callers holding a view see the new labels.
    column[:] = labels
    return column

In [42]:
from sklearn import datasets

# Load the iris dataset and bind its data and target arrays.
# X refers to the same object as iris.data (no copy is made), so in-place
# changes to X are also visible through iris.data.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [43]:
# Replace every column of X with the labels produced by makelabelled.
# NOTE: X is modified in place, so running this cell a second time relabels
# the already-discretised values instead of the raw measurements.
for i in range(X.shape[-1]):
    X[:, i] = makelabelled(X[:, i])

In [44]:
from sklearn import model_selection

# Split into train and test parts; test_size=0.25 and a fixed random_state
# are passed so the split is the same on every run.
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [45]:
# Build the per-class count tables from the training split.
dictionary = fit(xtrain,ytrain)

In [46]:
# Predict one class label per row of the test split (returned as a list).
ypred = predict(dictionary,xtest)

In [47]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the predictions in ypred with the true test labels.
print(classification_report(ytest, ypred))
print(confusion_matrix(ytest, ypred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.94      1.00      0.97        16
          2       1.00      0.89      0.94         9

avg / total       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [48]:
# Keys of the fitted model: the class labels plus the "total_data" entry.
classes = dictionary.keys()

In [50]:
from sklearn.naive_bayes import GaussianNB

# Baseline: scikit-learn's GaussianNB fitted and evaluated on the same
# xtrain/xtest split as the hand-written classifier.
clf = GaussianNB()
clf.fit(xtrain, ytrain)
# NOTE: this rebinds ypred, replacing the predictions stored by the earlier cell.
# Predict once; the duplicated clf.predict(xtest) call was redundant.
ypred = clf.predict(xtest)
print(classification_report(ytest, ypred))
print(confusion_matrix(ytest, ypred))

             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.76      1.00      0.86        16
          2       1.00      0.67      0.80         9

avg / total       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]
