In [1]:
import numpy as np
from sklearn import datasets, model_selection

In [2]:
# Load the iris dataset and split the bundle into its feature and target arrays.
iris_data = datasets.load_iris()
X, Y = iris_data.data, iris_data.target

In [3]:
def discreteVal(x):
    """Bin every column of ``x`` into four levels (0-3) relative to its mean.

    For each column the thresholds are ``mean/2``, ``mean`` and ``3*mean/2``.
    A value is mapped to the index of the first threshold it is strictly
    below, or to 3 when it is below none of them.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array indexed as ``x[row, column]``.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``x`` holding the bin
        codes. ``x`` itself is not modified, so calling the function again
        on the same input yields the same result.
    """
    binned = np.empty_like(x)
    for j in range(x.shape[-1]):
        column = x[:, j]
        mean_val = column.mean()
        # np.select takes the first condition that matches, which mirrors an
        # if/elif chain; values matching no condition get the default (3).
        binned[:, j] = np.select(
            [column < mean_val / 2, column < mean_val, column < 3 * mean_val / 2],
            [0, 1, 2],
            default=3,
        )
    return binned

In [4]:
# Pass a copy so the raw features in X are preserved and this cell gives the
# same result when re-run, regardless of whether discreteVal works in place.
X_discrete = discreteVal(X.copy())

In [5]:
# Split the discretized features and labels into train and test parts
# (test_size=0.25, fixed random_state), then show the resulting shapes.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X_discrete,
    Y,
    test_size=0.25,
    random_state=0,
)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(112, 4) (38, 4)
(112,) (38,)


In [6]:
def fit(x,y):
    """Build the count table used by the naive Bayes ``predict`` function.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array of feature values, indexed as ``x[sample, feature]``.
    y : numpy.ndarray
        1-D array of class labels, one per row of ``x``.

    Returns
    -------
    dict
        ``{"totalData": len(y), label: {"totalCount": n_label,
        1: {value: count, ...}, 2: {...}, ...}, ...}``.
        Feature keys are 1-based. For every feature, each value that occurs
        anywhere in ``x`` gets an entry, holding how many samples of that
        class have that value (possibly 0).
    """
    count_dict = {"totalData": len(y)}
    for label in set(y):
        in_class = (y == label)
        x_in_class = x[in_class]
        class_counts = {"totalCount": len(y[in_class])}
        for col in range(x.shape[1]):
            class_column = x_in_class[:, col]
            # Enumerate values over the whole column (not just this class) so
            # values absent from the class are recorded with a count of 0.
            class_counts[col + 1] = {
                value: (class_column == value).sum() for value in set(x[:, col])
            }
        count_dict[label] = class_counts
    return count_dict

In [7]:
def predict(count_dict, x_test):
    """Predict a class label for every row of ``x_test`` with naive Bayes.

    Scores are accumulated as log-probabilities: the log of the class prior
    (``totalCount / totalData``) plus, for each feature, the log of the
    add-one smoothed likelihood
    ``(count + 1) / (totalCount + number_of_known_values)``.

    Parameters
    ----------
    count_dict : dict
        Count table in the layout produced by ``fit``: a ``"totalData"``
        entry plus one entry per class holding ``"totalCount"`` and, under
        the 1-based feature index, a ``{value: count}`` mapping.
    x_test : iterable of sequences
        Rows to classify; ``row[j-1]`` is the value of feature ``j``.

    Returns
    -------
    list
        One predicted class label per row. When several classes tie, the one
        that appears first in ``count_dict`` wins. If ``count_dict`` holds no
        classes, every prediction is 0.
    """
    y_pred = []
    for x in x_test:
        best_log_prob = None
        best_class = 0
        for y_class, class_counts in count_dict.items():
            if(y_class == "totalData"):
                continue
            class_total = class_counts["totalCount"]
            log_prob = np.log(class_total/count_dict["totalData"])
            # Every key except "totalCount" is a feature index.
            n_features = len(class_counts) - 1
            for j in range(1, n_features+1):
                value_counts = class_counts[j]
                # A value never seen in training has count 0; the +1 smoothing
                # keeps its probability non-zero instead of raising KeyError.
                count_of_xj = value_counts.get(x[j-1], 0) + 1
                log_prob = log_prob + np.log(count_of_xj/(class_total + len(value_counts)))
            # Strict comparison: on a tie the earlier class is kept.
            if best_log_prob is None or log_prob > best_log_prob:
                best_log_prob = log_prob
                best_class = y_class
        y_pred.append(best_class)
    return y_pred

In [8]:
# Build the count table from the training split.
c_dict = fit(x=x_train, y=y_train)

In [9]:
# Classify the test rows using the counts gathered from the training split.
y_pred = predict(count_dict=c_dict, x_test=x_test)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

In [11]:
# Compare the predictions with the true test labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

   micro avg       0.97      0.97      0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

