In [2]:
import numpy as np

In [3]:
def fit(x_train,y_train):
    """Build the count tables of a categorical Naive Bayes classifier.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Discrete feature values, one row per training sample.
    y_train : array-like of shape (n_samples,)
        Class label of every row of ``x_train``.

    Returns
    -------
    dict
        ``result["data_count"]`` is the number of training samples.  For each
        class label ``c`` present in ``y_train``:

        * ``result[c]["total_count"]`` is the number of samples of class ``c``;
        * ``result[c][j][v]`` (``j`` is the 1-based feature index) is the
          number of samples of class ``c`` whose feature ``j`` equals ``v``.
          Every value that occurs anywhere in column ``j`` of ``x_train`` gets
          an entry, with count 0 if class ``c`` never takes it.

        With no training samples only ``{"data_count": 0}`` is returned.
    """
    # Boolean masking and column slicing below need ndarray semantics, so
    # plain Python sequences are converted up front.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    # The sample count does not depend on the class, so it is stored once
    # (and is present even when there are no classes at all).
    result = {"data_count": len(y_train)}
    if len(y_train) == 0:
        return result

    # The set of values each feature can take is a property of the whole
    # training set; compute it once instead of once per class.
    num_features = x_train.shape[1]
    feature_values = [set(x_train[:, j]) for j in range(num_features)]

    for current_class in set(y_train):
        current_class_rows = (y_train == current_class)
        x_train_current = x_train[current_class_rows]
        class_counts = {"total_count": len(x_train_current)}
        for i, values in enumerate(feature_values, start=1):
            feature_column = x_train_current[:, i - 1]
            class_counts[i] = {
                value: (feature_column == value).sum() for value in values
            }
        result[current_class] = class_counts
    return result


def probability(dictionary,x ,current_class):
    """Return the unnormalised log-probability of ``current_class`` given ``x``.

    The score is ``log P(class) + sum_j log P(x_j | class)`` where each
    conditional uses add-one smoothing:
    ``(count(class, x_j) + 1) / (count(class) + number of known values of j)``.

    Parameters
    ----------
    dictionary : dict
        Count tables in the layout produced by ``fit``.
    x : sequence
        Feature values of one sample; ``x[j - 1]`` is feature ``j``.
    current_class : hashable
        Class label to score; must be a key of ``dictionary``.

    Returns
    -------
    float
        Log-score; only meaningful relative to the scores of other classes.
    """
    class_counts = dictionary[current_class]
    output = np.log(class_counts["total_count"]) - np.log(dictionary["data_count"])
    # Every key of the per-class table except "total_count" is a feature index.
    num_features = len(class_counts) - 1
    for i in range(1, num_features + 1):
        value_counts = class_counts[i]
        # A value that never occurred in training has no entry in the table.
        # Count it as zero so the +1 smoothing applies instead of a KeyError.
        count_current_class_with_xi = value_counts.get(x[i - 1], 0) + 1
        count_current_class = class_counts["total_count"] + len(value_counts)
        output = output + np.log(count_current_class_with_xi) - np.log(count_current_class)
    return output


def predict_single_point(dictionary, x):
    """Return the class label with the highest log-score for one sample.

    Parameters
    ----------
    dictionary : dict
        Count tables in the layout produced by ``fit``; the bookkeeping key
        ``"data_count"`` is ignored when enumerating classes.
    x : sequence
        Feature values of the sample to classify.

    Returns
    -------
    hashable
        The best-scoring class label.  On a tie the class that comes first in
        ``dictionary`` wins.

    Raises
    ------
    ValueError
        If ``dictionary`` contains no class entries.
    """
    candidates = [label for label in dictionary if not (label == "data_count")]
    if not candidates:
        raise ValueError("model contains no classes; fit it on non-empty training data first")
    # max() keeps the first maximal element, matching a strict ">" scan.
    return max(candidates, key=lambda label: probability(dictionary, x, label))
        
    
    

def predict(dictionary, x_test):
    """Classify every sample of ``x_test`` with the fitted count tables.

    Iterates over ``x_test`` row by row and returns a list holding the label
    chosen by ``predict_single_point`` for each row, in the same order.
    """
    return [predict_single_point(dictionary, sample) for sample in x_test]
        

In [4]:
def make_labelled(column):
    """Discretise a numeric column in place into the labels 0, 1, 2 and 3.

    The thresholds are derived from the column mean ``m``; each element is
    tested against them in this order and receives the first matching label:

    * ``value < 0.5 * m``  -> 0
    * ``value < m``        -> 1
    * ``value < 1.5 * m``  -> 2
    * otherwise            -> 3

    Parameters
    ----------
    column : numpy.ndarray
        Numeric values; overwritten with the labels (cast to its own dtype).

    Returns
    -------
    The same ``column`` object, after it has been overwritten.
    """
    second_limit = column.mean()
    first_limit = 0.5 * second_limit
    third_limit = 1.5 * second_limit

    # All conditions are evaluated on the original values before anything is
    # written back.  np.select picks the first true condition per element,
    # which reproduces the if/elif priority for any ordering of the limits.
    values = np.asarray(column)
    labels = np.select(
        [values < first_limit, values < second_limit, values < third_limit],
        [0, 1, 2],
        default=3,
    )
    # Slice assignment keeps the in-place contract: callers passing a view
    # (e.g. one column of a matrix) see the underlying data updated.
    column[:] = labels
    return column

from sklearn import model_selection
from sklearn import datasets
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Load the iris data set; `x` is the feature matrix and `y` the class labels.
iris = datasets.load_iris()
# NOTE: `x` is the same array object as `iris.data`, not a copy.
x = iris.data
y = iris.target


# Replace every feature column with its discrete 0-3 label.  make_labelled
# writes into the column it is given, so this overwrites `x` (and therefore
# `iris.data`) in place.
for i in range(0,x.shape[-1]):
    x[:,i] = make_labelled(x[:,i])

# Hold out a test set; the fixed random_state keeps the split repeatable.
x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,random_state = 0)

# Fit the count tables on the training part and classify the held-out rows.
dictionary = fit(x_train,y_train)
predictions1 = predict(dictionary,x_test)

# Report per-class precision/recall and the confusion matrix on the test set.
print(classification_report(y_test,predictions1))
print(confusion_matrix(y_test,predictions1))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [5]:
import pickle