In [1]:
import numpy as np

In [2]:
# Build a nested dictionary holding the counts needed to compute the Naive Bayes probabilities
def fit(x_train,y_train):
    """Count the class and per-feature value frequencies used by Naive Bayes.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Discrete (already binned / categorical) feature values.
    y_train : array-like of shape (n_samples,)
        Class label of every training row.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training rows. For each
        class label ``c``, ``result[c]["total_count"]`` is the number of rows
        of that class and ``result[c][j][v]`` is how many of those rows have
        value ``v`` in feature ``j`` (features are numbered from 1). Every
        value that occurs anywhere in a feature's column gets an entry for
        every class, possibly with a count of 0.
    """
    # Accept plain lists as well as arrays: the boolean mask and the column
    # slices below need ndarray semantics.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    num_features = x_train.shape[1]
    # The set of values a feature can take does not depend on the class, so
    # compute it once per column instead of once per (class, column) pair.
    possible_values = [set(x_train[:, col]) for col in range(num_features)]

    result = {"total_data": len(y_train)}
    for cur_class in set(y_train):
        class_rows = x_train[y_train == cur_class]
        # "total_count" is inserted first; the remaining keys are the
        # 1-based feature indices.
        class_counts = {"total_count": len(class_rows)}
        for j in range(1, num_features + 1):
            column = class_rows[:, j - 1]
            class_counts[j] = {
                cur_val: (column == cur_val).sum()
                for cur_val in possible_values[j - 1]
            }
        result[cur_class] = class_counts

    return result

In [3]:
def probability(result,x,cur_class):
    """Return the unnormalised log-posterior of ``cur_class`` for sample ``x``.

    Parameters
    ----------
    result : dict
        Count dictionary with the layout this function reads:
        ``result["total_data"]``, ``result[cur_class]["total_count"]`` and
        ``result[cur_class][j][value]`` for 1-based feature index ``j``.
    x : sequence
        Feature values of one sample; ``x[j-1]`` is the value of feature ``j``.
    cur_class : hashable
        Class label to score.

    Returns
    -------
    float
        ``log P(class) + sum_j log P(x_j | class)`` with add-one (Laplace)
        smoothing applied to every feature likelihood.
    """
    class_stats = result[cur_class]
    class_count = class_stats["total_count"]
    # Log prior of the class.
    output = np.log(class_count) - np.log(result["total_data"])
    # Every key except "total_count" is a feature index.
    num_features = len(class_stats) - 1
    for j in range(1, num_features + 1):
        value_counts = class_stats[j]
        # A value that never occurred in training has no entry; treat it as a
        # zero count so the Laplace +1 still yields a finite probability
        # instead of raising KeyError.
        count = value_counts.get(x[j - 1], 0) + 1
        # Laplace denominator: class size plus number of known feature values.
        total_count = class_count + len(value_counts)
        output = output + (np.log(count) - np.log(total_count))
    return output

In [4]:
def predictsingle(result,x):
    """Return the class label with the highest log-posterior for one sample.

    Parameters
    ----------
    result : dict
        Count dictionary; every key other than ``"total_data"`` is treated as
        a class label.
    x : sequence
        Feature values of the sample to classify.

    Returns
    -------
    The best-scoring class label, or ``-1`` when ``result`` contains no class.
    Ties keep the class encountered first.
    """
    best_class = -1
    # Log-probabilities are unbounded below (many features easily push the sum
    # past -1000), so the starting score must be -inf rather than a finite
    # sentinel, otherwise no class would ever beat it.
    best_p = float("-inf")
    for cur_class in result:
        if cur_class == "total_data":
            # Bookkeeping entry, not a class.
            continue
        val = probability(result, x, cur_class)
        if val > best_p:
            best_p = val
            best_class = cur_class

    return best_class

In [5]:
def predict(result,x_test):
    """Classify every sample in ``x_test`` and return the labels as a list.

    ``x_test`` is iterated sample by sample; each one is passed to
    ``predictsingle`` together with the count dictionary ``result``.
    """
    return [predictsingle(result, sample) for sample in x_test]

In [6]:
from sklearn import datasets
# Load the iris dataset and pull out the feature matrix and the target vector.
iris = datasets.load_iris()
x, y = iris.data, iris.target

In [7]:
def make_labelled(x):
    """Bin a 1-D numeric array into the labels 0-3, in place.

    The cut points are 0.5, 1.0 and 1.5 times the mean of ``x`` (taken before
    anything is overwritten). An element gets the label of the first
    threshold it is strictly below, or 3 when it is below none of them.

    The labels are written back into ``x`` itself, so a column view passed in
    is modified in the parent array. The same object is returned.
    """
    mean_value = x.mean()
    low_cut = 0.5 * mean_value
    mid_cut = mean_value
    high_cut = 1.5 * mean_value

    # np.select picks the first matching condition, mirroring an if/elif chain.
    labels = np.select(
        [x < low_cut, x < mid_cut, x < high_cut],
        [0, 1, 2],
        default=3,
    )
    # Slice assignment keeps the in-place contract (and x's dtype).
    x[:] = labels
    return x

In [8]:
# Convert each continuous feature column into discrete labels (0-3)
# Bin a *copy* of every raw column and assemble a fresh matrix. Binning
# iris.data in place would destroy the original measurements, and a re-run of
# this cell would then bin the labels instead of the measurements.
columns = iris.data.shape[1]
x = np.column_stack(
    [make_labelled(iris.data[:, i].copy()) for i in range(columns)]
)

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [10]:
# Build the count dictionary from the training split.
result = fit(x_train, y_train)

In [11]:
# Classify the held-out rows with the hand-written Naive Bayes.
y_pred = predict(result, x_test)

In [12]:
from sklearn.metrics import classification_report,confusion_matrix

In [13]:
# Per-class precision / recall / F1, followed by the raw confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

   micro avg       0.97      0.97      0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [18]:
# For continuous data we use Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# NOTE(review): as far as this notebook shows, x_train / x_test still hold the
# binned labels produced by the earlier split, not raw measurements - confirm
# that fitting the Gaussian model on them is intended.
clf = GaussianNB().fit(x_train, y_train)  # fit() returns the estimator itself
y_pred = clf.predict(x_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        16
           2       1.00      1.00      1.00         9

   micro avg       1.00      1.00      1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

[[13  0  0]
 [ 0 16  0]
 [ 0  0  9]]
