# Naive Bayes Implementation

In [1]:
import numpy as np

In [2]:
def fit(X_train , Y_train):
    
    # we will make a dictionary as decided for naive bayes(for storing count for probabilities)
    answer = {}
    
    # classes will contain all possible classes in our Y_train
    classes = set(Y_train)
    for class_target in classes:
        # class_target is the class for which we are storing the data first
        
        # we will need total number of data points to find .. 
        # ..probability so we will store it in our dictionary 
        answer["total_data"] = len(Y_train)
        
        # Every class will have a dictionary associated with it
        answer[class_target] = {}
        
        number_of_features = X_train.shape[1]
        
        # class_target_val will be a numpy of true/false corresponding to the ..
        # .. data points in Y_train having the predicted class as class_target
        class_target_val = (Y_train == class_target)
        
        # below two numpy arrays will fetch the rows for which we have true in ..
        # .. class_target_val
        X_train_val = X_train[class_target_val]
        Y_train_val = Y_train[class_target_val]
        
        #we will store total count of the data points referring to target class ..
        # .. for probability calculation
        answer[class_target]["total_count"] = len(Y_train_val)
        
        # Now we will iterate in the features
        for j in range(number_of_features):
            answer[class_target][j] = {}
            possible_values = set(X_train[:,j-1])
            for curr in possible_values:
                # the value below gives us the count of training data where the .. 
                # .. class is class_target and the value of Jth feature value is curr 
                answer[class_target][j][curr] = (X_train_val[:,j-1] == curr).sum()
    return answer
                
    

In [3]:
def prob(x , class_target , data_dict):
    class_probability = np.log(data_dict[class_target]["total_count"]) - np.log(data_dict["total_data"])
    feature_probability = 0
    number_of_features = len(data_dict[class_target].keys()) - 1
    for j in range(1,number_of_features):
        xx = x[j-1]
        count = data_dict[class_target][j][xx] + 1
        different_values_feature_takes = len(data_dict[class_target][j].keys())
        count_class = data_dict[class_target]["total_count"] + different_values_feature_takes
        probab = np.log(count) - np.log(count_class)
        feature_probability += probab 
    return class_probability + feature_probability

In [4]:
def predictSingleDataPoint(x , data_dict):
    classes = data_dict.keys()
    flag = True
    probability_predicted = -10000
    class_predicted = -1
    for class_target in classes:
        if(class_target == "total_data"):
            continue;
        # we need to find some probabilities here
        prob_current_class = prob(x , class_target , data_dict)
        if(prob_current_class > probability_predicted or flag):
            probability_predicted = prob_current_class
            class_predicted = class_target
        flag = False
    return class_predicted

In [5]:
def predict(X_test , data_dict):
    Y_pred = []
    for x in X_test:
        # class_X will contain predicted value for data point x
        class_x = predictSingleDataPoint(x , data_dict)
        Y_pred.append(class_x)
    return Y_pred
        

In [6]:
def makeDataLabelled(column):
    # We will
    mid_limit = column.mean()
    first_limit = mid_limit/2
    second_limit = mid_limit*1.5
    l = len(column)
    for i in range(0,l):
        if(column[i] < first_limit):
            column[i] = 0
        elif(column[i] < mid_limit):
            column[i] = 1
        elif(column[i] < second_limit):
            column[i] = 2
        else:
            column[i] = 3
    return column

In [7]:
from sklearn import datasets
from sklearn import model_selection
iris = datasets.load_iris()
X = iris.data
Y = iris.target

for i in range(0 , X.shape[-1]):
    X[:,i] = makeDataLabelled(X[:,i])
    
X_train , X_test , Y_train , Y_test = model_selection.train_test_split(X , Y , test_size = 0.25 , random_state = 0)
dic = fit(X_train , Y_train)
Y_pred = predict(X_test , dic)
from sklearn.metrics import classification_report , confusion_matrix
print(classification_report(Y_test , Y_pred))
print(confusion_matrix(Y_test , Y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.71      0.31      0.43        16
           2       0.39      0.78      0.52         9

    accuracy                           0.66        38
   macro avg       0.70      0.70      0.65        38
weighted avg       0.73      0.66      0.65        38

[[13  0  0]
 [ 0  5 11]
 [ 0  2  7]]


In [12]:
from sklearn.naive_bayes import MultinomialNB
gnb = MultinomialNB()
gnb.fit(X_train,Y_train)

MultinomialNB()

In [13]:
gnb.score(X_test , Y_test)

0.5263157894736842