In [22]:
import numpy as np

In [23]:
def fit(x_train, y_train):
    """Count the frequencies a categorical Naive Bayes classifier needs.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Categorical (already discretised) feature values, one row per sample.
    y_train : array-like of shape (n_samples,)
        Class label of each row of ``x_train``.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training rows.
        For every class label ``c``:

        * ``result[c]["total_count"]`` is the number of rows labelled ``c``;
        * ``result[c][j][v]`` is how many rows of class ``c`` have value ``v``
          in feature ``j``, where ``j`` is the 1-based column index.

        Every value that occurs anywhere in a feature's column gets an entry
        for every class (with count 0 if that class never shows it).

    Notes
    -----
    ``"total_data"`` and ``"total_count"`` share dictionary levels with class
    labels and feature indices respectively, so a class must not be labelled
    ``"total_data"``.
    """
    # Accept lists / other array-likes as well as numpy arrays.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    # Stored once, before the class loop, so it is present even with no rows.
    result = {"total_data": len(y_train)}

    num_features = x_train.shape[1]
    # The distinct values of a feature do not depend on the class: compute once.
    values_per_feature = [set(x_train[:, col]) for col in range(num_features)]

    for current_class in set(y_train):
        class_mask = (y_train == current_class)
        x_train_current = x_train[class_mask]  # rows belonging to this class

        class_counts = {"total_count": int(np.count_nonzero(class_mask))}
        for j in range(1, num_features + 1):
            column = x_train_current[:, j - 1]
            class_counts[j] = {
                value: (column == value).sum()
                for value in values_per_feature[j - 1]
            }
        result[current_class] = class_counts

    return result

In [24]:
# We work with log-probabilities: multiplying many small probabilities directly can produce values too tiny to compare reliably.
def probability(dictonary, x, current_class):
    """Return the unnormalised log-posterior of ``current_class`` for one sample.

    Computes ``log P(class) + sum_j log P(x_j | class)`` from the counts
    stored in ``dictonary``, using Laplace (add-one) smoothing for the
    per-feature terms.

    Parameters
    ----------
    dictonary : dict
        Count structure with ``dictonary["total_data"]``,
        ``dictonary[c]["total_count"]`` and ``dictonary[c][j][value]``
        (``j`` is the 1-based feature index).
    x : sequence
        Feature values of the sample; ``x[j - 1]`` is the value of feature ``j``.
    current_class : hashable
        Class label to score; must be a key of ``dictonary``.

    Returns
    -------
    float
        Log-score of the class. Only meaningful relative to the scores of
        the other classes for the same ``x``.
    """
    class_counts = dictonary[current_class]
    class_total = class_counts["total_count"]

    # Class prior: log P(y = current_class).
    output = np.log(class_total) - np.log(dictonary["total_data"])

    # Every key except "total_count" is a feature index.
    num_features = len(class_counts) - 1
    for j in range(1, num_features + 1):
        value_counts = class_counts[j]
        xj = x[j - 1]
        # A value never seen during training counts as 0 instead of raising
        # KeyError; the +1 is the Laplace correction.
        count_current_class_with_value_xj = value_counts.get(xj, 0) + 1
        # Laplace denominator: class size + number of distinct values of feature j.
        count_current_class = class_total + len(value_counts)
        current_xj_probability = (
            np.log(count_current_class_with_value_xj) - np.log(count_current_class)
        )
        output = output + current_xj_probability
    return output

In [25]:
def predictSinglePoint(dictonary, x):
    """Return the class with the highest log-score for the sample ``x``.

    Every key of ``dictonary`` except ``"total_data"`` is treated as a class
    label and scored with ``probability``. On a tie the class encountered
    first is kept. Returns ``-1`` when the dictionary holds no classes.
    """
    # All keys are class labels apart from the bookkeeping entry.
    candidates = [
        label for label in dictonary.keys() if not (label == "total_data")
    ]
    if not candidates:
        return -1

    # Seed with the first class, then replace only on a strictly better score.
    best_class = candidates[0]
    best_p = probability(dictonary, x, best_class)
    for label in candidates[1:]:
        score = probability(dictonary, x, label)
        if score > best_p:
            best_class, best_p = label, score
    return best_class

In [26]:
def predict(dictonary, x_test):
    """Classify every row of ``x_test``.

    Returns a list with one predicted class label per row, in row order,
    each obtained from ``predictSinglePoint``.
    """
    return [predictSinglePoint(dictonary, row) for row in x_test]

In [27]:
# The model above only handles categorical features, so the continuous iris measurements must be discretised first.
# Each column is binned into four labels using cut-offs derived from that column's mean.
def makeLabelled(column):
    """Bin a numeric column into the labels 0-3 relative to its mean.

    With ``m = column.mean()`` the labels are:

    * 0 for values below ``0.5 * m``
    * 1 for values below ``m``
    * 2 for values below ``1.5 * m``
    * 3 otherwise

    The labels are written back into ``column`` itself (so a view of a larger
    array is modified in place) and the same object is returned.
    """
    centre = column.mean()
    low_cut = 0.5 * centre
    high_cut = 1.5*centre

    # Decide the labels from a snapshot of the original values, then overwrite.
    values = np.array(column)
    labels = np.select(
        [values < low_cut, values < centre, values < high_cut],
        [0, 1, 2],
        default=3,
    )
    column[:] = labels
    return column


In [28]:
from sklearn import datasets
# Load the iris dataset: X is taken from its `data` field and Y from its `target` field.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [29]:
# Discretise every feature column into the labels 0-3 with makeLabelled.
# X is rebuilt from copies of iris.data, so the loaded measurements are left
# untouched and re-running this cell always starts from the original values
# instead of re-binning labels that were already binned.
X = np.column_stack(
    [makeLabelled(iris.data[:, i].copy()) for i in range(iris.data.shape[-1])]
)

In [30]:
from sklearn import model_selection
# Hold out 25% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,test_size=0.25,random_state=0)

In [31]:
# Build the count dictionary from the training split, then classify the held-out rows with it.
dictionary = fit(X_train,Y_train)
Y_pred = predict(dictionary,X_test)

In [32]:
from sklearn.metrics import classification_report, confusion_matrix
# Compare the predictions with the true test labels: per-class metrics first, then the confusion matrix.
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [33]:
# Baseline for comparison: scikit-learn's Gaussian Naive Bayes on the same train/test split.
# NOTE(review): X_train / X_test hold the 0-3 bin labels produced earlier in this
# notebook, not the raw measurements - confirm that is the intended input here.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB().fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print(classification_report(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.76      1.00      0.86        16
           2       1.00      0.67      0.80         9

    accuracy                           0.87        38
   macro avg       0.92      0.84      0.86        38
weighted avg       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]
