In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import metrics



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [3]:
df = pd.read_csv('D:/DS_Projects/Dataset_Bayes.csv')

In [4]:
df.head()

Unnamed: 0,Example No.,Color,Type,Origin,Stolen
0,1,Red,Sports,Domestic,Yes
1,2,Red,Sports,Domestic,No
2,3,Red,Sports,Domestic,Yes
3,4,Yellow,Sports,Domestic,No
4,5,Yellow,Sports,Imported,Yes


In [5]:
df['Color'].unique()

array(['Red', 'Yellow'], dtype=object)

In [6]:
df['Type'].unique()

array(['Sports', 'SUV'], dtype=object)

In [7]:
df['Origin'].unique()

array(['Domestic', 'Imported'], dtype=object)

In [8]:
df['Stolen'].unique()

array(['Yes', 'No'], dtype=object)

In [9]:
label_encoder = preprocessing.LabelEncoder()
df['Color']= label_encoder.fit_transform(df['Color'])
df['Type']= label_encoder.fit_transform(df['Type'])
df['Origin']= label_encoder.fit_transform(df['Origin'])
df['Stolen']= label_encoder.fit_transform(df['Stolen']) 

In [10]:
df.head()

Unnamed: 0,Example No.,Color,Type,Origin,Stolen
0,1,0,1,0,1
1,2,0,1,0,0
2,3,0,1,0,1
3,4,1,1,0,0
4,5,1,1,1,1


In [11]:
# We do not have any missing value as of now

In [12]:
# To use the bayes classifier

In [13]:
def calculate_prior(df, Y):
    classes = sorted(list(df[Y].unique()))
    prior = []
    for i in classes:
        prior.append(len(df[df[Y]==i])/len(df))
    return prior

In [14]:
def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label):
    feat = list(df.columns)
    df = df[df[Y]==label]
    mean, std = df[feat_name].mean(), df[feat_name].std()
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((feat_val-mean)**2 / (2 * std**2 )))
    return p_x_given_y

In [15]:
def naive_bayes_gaussian(df, X, Y):
    
    features = list(df.columns)[:-1]

    
    prior = calculate_prior(df, Y)

    Y_pred = []
   
    for x in X:
        
        labels = sorted(list(df[Y].unique()))
        likelihood = [1]*len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= calculate_likelihood_gaussian(df, features[i], x[i], Y, labels[j])

       
        post_prob = [1]*len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * prior[j]

        Y_pred.append(np.argmax(post_prob))

    return np.array(Y_pred) 

In [21]:
train, test = train_test_split(df, test_size=.2, random_state=41)

X_test = test.iloc[:,:-1].values
Y_test = test.iloc[:,-1].values
Y_pred = naive_bayes_gaussian(train, X=X_test, Y="Stolen")

  p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((feat_val-mean)**2 / (2 * std**2 )))
  p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((feat_val-mean)**2 / (2 * std**2 )))


In [22]:
from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred))

[[0 1]
 [0 1]]
0.6666666666666666
