In [22]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris,load_breast_cancer,load_wine
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from scipy.stats import norm

In [13]:
# Load the wine dataset bundle and pull out its labels and feature names.
data = load_wine()
column_names = data['feature_names']
y = data['target']
# Wrap the raw feature array in a DataFrame so every column carries its feature name.
x = pd.DataFrame(data['data'], columns=column_names)

In [14]:
# NOTE: this import repeats the one in the notebook's first cell; it is kept so the
# cell can be run on its own.
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for validation; the fixed random_state makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=44,
)

In [26]:
# (rows, columns) of the full feature table.
x.shape

(178, 13)

In [23]:
# (rows, columns) of the training feature table.
x_train.shape

(119, 13)

In [27]:
# Number of training labels; should equal the row count of x_train.
y_train.shape

(119,)

In [24]:
# (rows, columns) of the validation feature table.
x_val.shape

(59, 13)

In [28]:
# Number of validation labels; should equal the row count of x_val.
y_val.shape

(59,)

### Sklearn Gaussian Implementation

In [21]:
# Baseline: scikit-learn's Gaussian Naive Bayes, fitted on the training split.
# fit() returns the estimator itself, so construction and training are chained.
clf = GaussianNB().fit(x_train, y_train)

# Predict a class label for every row of the held-out validation split.
y_pred = clf.predict(x_val)

# Share of validation rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9491525423728814


### From-Scratch Implementation

To implement the classifier from scratch, we group the training set by class and compute the mean and standard deviation of every feature within each class. We also compute each class's prior probability, which is simply the number of training samples in that class divided by the total number of training samples.

In [15]:
# Per-class Gaussian parameters: one row per class label, one column per feature.
# The GroupBy reductions are called directly rather than through .apply(np.mean) /
# .apply(np.std): NumPy forwards those calls to DataFrame.mean/std(axis=None), and
# newer pandas reduces over both axes there, collapsing each class to a single scalar.
means = x_train.groupby(y_train).mean()
# ddof=0 gives the population standard deviation, which is also np.std's default.
stds = x_train.groupby(y_train).std(ddof=0)

# Class priors P(c): share of training rows that belong to each class.
probs = x_train.groupby(y_train).size() / len(x_train)

In [18]:
# Display the class priors estimated from the training labels.
probs

0    0.319328
1    0.394958
2    0.285714
dtype: float64

In [16]:
# Display the per-class feature means.
means

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,13.738421,1.978158,2.441053,16.736842,107.631579,2.843421,3.009211,0.283684,1.926842,5.577632,1.07,3.14,1112.815789
1,12.227872,2.018085,2.206383,19.897872,94.510638,2.22766,2.062553,0.359149,1.652979,3.141915,1.045532,2.797447,512.744681
2,13.201765,3.329412,2.432941,21.176471,101.558824,1.643235,0.800882,0.434412,1.185,7.451176,0.683235,1.667059,651.470588


In [17]:
# Display the per-class feature standard deviations.
stds

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,0.47527,0.624984,0.252511,2.891951,10.466541,0.329579,0.4099,0.066428,0.413089,1.320963,0.105357,0.362992,241.097693
1,0.538469,1.088627,0.338786,3.188109,14.928015,0.563802,0.775438,0.131081,0.636489,1.015825,0.20654,0.520942,132.837361
2,0.496544,1.167509,0.190333,2.19586,11.285574,0.315667,0.305435,0.121054,0.396724,2.392096,0.111454,0.281921,113.659252


In [19]:
# Gaussian Naive Bayes decision rule, evaluated for every validation row at once:
#   score(c) = log P(c) + sum_j log N(x_j | mean[c, j], std[c, j])
# Scores are accumulated in log space because a product of many small densities can
# underflow to 0.0 for every class, which would make the argmax meaningless.
# NOTE: this rebinds y_pred, replacing the scikit-learn predictions stored under the
# same name earlier in the notebook.
class_labels = np.unique(y_train)

# Select columns by name so the features line up with the columns of means / stds.
val_features = x_val[means.columns].to_numpy()

# One column of scores per class; parameters are looked up by class label (.loc),
# not by position, so the labels do not have to be exactly 0..k-1.
log_scores = np.column_stack([
    np.log(probs.loc[label])
    + norm.logpdf(
        val_features,
        means.loc[label].to_numpy(),
        stds.loc[label].to_numpy(),
    ).sum(axis=1)
    for label in class_labels
])

# Translate the position of the best score back into the actual class label.
y_pred = class_labels[log_scores.argmax(axis=1)].tolist()

In [20]:
# Accuracy of the from-scratch predictions against the validation labels.
print('Accuracy:', accuracy_score(y_true=y_val, y_pred=y_pred))

Accuracy: 0.9491525423728814
