In [1]:
import pandas as pd
from sklearn.datasets import load_wine

# Load the wine dataset via sklearn.datasets.load_wine; the returned object
# exposes the feature matrix as `data` and the class labels as `target`.
wine = load_wine()

In [2]:
# List the attributes available on the loaded dataset object.
dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [6]:
# Names of the classes that the integer labels in `target` refer to.
wine['target_names']

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [7]:
# Tabulate the feature matrix, one named column per feature.
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [8]:
# Append the class label as a 'target' column next to the features.
df = df.assign(target=wine.target)
df.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Features: every column except the label. Labels: the 'target' column.
x = df.drop(columns='target')
y = df['target']

In [11]:
# Preview the first five feature rows.
x.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [12]:
# Preview the first five labels.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32

In [13]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [14]:
# Number of rows in the training portion of the split.
x_train.shape[0]

142

In [15]:
# Number of rows in the test portion of the split.
x_test.shape[0]

36

In [16]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [17]:
# One estimator per Naive Bayes variant being compared, both with defaults.
nbg, nbm = GaussianNB(), MultinomialNB()

In [18]:
# Train both classifiers on the same training portion of the split.
nbg.fit(X=x_train, y=y_train)
nbm.fit(X=x_train, y=y_train)

MultinomialNB()

In [19]:
# Score of the Gaussian model on the held-out test rows.
nbg.score(X=x_test, y=y_test)

0.9444444444444444

In [20]:
# Score of the Multinomial model on the same held-out test rows.
nbm.score(X=x_test, y=y_test)

0.7222222222222222

In [21]:
# On this single hold-out split, Gaussian Naive Bayes scores higher than Multinomial Naive Bayes

In [22]:
# Now let's dig deeper by using k-fold cross-validation

In [37]:
from sklearn.model_selection import StratifiedKFold
# Splitter used by the manual cross-validation loop: 10 stratified folds.
skm = StratifiedKFold(n_splits=10)

In [39]:
# Manual stratified k-fold cross-validation of both Naive Bayes variants.
# score_g / score_m collect one test-fold score per split for GaussianNB and
# MultinomialNB respectively.
score_g = []
score_m = []

for train_i, test_i in skm.split(wine.data, wine.target):
    # Fold-local names: reusing x_train / x_test / y_train / y_test here would
    # overwrite the hold-out split created earlier with train_test_split.
    fold_x_train, fold_x_test = wine.data[train_i], wine.data[test_i]
    fold_y_train, fold_y_test = wine.target[train_i], wine.target[test_i]

    # Fit a fresh estimator per fold so the already-fitted nbg / nbm are left
    # untouched and the earlier scoring cells stay re-runnable.
    fold_nbg = GaussianNB().fit(fold_x_train, fold_y_train)
    fold_nbm = MultinomialNB().fit(fold_x_train, fold_y_train)

    score_g.append(fold_nbg.score(fold_x_test, fold_y_test))
    score_m.append(fold_nbm.score(fold_x_test, fold_y_test))

In [42]:
# Per-fold scores collected for GaussianNB by the manual cross-validation loop.
score_g

[0.9444444444444444,
 1.0,
 1.0,
 0.9444444444444444,
 0.9444444444444444,
 1.0,
 1.0,
 0.9444444444444444,
 1.0,
 1.0]

In [43]:
# Per-fold scores collected for MultinomialNB by the manual cross-validation loop.
score_m

[0.7222222222222222,
 0.8888888888888888,
 0.7777777777777778,
 0.7777777777777778,
 0.6666666666666666,
 0.9444444444444444,
 0.8333333333333334,
 0.9444444444444444,
 1.0,
 0.9411764705882353]

In [44]:
# Mean GaussianNB score across the folds.
sum(score_g) / len(score_g)

0.9777777777777779

In [45]:
# Mean MultinomialNB score across the folds.
sum(score_m) / len(score_m)

0.8496732026143791

In [46]:
from sklearn.model_selection import cross_val_score

In [48]:
# Same 10-fold evaluation of GaussianNB, delegated to cross_val_score.
g = cross_val_score(estimator=GaussianNB(), X=x, y=y, cv=10)

In [49]:
# Same 10-fold evaluation of MultinomialNB, delegated to cross_val_score.
m = cross_val_score(estimator=MultinomialNB(), X=x, y=y, cv=10)

In [50]:
# Per-fold scores returned by cross_val_score for GaussianNB.
g

array([0.94444444, 1.        , 1.        , 0.94444444, 0.94444444,
       1.        , 1.        , 0.94444444, 1.        , 1.        ])

In [51]:
# Per-fold scores returned by cross_val_score for MultinomialNB.
m

array([0.72222222, 0.88888889, 0.77777778, 0.77777778, 0.66666667,
       0.94444444, 0.83333333, 0.94444444, 1.        , 0.94117647])

In [52]:
# Mean GaussianNB score over the cross_val_score folds.
sum(g) / len(g)

0.9777777777777779

In [53]:
# Mean MultinomialNB score over the cross_val_score folds.
sum(m) / len(m)

0.8496732026143791

In [None]:
# Hence, k-fold cross-validation confirms that Gaussian Naive Bayes is considerably more accurate than Multinomial Naive Bayes on the wine dataset (mean accuracy of about 0.98 vs. 0.85)