In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', 
                  header=None, names=['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type'])
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [3]:
df.shape

(214, 10)

In [4]:
df.isnull().sum()

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

In [5]:
x = df.drop(columns=['Type'])
y = df['Type']

In [6]:
y.value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=4)

In [8]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [9]:
y_pred = gnb.predict(x_test)

In [10]:
from sklearn.metrics import confusion_matrix  
cm = confusion_matrix(y_test, y_pred)  
cm

array([[17,  2,  1,  0,  0,  0],
       [18,  4,  0,  3,  0,  1],
       [ 1,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  2,  0,  1],
       [ 0,  0,  0,  0,  2,  1],
       [ 0,  0,  0,  0,  0, 11]], dtype=int64)

In [11]:
from sklearn.metrics import accuracy_score  
ac = accuracy_score(y_test, y_pred)  
ac

0.5538461538461539

__Apply RandomOverSampler to balance data__

In [12]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
x_os, y_os = ros.fit_resample(x, y)

In [13]:
y_os.value_counts()

1    76
2    76
3    76
5    76
6    76
7    76
Name: Type, dtype: int64

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x_os, y_os, test_size=0.25, random_state=10)

In [15]:
gnb1 = GaussianNB()
gnb1.fit(x_train, y_train)

In [16]:
y_pred = gnb1.predict(x_test)

In [17]:
from sklearn.metrics import confusion_matrix  
cm = confusion_matrix(y_test, y_pred)  
cm

array([[11,  1,  4,  0,  0,  0],
       [ 8,  5,  6,  2,  1,  1],
       [ 3,  2, 17,  0,  1,  0],
       [ 0,  7,  0, 14,  0,  1],
       [ 0,  0,  0,  0, 12,  0],
       [ 3,  0,  0,  0,  0, 15]], dtype=int64)

In [18]:
from sklearn.metrics import accuracy_score  
ac = accuracy_score(y_test, y_pred)  
ac

0.6491228070175439

__Iris Dataset__

In [19]:
from sklearn.datasets import load_iris
df2 = load_iris(as_frame=True)
df2.frame.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [20]:
X = df2.data
y = df2.target

In [21]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [22]:
print(len(x_train))
print(len(x_test))

112
38


In [23]:
gnb2 = GaussianNB()
gnb2.fit(x_train,y_train)

In [24]:
y_pred = gnb2.predict(x_test)

In [25]:
ac = accuracy_score(y_test, y_pred)  
ac

0.8947368421052632