In [1]:
from sklearn.naive_bayes import GaussianNB
from NaiveBayesModel import NaiveBayesClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics import f1_score, accuracy_score
from tqdm.auto import tqdm
from scipy.stats import shapiro

In [2]:
SEED = 42

In [3]:
def transform(selectors: dict):
    def get_encoded_val(val, selectors=selectors):
        assert val in selectors.keys(), '{} not in keys!'.format(val)
        return selectors[val]

    def transformation(data):
        new_data = list(map(get_encoded_val, data))
        return np.array(new_data)

    return transformation

In [4]:
data = load_iris()
X = data['data']
y = data['target']
encoding = data['target_names']

DECODING = {key: val for key, val in enumerate(encoding)}
ENCODING = {val: key for key, val in DECODING.items()}
encode_transform = transform(ENCODING) 
decode_transform = transform(DECODING)

y = decode_transform(y)
print(set(y))

{'virginica', 'setosa', 'versicolor'}


In [5]:
from sklearn.decomposition import PCA

new_X = np.divide(X, X.mean(axis=0))
pca = PCA(n_components=2)
new_X = pca.fit_transform(new_X)
new_X.shape

(150, 2)

In [6]:
new_X = X
new_data = np.concatenate([new_X, y.reshape(-1, 1)], axis=1)
columns = ['sep_len', 'sep_wid', 'pet_len', 'pet_wid', 'class']
pd_new_data = pd.DataFrame(new_data, columns=columns)

setosa = new_data[new_data[:, -1] == 'setosa']
setosa = pd.DataFrame(setosa, columns=columns)

versicolor = new_data[new_data[:, -1] == 'versicolor']
versicolor = pd.DataFrame(versicolor, columns=columns)

virginica = new_data[new_data[:, -1] == 'virginica']
virginica = pd.DataFrame(virginica, columns=columns)

# pd_new_data.to_csv('iris_pca_2.csv', sep= ' ', index=False)
# setosa.to_csv('setosa_pca_2.csv', sep= ' ', index=False)
# versicolor.to_csv('versicolor_pca_2.csv', sep= ' ', index=False)
# virginica.to_csv('virginica_pca_2.csv', sep= ' ', index=False)

In [7]:
def to_float(series: pd.Series) -> pd.Series:
    convert = lambda x: float(x)
    new_series = series.apply(convert)
    return new_series

def df_to_float(df: pd.DataFrame) -> pd.DataFrame:
    for column in df.columns:
        try:
            df[column] = to_float(df[column])
        except:
            pass
    return df

In [8]:
setosa = df_to_float(setosa)
virginica = df_to_float(virginica)
versicolor = df_to_float(versicolor)

In [9]:
print('P-val: setosa:')
print(shapiro(X[y == 'setosa', 0]).pvalue, shapiro(X[y == 'setosa', 1]).pvalue, shapiro(X[y == 'setosa', 2]).pvalue, shapiro(X[y == 'setosa', 3]).pvalue)
print('P-val: versicolor:')
print(shapiro(X[y == 'versicolor', 0]).pvalue, shapiro(X[y == 'versicolor', 1]).pvalue, shapiro(X[y == 'versicolor', 2]).pvalue, shapiro(X[y == 'versicolor', 3]).pvalue)
print('P-val: virginica:')
print(shapiro(X[y == 'virginica', 0]).pvalue, shapiro(X[y == 'virginica', 1]).pvalue, shapiro(X[y == 'virginica', 2]).pvalue, shapiro(X[y == 'virginica', 3]).pvalue)

P-val: setosa:
0.45951762795448303 0.27151283621788025 0.054810501635074615 8.658647061565716e-07
P-val: versicolor:
0.46474334597587585 0.3379890024662018 0.15847881138324738 0.027278577908873558
P-val: virginica:
0.2583250105381012 0.1808987259864807 0.10977514833211899 0.08695416152477264


In [10]:
train_sizes = np.arange(1e-1, 0.95, 0.025)
metrics = dict(size=[], acc=[], f1=[])

In [11]:
pbar = tqdm(train_sizes, desc='Train size')
for train_size in pbar:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=SEED, stratify=y)
    nb = NaiveBayesClassifier()
    encoding = nb.fit(X_train, y_train) 
    y_preds = nb.predict(X_test)

    acc = accuracy_score(y_test, y_preds)
    f1 = f1_score(y_test, y_preds, average='macro')

    metrics['size'].append(train_size)
    metrics['acc'].append(acc)
    metrics['f1'].append(f1)

Train size:   0%|          | 0/34 [00:00<?, ?it/s]

In [12]:
metrics = pd.DataFrame(metrics)
# metrics.to_csv('metrics_train.csv', sep=' ', index=False)

In [13]:
metrics

Unnamed: 0,size,acc,f1
0,0.1,0.955556,0.955556
1,0.125,0.962121,0.962116
2,0.15,0.945312,0.945729
3,0.175,0.935484,0.935734
4,0.2,0.950413,0.95061
5,0.225,0.965812,0.965812
6,0.25,0.938053,0.938586
7,0.275,0.944954,0.945195
8,0.3,0.933962,0.93422
9,0.325,0.95098,0.95097


In [14]:
train_size_1 = 0.225
train_size_2 = 0.825

In [15]:
def get_model(X_train: np.array, y_train: np.array) -> NaiveBayesClassifier:
    clf = NaiveBayesClassifier()
    clf.fit(X_train, y_train)
    return clf

def get_score(clf: NaiveBayesClassifier, score: sklearn.metrics, X_test: np.array, y_test: np.array, params: dict = dict()) -> float:
    y_preds = clf.predict(X_test)
    metrics = score(y_test, y_preds, **params)
    return metrics

In [16]:
params = dict(average='macro')

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=train_size_1, 
                                                    shuffle=True, 
                                                    random_state=SEED, 
                                                    stratify=y)
model_1 = get_model(X_train, y_train)

acc_1 = get_score(model_1, accuracy_score, X_test, y_test)
f1_1 = get_score(model_1, f1_score, X_test, y_test, params)

acc_1, f1_1

(0.9658119658119658, 0.9658119658119658)

In [18]:
model_1.mu, model_1.var

(array([[4.95454545, 3.36363636, 1.46363636, 0.21818182],
        [5.92727273, 2.7       , 4.24545455, 1.3       ],
        [6.70909091, 2.94545455, 5.78181818, 1.99090909]]),
 array([[0.07472727, 0.14654545, 0.02054545, 0.00963636],
        [0.37218182, 0.146     , 0.23272727, 0.032     ],
        [0.53490909, 0.16272727, 0.46163636, 0.08290909]]))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=train_size_2,
                                                    shuffle=True,
                                                    random_state=SEED,
                                                    stratify=y)
model_2 = get_model(X_train, y_train)

acc_2 = get_score(model_2, accuracy_score, X_test, y_test)
f1_2 = get_score(model_2, f1_score, X_test, y_test, params)

acc_2, f1_2

(0.9629629629629629, 0.9628482972136223)

In [None]:
model_1.mu, model_1.var

## Plot decision boundary

In [None]:
x1_min, x1_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
x2_min, x2_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1
h = 1e-3

xx, yy = np.meshgrid(np.arange(x1_min, x1_max, h),
                     np.arange(x2_min, x2_max, h))

xx.shape, yy.shape

In [None]:
params = dict(average='macro')

X_train, X_test, y_train, y_test = train_test_split(X[:, [0, 2]], y, 
                                                    train_size=train_size_1, 
                                                    shuffle=True, 
                                                    random_state=SEED, 
                                                    stratify=y)
model_1 = get_model(X_train, y_train)

acc_1 = get_score(model_1, accuracy_score, X_test, y_test)
f1_1 = get_score(model_1, f1_score, X_test, y_test, params)

acc_1, f1_1

In [None]:
fig, axis = plt.subplots(figsize=(18, 5))
Z = model_1.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
axis.contourf(xx, yy, Z)
axis.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.Paired)

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=train_size_2,
                                                    shuffle=True,
                                                    random_state=SEED,
                                                    stratify=y)
model_2 = get_model(X_train, y_train)

acc_2 = get_score(model_2, accuracy_score, X_test, y_test)
f1_2 = get_score(model_2, f1_score, X_test, y_test, params)