In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at the path given below.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from tabulate import tabulate
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import ttest_ind

In [3]:
def get_data(filename):
  """Load a comma-separated integer dataset and drop incomplete samples.

  The file is parsed with ``np.genfromtxt`` as ``int32``. Every row that holds
  the value -1 in at least one column is discarded, because -1 marks a missing
  feature value in the dataset.

  Args:
    filename: Path of the comma-delimited text file to read.

  Returns:
    2-D ``int32`` array containing only the rows without any -1 entry.
  """
  samples = np.genfromtxt(filename, delimiter=',', dtype='int32')
  has_missing_value = np.any(samples == -1, axis=1)
  return samples[~has_missing_value]

In [4]:
class GaussianNaiveBayesClassifier(BaseEstimator, ClassifierMixin):
  """Gaussian Naive Bayes classifier following the scikit-learn estimator API.

  Each feature is modelled, independently per class, by a normal distribution
  whose mean and standard deviation are estimated from the training data.
  Prediction picks the class with the highest log posterior (MAP rule).

  Args:
    var_smoothing: Fraction of the largest feature variance that is added to
      every per-class variance. This keeps the standard deviation strictly
      positive, so constant features no longer cause division by zero or
      ``log(0)``. Pass 0 to disable smoothing.

  Attributes set by ``fit``:
    labels / classes_: Sorted unique class labels.
    counts: Number of training samples per class.
    prior_probs: dict mapping label -> prior probability.
    gauss_params: dict mapping label -> list of ``(mean, std)`` tuples, one
      tuple per feature.
  """

  def __init__(self, var_smoothing=1e-9):
    # Stored verbatim so BaseEstimator.get_params()/clone() can round-trip it.
    self.var_smoothing = var_smoothing

  def calculate_prior(self, y_train):
    """Estimate class prior probabilities from the label vector."""
    y_train = np.asarray(y_train).ravel()
    self.labels, self.counts = np.unique(y_train, return_counts=True)
    probabilities = self.counts / np.sum(self.counts)
    self.prior_probs = dict(zip(self.labels, probabilities))

  def calculate_gaussian_parameters(self, X_train, y_train):
    """Estimate the per-class, per-feature mean and standard deviation.

    Requires ``calculate_prior`` to have been called first, because it reads
    ``self.labels``.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train).ravel()

    # Variance floor proportional to the largest feature variance. When every
    # feature is constant the scale falls back to 1 so the floor stays positive.
    feature_variances = np.var(X_train, axis=0)
    max_variance = feature_variances.max() if feature_variances.size else 0.0
    epsilon = self.var_smoothing * (max_variance if max_variance > 0 else 1.0)

    self.gauss_params = {}
    for label in self.labels:
      # The class mask is computed once per class, not once per feature.
      class_samples = X_train[y_train == label]
      means = np.mean(class_samples, axis=0)
      stds = np.sqrt(np.var(class_samples, axis=0) + epsilon)
      self.gauss_params[label] = list(zip(means, stds))

  def fit(self, X_train, y_train):
    """Fit the model to the training data.

    Args:
      X_train: Array-like of shape (n_samples, n_features).
      y_train: Array-like of n_samples class labels.

    Returns:
      self, as required by the scikit-learn estimator contract, so that
      ``clf.fit(X, y).predict(X)`` and pipelines work.
    """
    self.calculate_prior(y_train)
    self.calculate_gaussian_parameters(X_train, y_train)
    self.classes_ = self.labels
    return self

  def predict(self, X_test):
    """Predict the most probable class for every sample.

    Args:
      X_test: Array-like of shape (n_samples, n_features).

    Returns:
      Array of n_samples predicted labels with the same dtype as the labels
      seen during ``fit``. The result is also stored in
      ``self.predicted_labels``.
    """
    X_test = np.asarray(X_test, dtype=float)
    num_of_features = X_test.shape[1]

    # Rebuild (n_classes, n_features) parameter matrices from gauss_params.
    means = np.array(
        [[mean for mean, _ in self.gauss_params[label]] for label in self.labels],
        dtype=float)
    stds = np.array(
        [[std for _, std in self.gauss_params[label]] for label in self.labels],
        dtype=float)
    log_priors = np.log([self.prior_probs[label] for label in self.labels])

    # log of sqrt((2*pi)^d), written so that it cannot overflow for large d.
    log_norm = 0.5 * num_of_features * np.log(2 * np.pi)

    # Broadcast to (n_samples, n_classes, n_features) and sum over features.
    z = (X_test[:, np.newaxis, :] - means) / stds
    log_likelihood = -(0.5 * z ** 2 + np.log(stds)).sum(axis=2)
    joint_log = log_priors - log_norm + log_likelihood

    # A NaN score must never win; with all scores equal argmax returns the
    # first class, which matches the strict ">" comparison used previously.
    joint_log = np.where(np.isnan(joint_log), -np.inf, joint_log)

    self.predicted_labels = self.labels[np.argmax(joint_log, axis=1)]
    return self.predicted_labels

In [5]:
# Google Drive folder that holds the dataset and receives the saved score arrays.
DRIVE_DIR = "/content/drive/My Drive/Colab Notebooks"


def evaluate_classifiers(classifiers, X, y, k, splitter):
  """Cross-validate several classifiers on the k best chi2 features.

  The feature selector is fitted on the training part of every fold only, so
  the test part never influences which features are selected.

  Args:
    classifiers: dict mapping a name to an unfitted estimator; each estimator
      is cloned before fitting.
    X: Feature matrix of shape (n_samples, n_features); must be non-negative
      because chi2 is used as the scoring function.
    y: Label vector of length n_samples.
    k: Number of features kept by SelectKBest.
    splitter: Cross-validation splitter providing ``split`` and ``get_n_splits``.

  Returns:
    Array of shape (len(classifiers), n_folds) with the accuracy of every
    classifier (rows, in dict order) on every fold (columns).
  """
  fold_scores = np.zeros((len(classifiers), splitter.get_n_splits()))
  for fold_id, (train_index, test_index) in enumerate(splitter.split(X, y)):
    y_train, y_test = y[train_index], y[test_index]
    # k is keyword-only in current scikit-learn releases.
    selector = SelectKBest(chi2, k=k).fit(X[train_index], y_train)
    X_train = selector.transform(X[train_index])
    X_test = selector.transform(X[test_index])
    for clf_id, estimator in enumerate(classifiers.values()):
      clf = clone(estimator)
      clf.fit(X_train, y_train)
      y_pred = clf.predict(X_test)
      fold_scores[clf_id, fold_id] = accuracy_score(y_test, y_pred)
  return fold_scores


data = get_data(DRIVE_DIR + "/breast-cancer-wisconsin.txt")
X = data[:, 1:10] # feature matrix, without the id and class columns
y = data[:, 10]   # class vector

# chi2 statistics for the whole dataset
chi2vals, pvals = chi2(X, y)

# comparison of the hand-written classifier with the library implementation
clfsGNB = {
    'GNBC': GaussianNaiveBayesClassifier(),
    'GNB_lib': GaussianNB()
}
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=42)
scores = evaluate_classifiers(clfsGNB, X, y, k=5, splitter=rskf)
np.save(DRIVE_DIR + "/GNBcomp", scores)

# dictionary of classifiers used in the statistical tests
clfs = {
    'GNBC': GaussianNaiveBayesClassifier(),
    'kNN': KNeighborsClassifier(),
    'CART': DecisionTreeClassifier(random_state=42),
}

# experiments
n_features = X.shape[1] # number of features
n_splits = 2 # number of folds
n_repeats = 5 # number of repetitions
rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=2147483647)
scores3D = np.zeros((n_features, len(clfs), n_splits*n_repeats))

# scores3D[k-1] holds the results obtained with the k best features
for k in range(1, n_features + 1):
  scores3D[k-1] = evaluate_classifiers(clfs, X, y, k=k, splitter=rskf)
np.save(DRIVE_DIR + "/CLFScomp", scores3D)

In [6]:
# One column header per cross-validation partition: P1 ... Pn.
header_partitions = ["P" + str(i + 1) for i in range(scores.shape[1])]
bayes_clfs = np.array([["Own Bayes"], ["Library Bayes"]])
# Rows are built as plain lists so the scores stay floats; concatenating them
# with the label array would convert every score to a string.
bayes_comp_rows = [[name] + row
                   for name, row in zip(bayes_clfs[:, 0].tolist(), scores.tolist())]
bayes_comp_table = tabulate(bayes_comp_rows, header_partitions, floatfmt=".3f")
# sep="" keeps print from inserting a space before the table, which used to
# shift the header row one column to the right.
print("Porównanie dokładności własnego i bibliotecznego GNB dla 10 podziałów zestawu danych przy 5 cechach\n", bayes_comp_table, sep="")

Porównanie dokładności własnego i bibliotecznego GNB dla 10 podziałów zestawu danych przy 5 cechach
                   P1     P2     P3     P4     P5     P6     P7     P8     P9    P10
-------------  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----
Own Bayes      0.962  0.956  0.953  0.959  0.965  0.956  0.956  0.965  0.959  0.962
Library Bayes  0.962  0.956  0.953  0.959  0.965  0.956  0.956  0.965  0.959  0.962


In [7]:
# Mean and standard deviation over folds, transposed to (classifier, n_features).
scores_means = np.transpose(np.mean(scores3D, axis=2))
scores_stds = np.transpose(np.std(scores3D, axis=2))
# One column per number of selected features: "1" ... "n".
header_features = [str(k) for k in range(1, scores3D.shape[0] + 1)]
cols_clfs = np.array([["Bayes"], ["kNN"], ["CART"]])
clf_row_names = cols_clfs[:, 0].tolist()
# Rows are built as plain lists so the numbers stay floats; concatenating them
# with the label array would convert every value to a string.
scores_means_table = tabulate(
    [[name] + row for name, row in zip(clf_row_names, scores_means.tolist())],
    header_features, floatfmt=".3f")
scores_stds_table = tabulate(
    [[name] + row for name, row in zip(clf_row_names, scores_stds.tolist())],
    header_features, floatfmt=".3f")
# sep="" keeps print from inserting spaces around the tables, which used to
# misalign the header rows.
print("Zestawienie wyników uzyskanych przez klasyfikatory w zależności od liczby cech\n\n",
      "Średnia jakości klasyfikacji:\n", scores_means_table,
      "\n\nOdchylenie standardowe jakości klasyfikacji:\n", scores_stds_table,
      sep="")

Zestawienie wyników uzyskanych przez klasyfikatory w zależności od liczby cech

 Średnia jakości klasyfikacji:
            1      2      3      4      5      6      7      8      9
-----  -----  -----  -----  -----  -----  -----  -----  -----  -----
Bayes  0.907  0.960  0.963  0.959  0.958  0.959  0.964  0.962  0.960
kNN    0.900  0.956  0.957  0.963  0.960  0.957  0.966  0.966  0.967
CART   0.909  0.947  0.944  0.947  0.943  0.942  0.946  0.949  0.945 

Odchylenie standardowe jakości klasyfikacji:
            1      2      3      4      5      6      7      8      9
-----  -----  -----  -----  -----  -----  -----  -----  -----  -----
Bayes  0.015  0.009  0.009  0.005  0.006  0.006  0.005  0.006  0.005
kNN    0.012  0.010  0.011  0.009  0.010  0.009  0.009  0.007  0.008
CART   0.011  0.011  0.013  0.008  0.010  0.010  0.014  0.012  0.013


In [8]:
# The original analysis picked 7 features as giving the best classification
# accuracy; take the fold scores of that configuration (row index k - 1).
best_k = 7
scoresTTest = scores3D[best_k - 1]
print(scoresTTest)

[[0.9619883  0.96774194 0.95614035 0.96774194 0.97076023 0.96187683
  0.96783626 0.96480938 0.95614035 0.96774194]
 [0.97076023 0.95601173 0.96491228 0.97360704 0.97076023 0.97067449
  0.97368421 0.95307918 0.9502924  0.97360704]
 [0.9502924  0.95307918 0.93859649 0.94428152 0.96491228 0.95601173
  0.94736842 0.93548387 0.9122807  0.95601173]]


In [19]:
alpha = .05
n_clfs = len(clfs)
t_statistic = np.zeros((n_clfs, n_clfs))
p_value = np.zeros((n_clfs, n_clfs))

# Pairwise t-test between the fold scores of every pair of classifiers.
# NOTE(review): all classifiers are scored on the same folds, so a paired test
# (scipy.stats.ttest_rel) may be more appropriate - confirm with the experiment design.
for i in range(n_clfs):
  for j in range(n_clfs):
    t_statistic[i, j], p_value[i, j] = ttest_ind(scoresTTest[i], scoresTTest[j])

header_clfs = ["Bayes", "kNN", "CART"]
col_clfs = np.array([["Bayes"], ["kNN"], ["CART"]])


def labelled_table(matrix, **tabulate_kwargs):
  """Render a classifier-by-classifier matrix as text with row and column labels.

  Rows are passed to tabulate as plain lists so the values stay numeric.
  """
  rows = [[name] + row for name, row in zip(col_clfs[:, 0].tolist(), matrix.tolist())]
  return tabulate(rows, header_clfs, **tabulate_kwargs)


# t-statistic and p-values
# sep="" keeps print from inserting a space before each table, which used to
# shift the header row one column to the right.
t_statistic_table = labelled_table(t_statistic, floatfmt=".3f")
p_value_table = labelled_table(p_value, floatfmt=".3f")
print("t-statistic:\n", t_statistic_table, "\n\np-value:\n", p_value_table, sep="")

# advantage matrix - which classifier of each pair achieved the better accuracy
advantage = np.zeros((n_clfs, n_clfs))
advantage[t_statistic > 0] = 1
advantage_table = labelled_table(advantage)
print("\nAdvantage:\n", advantage_table, sep="")

# statistically significant differences - significance matrix
# H0 - there is no statistically significant difference between the classifiers
# p-value <= alpha - H0 is rejected
significance = np.zeros((n_clfs, n_clfs))
significance[p_value <= alpha] = 1
significance_table = labelled_table(significance)
print("\nStatistical significance (alpha = 0.05):\n", significance_table, sep="")

# classifiers that are statistically significantly better
# Row labels come from col_clfs defined in this cell; the earlier code read
# cols_clfs, a variable leaked from a different cell.
statistically_better = advantage * significance
statistically_better_table = labelled_table(statistically_better)
print("\nStatistically significantly better:\n", statistically_better_table, sep="")

t-statistic:
          Bayes     kNN    CART
-----  -------  ------  ------
Bayes    0.000  -0.440   3.758
kNN      0.440   0.000   3.642
CART    -3.758  -3.642   0.000 

p-value:
          Bayes    kNN    CART
-----  -------  -----  ------
Bayes    1.000  0.665   0.001
kNN      0.665  1.000   0.002
CART     0.001  0.002   1.000

Advantage:
          Bayes    kNN    CART
-----  -------  -----  ------
Bayes        0      0       1
kNN          1      0       1
CART         0      0       0

Statistical significance (alpha = 0.05):
          Bayes    kNN    CART
-----  -------  -----  ------
Bayes        0      0       1
kNN          0      0       1
CART         1      1       0

Statistically significantly better:
          Bayes    kNN    CART
-----  -------  -----  ------
Bayes        0      0       1
kNN          0      0       1
CART         0      0       0
