In [1]:
# Mount Google Drive at /content/drive; the dataset loaded later in this
# notebook is read from a path under that mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [13]:
def get_data(filename):
  """Load a comma-separated integer dataset and drop incomplete samples.

  Parameters
  ----------
  filename : str, path-like or file-like
    Anything accepted by ``np.genfromtxt``.

  Returns
  -------
  numpy.ndarray
    2-D ``int32`` array containing only the rows in which no column equals
    -1 (the value that marks a missing feature in the dataset).

  Notes
  -----
  A file holding a single row is returned as a ``(1, n_columns)`` array
  instead of failing on the row-wise mask. A multi-row file with a single
  column cannot be told apart from that case and is treated as one row.
  """
  samples = np.genfromtxt(filename, dtype='int32', delimiter=',')
  if samples.size == 0:
    # Nothing was parsed: return an empty table rather than a bogus row.
    return samples.reshape(0, 0)
  # genfromtxt collapses a one-line file to a 1-D array; restore the row axis
  # so that the per-row mask below is always well defined.
  samples = np.atleast_2d(samples)
  # drop samples containing -1 (a feature value missing in the dataset)
  complete_rows = (samples != -1).all(axis=1)
  return samples[complete_rows, :]

In [17]:
class NaiveBayesClassifier():
  """Gaussian naive Bayes classifier.

  Every feature is modelled, independently per class, by a normal
  distribution; prediction picks the class with the highest log-posterior
  (MAP rule).

  Parameters
  ----------
  var_smoothing : float, default 1e-9
    Fraction of the largest per-feature variance of the training data that
    is added to every class/feature variance. It keeps the standard
    deviation strictly positive, so a feature that is constant within a
    class no longer produces a division by zero (and NaN scores) in
    ``predict``.

  Attributes set by ``fit``
  -------------------------
  labels : ndarray        sorted unique class labels
  counts : ndarray        number of training samples per label
  prior_probs : dict      label -> prior probability
  gauss_params : dict     label -> list of (mean, std) tuples, one per feature
  predicted_labels : ndarray   result of the most recent ``predict`` call
  """

  def __init__(self, var_smoothing=1e-9):
    self.var_smoothing = var_smoothing

  # compute the prior probabilities of the classes
  def calculate_prior(self, y_train):
    """Estimate class priors as relative label frequencies in ``y_train``."""
    self.labels, self.counts = np.unique(y_train, return_counts=True)
    probabilities = self.counts / np.sum(self.counts)
    self.prior_probs = dict(zip(self.labels, probabilities))

  # compute the parameters of the per-class distributions
  def calculate_gaussian_parameters(self, X_train, y_train):
    """Estimate the (mean, std) of every feature within every class.

    Requires ``calculate_prior`` to have been called first, because it
    iterates over ``self.labels``.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)

    # Smoothing term proportional to the largest feature variance.
    if X_train.shape[1] > 0 and X_train.shape[0] > 0:
      epsilon = self.var_smoothing * np.var(X_train, axis=0).max()
    else:
      epsilon = 0.0
    if epsilon <= 0:
      # All features are constant over the whole training set: fall back to
      # the raw smoothing value so the std still ends up positive.
      epsilon = self.var_smoothing

    self.gauss_params = {}
    for label in self.labels:
      # The class mask is computed once per class, not once per feature.
      class_samples = X_train[y_train == label]
      means = class_samples.mean(axis=0)
      stds = np.sqrt(class_samples.var(axis=0) + epsilon)
      self.gauss_params[label] = list(zip(means, stds))

  # fit the model to the training data
  def fit(self, X_train, y_train):
    """Fit priors and Gaussian parameters to the training data."""
    self.calculate_prior(y_train)
    self.calculate_gaussian_parameters(X_train, y_train)

  def predict(self, X_test):
    """Return the MAP class label for every row of ``X_test``.

    The returned array (also stored in ``self.predicted_labels``) has the
    same dtype as ``self.labels``, so integer and string labels come back
    unchanged instead of being coerced to float.
    """
    X_test = np.asarray(X_test, dtype=float)
    num_of_samples = X_test.shape[0]
    num_of_classes = len(self.labels)

    # joint_log[s, c] = log P(class c) + sum_i log N(x_si | mean_ci, std_ci)
    joint_log = np.empty((num_of_samples, num_of_classes))
    for column, label in enumerate(self.labels):
      params = self.gauss_params[label]
      means = np.array([mean for mean, _ in params], dtype=float)
      stds = np.array([std for _, std in params], dtype=float)
      z = (X_test - means) / stds
      log_density = -0.5 * z ** 2 - np.log(stds * np.sqrt(2 * np.pi))
      joint_log[:, column] = np.log(self.prior_probs[label]) + log_density.sum(axis=1)

    # argmax returns the first maximum, i.e. ties go to the earlier label,
    # exactly like a strict '>' comparison in a sequential scan.
    self.predicted_labels = self.labels[np.argmax(joint_log, axis=1)]
    return self.predicted_labels

In [27]:
data = get_data("/content/drive/My Drive/Colab Notebooks/breast-cancer-wisconsin.txt")
X = data[:, 1:10] # feature matrix, without the id and class columns
y = data[:, 10]   # class vector

# chi2 statistics for the dataset
chi2vals, pvals = chi2(X, y)
print(chi2vals)


# comparison of the implemented classifier with the library one
# `k` is passed by keyword: current scikit-learn releases reject it as a
# positional argument.
# NOTE(review): the selector is fitted on the full data set before the
# train/test split, so the test labels influence the feature choice.
X_new  = SelectKBest(chi2, k=5).fit_transform(X, y)
# NOTE(review): do_nth_cross_validation is not defined in any visible cell of
# this notebook — it must be (re)defined before this cell for a clean
# Restart & Run All.
X_train, X_test, y_train, y_test = do_nth_cross_validation(X_new, y, 7)

NBC = NaiveBayesClassifier()
NBC.fit(X_train, y_train)
prediction = NBC.predict(X_test)
print(accuracy_score(y_test, prediction))

clf = GaussianNB()
clf.fit(X_train, y_train)
library_prediction = clf.predict(X_test)
# The original scored `library_predict`, a name that is never assigned.
print(accuracy_score(y_test, library_prediction))


# experiments: accuracy of the custom classifier as a function of the number
# of chi2-selected features
scores_for_diff_num_of_features = []
# The splitter does not depend on k; with a fixed integer random_state every
# call to split() yields the same sequence of folds, so build it once.
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=2147483647)
# Try every possible feature count instead of a hard-coded upper bound.
for k in range(1, X.shape[1] + 1):
  # choose the number of features under study (`k` must be a keyword
  # argument in current scikit-learn releases)
  X_k = SelectKBest(chi2, k=k).fit_transform(X, y)
  scores = []
  for train_index, test_index in rkf.split(X_k):
    X_train, X_test = X_k[train_index], X_k[test_index]
    y_train, y_test = y[train_index], y[test_index]
    NBC.fit(X_train, y_train)
    prediction = NBC.predict(X_test)
    scores.append(accuracy_score(y_test, prediction))

  # one row per k: [mean accuracy, standard deviation of accuracy]
  scores_for_diff_num_of_features.append([np.round(np.mean(scores), 3), np.round(np.std(scores), 3)])

[ 624.13570418 1370.06458731 1279.76770412  986.41787922  497.53676321
 1729.0661744   682.97823856 1143.8667119   228.99434634]
0.9649122807017544
0.9649122807017544


In [28]:
# One row per tested feature count k (in the order the experiment loop
# appended them); columns are [mean accuracy, std of accuracy], both rounded
# to 3 decimals.
print(np.array(scores_for_diff_num_of_features))

[[0.909 0.01 ]
 [0.959 0.009]
 [0.964 0.006]
 [0.96  0.005]
 [0.959 0.008]
 [0.96  0.007]
 [0.967 0.008]
 [0.965 0.008]
 [0.963 0.007]]
