In [14]:
# Mount Google Drive at /content/drive (Google Colab only) so that files
# stored under "My Drive" can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [16]:
def get_data(filename):
  """Read a comma-separated table of integers and drop incomplete rows.

  Args:
    filename: anything np.genfromtxt accepts as a source (path, file object,
      list of lines).

  Returns:
    2-D int32 array containing only the rows that have no -1 entry; -1 marks
    a sample for which some feature value is missing in the dataset.
  """
  table = np.genfromtxt(filename, delimiter=',', dtype='int32')
  # A row is discarded as soon as any of its columns holds the -1 marker.
  has_missing_value = np.any(table == -1, axis=1)
  return table[~has_missing_value]

In [17]:
def compute_chi2_stats(X, y):
  """Run the chi-squared test of every feature in X against the labels y.

  Args:
    X: feature matrix passed straight to sklearn's chi2.
    y: class labels, one per row of X.

  Returns:
    Tuple (chi2 statistics, p-values) as produced by chi2(X, y).
  """
  stats = chi2(X, y)
  return stats[0], stats[1]

In [18]:
def select_k_best_via_chi2(k, X, y):
  """Keep the k features of X with the highest chi-squared score against y.

  Args:
    k: number of features to keep.
    X: feature matrix (samples in rows, features in columns).
    y: class labels, one per row of X.

  Returns:
    The reduced feature matrix returned by SelectKBest.fit_transform.
  """
  # `k` must be passed by keyword: recent scikit-learn releases declare it
  # keyword-only, so SelectKBest(chi2, k) raises a TypeError there.
  selector = SelectKBest(chi2, k=k)
  return selector.fit_transform(X, y)

In [19]:
def do_cross_validation(X, y, state): # TODO: to be changed
  """Split X and y into two equal halves (a single hold-out split).

  Args:
    X: feature matrix.
    y: class labels, one per row of X.
    state: seed forwarded to train_test_split as random_state.

  Returns:
    Tuple (X_train, X_test, y_train, y_test).
  """
  halves = train_test_split(X, y, random_state=state, test_size=0.5)
  return tuple(halves)

In [20]:
def LCG(arg): # linear congruential generator
  """Advance the linear congruential generator by one step.

  Args:
    arg: current state (an integer).

  Returns:
    (187668917 * arg + 11767183) mod 2147483647.
  """
  return (187668917 * arg + 11767183) % 2147483647
def get_next_random(arg=0, depth=0):
  """Apply LCG to `arg` a total of depth + 1 times and return the result.

  Implemented as a loop rather than recursion, so large depths do not hit
  Python's recursion limit.

  Args:
    arg: starting state.
    depth: number of additional LCG steps after the first one; must be a
      non-negative whole number.

  Returns:
    The generator state after depth + 1 steps.

  Raises:
    ValueError: if depth is negative or not a whole number.
  """
  if depth < 0 or depth != int(depth):
    raise ValueError("depth must be a non-negative whole number, got %r" % (depth,))
  value = LCG(arg)
  for _ in range(int(depth)):
    value = LCG(value)
  return value

In [21]:
def do_nth_cross_validation(X, y, n):
  """Produce the n-th reproducible train/test split of X and y.

  The split seed is get_next_random(42, n), so the same n always yields the
  same partition.

  Args:
    X: feature matrix.
    y: class labels, one per row of X.
    n: index of the split; forwarded as the depth of the seed generator.

  Returns:
    Tuple (X_train, X_test, y_train, y_test).
  """
  seed = get_next_random(42, n)
  return tuple(do_cross_validation(X, y, seed))

In [22]:
class NaiveBayesClassifier():
  """Gaussian Naive Bayes classifier.

  Every feature is modelled, separately for each class, as a normal
  distribution; a sample is assigned to the class with the largest
  log-posterior (log prior + sum of per-feature log-densities).

  Args:
    var_smoothing: fraction of the largest feature variance of the training
      data that is added to every per-class variance. It keeps the standard
      deviation strictly positive when a feature is constant within a class
      (which would otherwise turn every score into NaN). Pass 0 to disable.

  Attributes set by fit():
    labels, counts: distinct class labels and their frequencies.
    prior_probs: dict mapping label -> prior probability.
    gauss_params: dict mapping label -> list of (mean, std), one per feature.

  Attribute set by predict():
    predicted_labels: predicted label for each test sample.
  """
  def __init__(self, var_smoothing=1e-9):
    self.var_smoothing = var_smoothing

  # compute the prior probabilities of the classes
  def calculate_prior(self, y_train):
    """Estimate class priors as relative label frequencies in y_train."""
    self.labels, self.counts = np.unique(y_train, return_counts=True)
    probabilities = self.counts / np.sum(self.counts)
    self.prior_probs = dict(zip(self.labels, probabilities))

  # compute the parameters of the per-class distributions
  def calculate_gaussian_parameters(self, X_train, y_train):
    """Estimate (mean, std) of every feature within every class.

    Requires self.labels, i.e. calculate_prior must have been called first.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train).ravel()

    # Variance floor, proportional to the largest overall feature variance.
    epsilon = 0.0
    if X_train.size:
      epsilon = self.var_smoothing * X_train.var(axis=0).max()
    if epsilon == 0:
      # All features constant (or no data): fall back to the raw constant.
      epsilon = self.var_smoothing

    self.gauss_params = {}
    for label in self.labels:
      # Select the rows of this class once, then reduce over all features.
      class_rows = X_train[y_train == label]
      means = class_rows.mean(axis=0)
      stds = np.sqrt(class_rows.var(axis=0) + epsilon)
      self.gauss_params[label] = list(zip(means, stds))

  # fit the model to the training data
  def fit(self, X_train, y_train):
    """Learn priors and Gaussian parameters; returns self for chaining."""
    self.calculate_prior(y_train)
    self.calculate_gaussian_parameters(X_train, y_train)
    return self

  def predict(self, X_test):
    """Predict a label for every row of X_test.

    Args:
      X_test: 2-D array-like with the same feature columns used in fit().

    Returns:
      Array of predicted labels (same dtype as the training labels). The same
      array is also stored in self.predicted_labels.
    """
    X_test = np.asarray(X_test, dtype=float)
    scores = np.empty((X_test.shape[0], len(self.labels)))
    for column, label in enumerate(self.labels):
      params = np.asarray(self.gauss_params[label], dtype=float).reshape(-1, 2)
      means, stds = params[:, 0], params[:, 1]
      # Log of the normal density, summed over the (independent) features.
      log_density = -0.5 * ((X_test - means) / stds) ** 2 - np.log(stds * np.sqrt(2 * np.pi))
      scores[:, column] = np.log(self.prior_probs[label]) + log_density.sum(axis=1)

    # A NaN score can never win; if every score is NaN the first label is
    # chosen, and ties go to the earliest label.
    scores = np.where(np.isnan(scores), -np.inf, scores)
    self.predicted_labels = self.labels[np.argmax(scores, axis=1)]
    return self.predicted_labels



In [23]:
# Load the dataset; get_data drops every row that contains a -1 entry.
data = get_data("/content/drive/My Drive/Colab Notebooks/breast-cancer-wisconsin.txt")
X = data[:, 1:10] # feature matrix, without the id and class columns
y = data[:, 10]   # class vector
# Chi-squared statistic of every feature against the class labels.
chi2val, pval = compute_chi2_stats(X, y) 
print(chi2val)
# Keep the 5 best-scoring features, then take the split seeded by get_next_random(42, 7).
X_new  = select_k_best_via_chi2(5, X, y)
X_train, X_test, y_train, y_test = do_nth_cross_validation(X_new, y, 7)

# Hand-written classifier; its predictions are read from NBC.predicted_labels.
NBC = NaiveBayesClassifier()
NBC.fit(X_train, y_train)
NBC.predict(X_test)
print(y_test.shape)
print(accuracy_score(y_test, NBC.predicted_labels))

# Reference result: scikit-learn's GaussianNB on the same split, for comparison.
clf = GaussianNB()
clf.fit(X_train, y_train)
library_predict = clf.predict(X_test)
print(accuracy_score(y_test, library_predict))

[ 624.13570418 1370.06458731 1279.76770412  986.41787922  497.53676321
 1729.0661744   682.97823856 1143.8667119   228.99434634]
(342,)
0.9649122807017544
0.9649122807017544
