In [1]:
# Mount Google Drive inside the Colab runtime; the dataset loaded later in this
# notebook is read from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [20]:
def get_data(filename):
  """Load a comma-separated integer table and drop incomplete samples.

  The file is parsed as int32. Any row that contains the value -1 (used in the
  dataset to mark a missing feature value) is removed.

  Args:
    filename: anything accepted by np.genfromtxt as its input source.

  Returns:
    2-D int32 array holding only the rows without a -1 entry.
  """
  samples = np.genfromtxt(filename, delimiter=',', dtype='int32')
  # Flag every row in which at least one feature carries the -1 marker.
  has_missing_value = np.any(samples == -1, axis=1)
  return samples[~has_missing_value]

In [21]:
def compute_chi2_stats(X, y):
  """Run scikit-learn's chi2 test between every feature in X and the target y.

  Args:
    X: feature matrix passed straight to sklearn's chi2.
    y: target vector passed straight to sklearn's chi2.

  Returns:
    Tuple (statistics, p_values) exactly as produced by chi2.
  """
  statistics, p_values = chi2(X, y)
  return statistics, p_values

In [22]:
def select_k_best_via_chi2(k, X, y):
  """Keep the k features of X with the highest chi2 score against y.

  Args:
    k: number of features to keep.
    X: feature matrix.
    y: target vector.

  Returns:
    The feature matrix reduced to the k selected columns.
  """
  # `k` must be passed by keyword: it is a keyword-only parameter of SelectKBest
  # in current scikit-learn releases, so the positional form raises TypeError.
  selector = SelectKBest(chi2, k=k)
  return selector.fit_transform(X, y)

In [23]:
def do_cross_validation(X, y, state): # TODO: to be changed
  """Split X and y in half into a training part and a test part.

  Args:
    X: feature matrix.
    y: target vector.
    state: value forwarded as random_state to train_test_split.

  Returns:
    Tuple (X_train, X_test, y_train, y_test).
  """
  halves = train_test_split(X, y, random_state=state, test_size=0.5)
  X_train, X_test, y_train, y_test = halves
  return X_train, X_test, y_train, y_test

In [24]:
def LCG(arg): # linear congruential generator
  """Advance the linear congruential generator by one step.

  Args:
    arg: current generator state (an integer).

  Returns:
    (187668917 * arg + 11767183) mod 2147483647.
  """
  return (187668917 * arg + 11767183) % 2147483647

def get_next_random(arg=0, depth=0):
  """Apply LCG to `arg` a total of `depth + 1` times.

  Implemented as a loop rather than recursion, so large depths do not hit
  Python's recursion limit.

  Args:
    arg: starting state fed to the first LCG step.
    depth: number of additional LCG steps after the first one; must be >= 0.

  Returns:
    The generator state after depth + 1 steps.

  Raises:
    ValueError: if depth is negative (the recursive form never terminated
      for such input).
  """
  if depth < 0:
    raise ValueError("depth must be non-negative")
  value = LCG(arg)
  for _ in range(depth):
    value = LCG(value)
  return value

In [25]:
def do_nth_cross_validation(X, y, n):
  """Produce the n-th train/test split of a reproducible sequence.

  The split's random state is taken from the LCG sequence that starts at 42,
  so the same n always gives the same split.

  Args:
    X: feature matrix.
    y: target vector.
    n: index of the split in the sequence (passed as `depth` to get_next_random).

  Returns:
    Tuple (X_train, X_test, y_train, y_test) from do_cross_validation.
  """
  split_seed = get_next_random(42, n)
  return do_cross_validation(X, y, split_seed)

In [26]:
class NaiveBayesClassifier():
  """Gaussian naive Bayes classifier.

  fit() estimates the class priors and, for every class, a (mean, std) pair per
  feature. predict() assigns each sample the class with the highest posterior,
  treating the features as independent Gaussians within a class.

  Attributes set by fit():
    labels: sorted array of the distinct class labels.
    counts: number of training samples per label.
    prior_probs: dict label -> prior probability.
    gauss_params: dict label -> list of (mean, std) tuples, one per feature.
  Attribute set by predict():
    predicted_labels: array with the predicted label of every test sample.
  """

  # Lower bound applied to the standard deviations inside predict(), so that a
  # feature which is constant within a class does not cause a division by zero.
  MIN_STD = 1e-9

  def __init__(self):
    pass

  # compute the a priori class probabilities
  def calculate_prior(self, y_train):
    """Store the labels, their counts and their relative frequencies."""
    self.labels, self.counts = np.unique(y_train, return_counts=True)
    probabilities = self.counts / np.sum(self.counts)
    self.prior_probs = dict(zip(self.labels, probabilities))

  # compute the parameters of the per-class distributions
  def calculate_gaussian_parameters(self, X_train, y_train):
    """Store a (mean, std) tuple per feature for every class in self.labels."""
    self.gauss_params = {}
    for label in self.labels:
      # The rows of a class do not depend on the feature, so look them up once.
      class_indices = np.where(y_train == label)[0]
      mean_std_for_features = []
      for feature in range(X_train.shape[1]):
        feature_values_for_class = X_train[class_indices, feature]
        mean_std_tuple = np.mean(feature_values_for_class), np.std(feature_values_for_class)
        mean_std_for_features.append(mean_std_tuple)
      self.gauss_params[label] = mean_std_for_features

  @property
  def gaussian_parameters_for_classes(self):
    """Read-only alias of gauss_params, kept for code that uses the longer name."""
    return self.gauss_params

  # fit the model to the training data
  def fit(self, X_train, y_train):
    """Estimate the priors and the Gaussian parameters from the training data."""
    self.calculate_prior(y_train)
    self.calculate_gaussian_parameters(X_train, y_train)

  def predict(self, X_test):
    """Predict the most probable class for every row of X_test.

    Posteriors are compared in log space (log prior plus the sum of the
    per-feature Gaussian log densities) to avoid underflow of the product.
    On ties the label that comes first in self.labels wins.

    Args:
      X_test: 2-D array of samples with the same feature columns used in fit().

    Returns:
      Array of predicted labels, also stored in self.predicted_labels.
    """
    X_test = np.asarray(X_test, dtype=float)
    log_posteriors = np.empty((X_test.shape[0], len(self.labels)))
    for column, label in enumerate(self.labels):
      means = np.array([mean for mean, _ in self.gauss_params[label]], dtype=float)
      stds = np.array([std for _, std in self.gauss_params[label]], dtype=float)
      stds = np.maximum(stds, self.MIN_STD)
      variances = stds ** 2
      log_likelihoods = (-0.5 * np.log(2.0 * np.pi * variances)
                         - (X_test - means) ** 2 / (2.0 * variances))
      log_posteriors[:, column] = math.log(self.prior_probs[label]) + log_likelihoods.sum(axis=1)
    # argmax returns the first maximum, i.e. the earliest label on a tie.
    self.predicted_labels = self.labels[np.argmax(log_posteriors, axis=1)]
    return self.predicted_labels



In [27]:
# Load the dataset, score the features with chi2, keep the 5 best ones, take
# one reproducible train/test split and fit the naive Bayes model on it.
data = get_data("/content/drive/My Drive/Colab Notebooks/breast-cancer-wisconsin.txt")
X = data[:, 1:10] # feature matrix, without the id and class columns
y = data[:, 10]   # class vector
chi2val, pval = compute_chi2_stats(X, y)
print(chi2val)
X_new  = select_k_best_via_chi2(5, X, y)
X_train, X_test, y_train, y_test = do_nth_cross_validation(X_new, y, 5)

NBC = NaiveBayesClassifier()
NBC.fit(X_train, y_train)
# fit() stores the per-class (mean, std) pairs in `gauss_params`.
print(NBC.gauss_params)

[ 624.13570418 1370.06458731 1279.76770412  986.41787922  497.53676321
 1729.0661744   682.97823856 1143.8667119   228.99434634]
{2: [(1.3179723502304148, 0.8826539051554588), (1.3963133640552996, 0.9882187361442625), (1.336405529953917, 0.9940894365678004), (1.3870967741935485, 1.2870927318232261), (1.2764976958525345, 0.9729288251679139)], 4: [(6.274193548387097, 2.7425521128268975), (6.346774193548387, 2.642836803203384), (5.620967741935484, 3.303450644439802), (7.451612903225806, 3.275901350255788), (5.653225806451613, 3.3769721951884595)]}


In [11]:
# Print the training features and training labels of the first five splits.
for split_number in range(5):
  X_train, X_test, y_train, y_test = do_nth_cross_validation(X, y, split_number)
  print(X_train, y_train, sep="\n")

[[ 4  1  1 ...  2  1  1]
 [ 1  1  1 ...  5  1  1]
 [ 5  7  4 ...  7 10  3]
 ...
 [ 1  1  1 ...  3  1  1]
 [ 4  1  1 ...  2  1  1]
 [ 3  1  1 ...  2  1  1]]
[2 2 4 2 2 2 2 4 2 4 2 2 4 2 2 2 4 2 4 2 2 4 2 2 4 2 2 4 2 4 4 2 2 2 2 4 2
 2 2 2 2 2 2 4 4 2 2 2 2 4 2 2 4 2 4 4 4 4 2 4 4 4 4 2 4 2 2 4 4 2 4 2 2 4
 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 4 4 2 2 4 2 4 2 2 2 2 4 2 2
 4 2 4 4 2 4 2 2 4 2 2 4 2 4 2 2 2 4 2 2 2 2 4 4 4 4 4 4 2 2 2 2 2 4 2 2 2
 2 4 4 2 2 4 2 4 4 4 2 4 2 2 4 4 4 2 4 2 2 2 2 2 2 2 2 2 2 4 2 4 2 4 4 4 2
 4 4 2 4 2 2 2 4 4 2 4 2 2 2 4 2 2 4 4 2 2 4 2 2 4 2 2 2 4 4 4 2 4 2 2 2 2
 2 4 2 2 2 2 2 2 2 4 2 4 2 4 2 2 4 2 2 4 2 2 4 2 2 4 4 4 2 4 4 2 4 2 2 2 2
 2 2 4 2 4 2 2 4 2 2 2 4 2 4 4 2 4 2 2 4 2 2 4 2 4 2 2 4 2 2 2 2 4 4 4 2 4
 2 2 2 2 2 2 2 2 4 2 2 4 4 4 4 2 2 4 4 2 4 2 2 2 2 2 2 4 2 4 2 2 2 2 4 4 2
 4 4 2 2 2 2 2 2]
[[ 3  1  1 ...  1  1  1]
 [ 1  4  3 ...  5  6  1]
 [10  4  7 ...  6  1  1]
 ...
 [ 1  1  1 ...  3  1  1]
 [ 1  2  2 ...  1  1  1]
 [ 3  1  1 ...  5  8 

In [12]:
# Scratch check of np.where on a boolean mask; `ind` keeps the raw np.where
# result (no [0] is taken here).
Z = np.array([1,2,1])
ind = np.where(Z==2)

In [13]:
# Small 3x3 integer matrix used for the indexing experiments in this notebook.
ZZ = np.array([[4, 5, 6], [7,8,9], [10,11, 12]])

In [14]:
# Select the row at index 1 with all of its columns.
ZZ[1, :]

array([7, 8, 9])

In [15]:
# Take column 2 of rows 0 and 1, then show the shape of the selection.
Zsel = ZZ[0:2, 2]
Zsel.shape

(2,)

In [16]:
# Positions in Z whose value equals 1, as a 1-D index array.
indices = np.flatnonzero(Z == 1)

In [17]:
# Pick column 1 for the rows listed in `indices` and show the resulting shape.
ZZ[indices, 1].shape

(2,)