In [1]:
import numpy as np
import pandas as pd

In [2]:
# 常態分配機率密度函數
import numpy as np

def GaussDistribution(x, mean=0, sigma=1):
    """Return the normal (Gaussian) probability density evaluated at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution (default 0).
        sigma: Standard deviation of the distribution (default 1). The
            expression divides by ``sigma**2``, so it must be non-zero.

    Returns:
        ``exp(-(x - mean)**2 / (2 * sigma**2)) / sqrt(2 * pi * sigma**2)``.
    """
    variance = sigma ** 2
    normaliser = (2 * np.pi * variance) ** 0.5
    exponent = -((x - mean) ** 2) / (2 * variance)
    return 1 / normaliser * np.e ** exponent


GaussDistribution(1.645, 0, 1)

0.10311081109198142

In [3]:
# np.concatenate usage: append a second row below `a` along axis 0.

a = np.array([[1, 2, 3]])
b = np.concatenate([a, np.array([[2, 3, 4]])], axis=0)
b

array([[1, 2, 3],
       [2, 3, 4]])

In [4]:
# np.mean usage: axis=0 averages down the rows, giving one mean per column.

np.mean(b, axis=0)

array([1.5, 2.5, 3.5])

### 實作步驟

1. 透過訓練集計算每個分類的個數以及總數, 從而計算出P(C), C= 0, 1(看多少分類)

2. 透過訓練集計算每個分類中各特徵的條件機率P(f|C), 離散特徵與連續特徵的處理方式不同。

3. 由1.、2.依貝氏定理可得P(C|x) ∝ P(C)·∏P(f|C), 得出樣本x屬於各分類C的機率, 並取機率最大的分類作為預測結果。

In [5]:
def GaussDistribution(x, mean=0, sigma=1):
    """Return the normal (Gaussian) probability density evaluated at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution (default 0).
        sigma: Standard deviation of the distribution (default 1). The
            expression divides by ``sigma**2``, so it must be non-zero for a
            finite result.

    Returns:
        ``exp(-(x - mean)**2 / (2 * sigma**2)) / sqrt(2 * pi * sigma**2)``.
    """
    return 1 / ((2 * np.pi * (sigma**2))**0.5) * ((np.e)**(- ((x-mean)**2) / (2*((sigma)**2))))


class NaiveBayes():
    """Naive Bayes classifier with a pluggable per-feature density.

    Training estimates the class prior P(C) from class counts and, for each
    class, the per-feature mean and standard deviation. Prediction picks the
    class maximising P(C) * prod_f P(f | C), evaluated in log space.

    Attributes set by ``fit``:
        n: Number of training samples.
        class_len: Mapping class label -> number of training samples.
        class_data: Mapping class label -> 2-D array of that class's rows.
        class_mean_std: Mapping class label -> (per-feature means,
            per-feature standard deviations).
    """

    def __init__(self, distribution='Gauss'):
        """Choose the density used for P(f | C).

        Args:
            distribution: ``'Gauss'`` for the normal density, or a callable
                ``pdf(value, mean, std)`` returning a density value.

        Raises:
            ValueError: If ``distribution`` is neither ``'Gauss'`` nor callable.
        """
        if distribution == 'Gauss':
            self.feature_distribution = GaussDistribution
        elif callable(distribution):
            self.feature_distribution = distribution
        else:
            # Fail here instead of leaving feature_distribution unset, which
            # would only surface later as an AttributeError inside predict().
            raise ValueError(
                "Unsupported distribution %r: expected 'Gauss' or a callable "
                "pdf(value, mean, std)" % (distribution,)
            )

    def fit(self, x, y):
        """Estimate class priors and per-class feature statistics.

        1. Count the samples of each class (and the total) to obtain P(C).
        2. For each class, compute every feature's mean and standard
           deviation; these parameterise P(f | C).
        3. ``predict`` combines 1 and 2 to obtain P(C | x).

        Args:
            x: Training samples, one row per sample (array-like, 2-D).
            y: Class label of each row of ``x``; labels must be hashable.

        Raises:
            ValueError: If ``x`` and ``y`` differ in length or are empty.
        """
        # Row-wise view of the input, so a DataFrame is iterated by row and
        # not by column name.
        x = np.asarray(x)
        if len(x) != len(y):
            raise ValueError(
                "x and y must have the same length, got %d and %d" % (len(x), len(y))
            )
        if len(x) == 0:
            raise ValueError("cannot fit on an empty training set")

        self.n = len(x)    # number of training samples

        # Collect rows per class in plain lists and build each array once;
        # concatenating row by row copies the whole array on every step.
        rows_by_class = {}
        for row, label in zip(x, y):
            rows_by_class.setdefault(label, []).append(row)

        self.class_len = {label: len(rows) for label, rows in rows_by_class.items()}
        self.class_data = {label: np.array(rows) for label, rows in rows_by_class.items()}

        # Per-class, per-feature mean and std, later fed to the density function.
        self.class_mean_std = {
            label: (np.mean(rows, axis=0), np.std(rows, axis=0))
            for label, rows in self.class_data.items()
        }

    def predict(self, x):
        """Return the most probable class label for every row of ``x``.

        Scores are accumulated as log P(C) + sum_f log P(f | C). Multiplying
        the raw densities underflows to 0.0 once there are many features,
        after which no class can be told apart from another.

        Args:
            x: Samples to classify, one row per sample (array-like, 2-D).

        Returns:
            1-D NumPy array holding one predicted label per row. Ties, and
            rows for which no class has a finite score, resolve to the first
            class seen during ``fit``.
        """
        self.class_proba = {}    # kept for backward compatibility; never populated
        y_preds = []
        for sample in np.asarray(x):
            best_label, best_score = None, -np.inf
            for label, (means, stds) in self.class_mean_std.items():
                densities = [
                    self.feature_distribution(value, mean, std)
                    for value, mean, std in zip(sample, means, stds)
                ]
                # log(0) is a legitimate "impossible" score here, so silence
                # the divide warning for the log only.
                with np.errstate(divide='ignore'):
                    score = np.log(self.class_len[label] / self.n) + np.sum(np.log(densities))
                # NaN (e.g. from a zero standard deviation) never compares
                # greater, so rank it as the worst possible score.
                if np.isnan(score):
                    score = -np.inf
                if best_label is None or score > best_score:
                    best_label, best_score = label, score
            y_preds.append(best_label)

        return np.array(y_preds)

    def predict_proba(self, x):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass   # not yet
    
    

In [6]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset and pull out the feature matrix and the labels.
data = load_breast_cancer()
x, y = data['data'], data['target']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out a test split; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=100)
print(x_train.shape, x_test.shape, sep="\n")

(426, 30)
(143, 30)


In [8]:
# Train the hand-written NaiveBayes classifier on the training split.

gauss_nb = NaiveBayes()
gauss_nb.fit(x_train, y_train)

In [9]:
# Predict a class label for every row of the test split and display the labels.
y_pred = gauss_nb.predict(x_test)
y_pred

array([0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1])

In [10]:
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
# Accuracy itself is symmetric, so the value is unchanged.
acc = accuracy_score(y_test, y_pred)
acc

0.9440559440559441

In [13]:
# Cross-check with scikit-learn's GaussianNB on the same train/test split.
# On this split the accuracy equals the hand-written model's, so the algorithm is reproduced.
# With other seeds the accuracies differ slightly. The original guess was floating-point
# error in the normal-density computation.
# NOTE(review): GaussianNB also adds `var_smoothing` to the per-class variances, which is a
# more likely source of the gap -- confirm before relying on exact agreement.


from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(x_train, y_train)
y_pred_sklearn = nb.predict(x_test)
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
acc_sklearn = accuracy_score(y_test, y_pred_sklearn)
acc_sklearn

0.9440559440559441