In [7]:
import numpy as np
import pandas as pd
from sklearn.utils.multiclass import type_of_target
from collections import namedtuple

# naive Bayes classifier
$$h_{nb}(x) = \underset{c \in \mathcal{Y}}{\arg\max} \; P(c) \prod_{i=1}^{d} P(x_i \mid c)$$
其中：
$d$为属性数目，$x_i$为$x$在第$i$个属性上的取值

## 常规
$$P(c) = \frac{\left\vert D_c \right\vert}{\left\vert D \right\vert}$$
### 离散值
$$P({x_i} | c) = \frac{\left\vert D_{c, x_i} \right\vert}{\left\vert D_c \right\vert}$$
### 连续值
$$P({x_i} | c) = \frac{1}{\sqrt{2\pi} \sigma_{c,i}} \exp{\left( -\frac{\left( x_i - \mu_{c, i} \right)^2}{2\sigma_{c,i}^2} \right)}$$
## 拉普拉斯(Laplacian correction)修正
为了避免其他属性携带的信息被训练集中未出现的属性值抹去
$$P(c) = \frac{\left\vert D_c \right\vert + 1}{\left\vert D \right\vert + N}$$
### 离散值
$$P({x_i} | c) = \frac{\left\vert D_{c, x_i} \right\vert + 1}{\left\vert D_c \right\vert + N_i}$$
### 连续值
$$P({x_i} | c) = \frac{1}{\sqrt{2\pi} \sigma_{c,i}} \exp{\left( -\frac{\left( x_i - \mu_{c, i} \right)^2}{2\sigma_{c,i}^2} \right)}$$
其中：
$N$表示训练集$D$中可能的类别数，$N_i$表示第$i$个属性可能的取值数

In [8]:
def train_nb(X, y, method = 'common'):
    """Fit a binary naive Bayes classifier for the labels '是' (positive) and '否' (negative).

    Parameters
    ----------
    X : pandas.DataFrame
        One row per sample, one column per attribute. A column is modelled
        with a Gaussian when ``type_of_target`` reports it as
        ``'continuous'``; every other column is treated as discrete.
    y : pandas.Series
        Class labels for the rows of ``X``. Samples are selected with
        ``y == '是'`` and ``y == '否'``.
    method : str, default 'common'
        ``'common'`` uses plain relative frequencies. Any other value
        (conventionally ``'laplacian'``) applies the Laplacian correction to
        the class prior and to every discrete conditional probability.

    Returns
    -------
    p1 : float
        (Smoothed) prior probability of the positive class.
    p1_list, p0_list : list of namedtuple
        One ``(is_continuous, conditional_pro)`` entry per column of ``X``,
        for the positive and the negative class respectively. For a
        continuous attribute ``conditional_pro`` is ``[mean, variance]``;
        for a discrete attribute it is a Series of log conditional
        probabilities indexed by attribute value.

    Notes
    -----
    Without smoothing, an attribute value that never occurs within a class
    gets ``log(0) = -inf`` (NumPy emits a divide-by-zero warning).
    """
    # Apply the correction consistently: prior and discrete conditionals are
    # either all smoothed or all left as raw frequencies.
    use_laplace = method != 'common'
    l_1 = 1 if use_laplace else 0
    l_N = 2 if use_laplace else 0  # N = 2 classes ('是' / '否')

    m, n = X.shape
    is_pos = y == '是'
    is_neg = y == '否'
    p1 = (int(is_pos.sum()) + l_1) / (m + l_N)

    X1 = X[is_pos]
    X0 = X[is_neg]
    m1 = X1.shape[0]
    m0 = X0.shape[0]

    p1_list = []  # per-attribute conditional statistics given the positive class
    p0_list = []  # per-attribute conditional statistics given the negative class

    for i in range(n):
        xi = X.iloc[:, i]
        xi1 = X1.iloc[:, i]
        xi0 = X0.iloc[:, i]
        # Record type named after the column; holds the statistics of one attribute.
        p_xi = namedtuple(X.columns[i], ['is_continuous', 'conditional_pro'])

        is_continuous = type_of_target(xi) == 'continuous'
        if is_continuous:
            # Continuous attribute: keep [mean, variance] per class.
            # np.var uses ddof=0 unless told otherwise.
            p1_list.append(p_xi(is_continuous, [np.mean(xi1), np.var(xi1)]))
            p0_list.append(p_xi(is_continuous, [np.mean(xi0), np.var(xi0)]))
        else:
            # Discrete attribute: log conditional probability of every value
            # observed anywhere in the training set.
            unique_value = xi.unique()
            l_Ni = len(unique_value) if use_laplace else 0  # N_i = number of distinct values

            # Values absent from a class get a count of 0 before smoothing.
            xi1_value_count = xi1.value_counts().reindex(unique_value, fill_value=0) + l_1
            # The negative-class counts must come from the negative samples (xi0).
            xi0_value_count = xi0.value_counts().reindex(unique_value, fill_value=0) + l_1

            p1_list.append(p_xi(is_continuous, np.log(xi1_value_count / (m1 + l_Ni))))
            p0_list.append(p_xi(is_continuous, np.log(xi0_value_count / (m0 + l_Ni))))

    return p1, p1_list, p0_list

In [9]:
def _gaussian_log_density(value, mean, var):
    """Return the log of the normal density with the given mean and variance at ``value``."""
    return -0.5 * np.log(2 * np.pi * var) - (value - mean) ** 2 / (2 * var)


def predict_nb(x, p1, p1_list, p0_list):
    """Classify one sample with the parameters returned by ``train_nb``.

    Parameters
    ----------
    x : sequence (pandas.Series, list, numpy array, ...)
        Attribute values of the sample, in the same order as the entries of
        ``p1_list`` / ``p0_list``. Values are read by position.
    p1 : float
        Prior probability of the positive class.
    p1_list, p0_list : list of namedtuple
        Per-attribute ``(is_continuous, conditional_pro)`` records for the
        positive and negative class. For continuous attributes
        ``conditional_pro`` is ``[mean, variance]``; for discrete attributes
        it maps an attribute value to its log conditional probability.

    Returns
    -------
    str
        ``'是'`` when the positive class has the strictly larger log
        posterior, otherwise ``'否'``.

    Raises
    ------
    KeyError
        If a discrete attribute value has no entry in ``conditional_pro``.
    """
    # Iterate over the values themselves so the lookup never depends on how
    # the container interprets an integer key (label vs. position).
    values = list(x)

    x_p1 = np.log(p1)
    x_p0 = np.log(1 - p1)
    for i, value in enumerate(values):
        p1_xi = p1_list[i]
        p0_xi = p0_list[i]

        if p1_xi.is_continuous:
            # The stored second entry is a variance (sigma squared), so it is
            # used as such; the density is evaluated directly in log space to
            # avoid exp() underflowing to 0 before the log is taken.
            mean1, var1 = p1_xi.conditional_pro
            mean0, var0 = p0_xi.conditional_pro
            x_p1 += _gaussian_log_density(value, mean1, var1)
            x_p0 += _gaussian_log_density(value, mean0, var0)
        else:
            # Discrete attributes already store log probabilities.
            x_p1 += p1_xi.conditional_pro[value]
            x_p0 += p0_xi.conditional_pro[value]

    if x_p1 > x_p0:
        return '是'
    else:
        return '否'

In [11]:
# Forward slashes keep this relative path usable on Windows, Linux and macOS alike.
data_path = './data_watermelon/watermelon3_0_Ch.csv'
data = pd.read_csv(data_path, index_col=0)

# Every column except the last is used as an attribute; the last column is the label.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# Conditional probabilities are stored as logs, so the un-smoothed ('common')
# estimator produces log(0) for attribute values that never occur within a
# class; only the Laplacian-corrected model is trained here.
# p1, p1_list, p0_list = train_nb(X, y)
p1_l, p1_list_l, p0_list_l = train_nb(X, y, 'laplacian')

In [14]:
# "Test sample 1" from the book is simply the first row of the data set.
x_test = X.iloc[0]

print(predict_nb(x_test, p1=p1_l, p1_list=p1_list_l, p0_list=p0_list_l))

是
