# 第五题：实现带有拉普拉斯修正的朴素贝叶斯

实验内容：
1. 叙述拉普拉斯修正的作用
2. 给出使用的数据集
3. 给出实现的代码，要有详细的注释
4. 给出模型评价指标的结果

## 处理二值化数据集

In [1]:
import numpy as np
spambase = np.loadtxt('data/spambase/spambase.data', delimiter = ",")
spamx = spambase[:, :57]
spamy = spambase[:, 57]

In [2]:
spamx_binary = (spamx != 0).astype('float64')

In [3]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(spamx_binary, spamy, test_size=0.3, random_state=32)
print(trainX, trainY, testX, testY)

[[ 1.  1.  1. ...,  1.  1.  1.]
 [ 0.  1.  0. ...,  1.  1.  1.]
 [ 0.  0.  1. ...,  1.  1.  1.]
 ..., 
 [ 0.  0.  0. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 0.  0.  0. ...,  1.  1.  1.]] [ 1.  0.  0. ...,  1.  1.  0.] [[ 0.  0.  0. ...,  1.  1.  1.]
 [ 0.  0.  1. ...,  1.  1.  1.]
 [ 1.  0.  1. ...,  1.  1.  1.]
 ..., 
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 0.  0.  0. ...,  1.  1.  1.]
 [ 0.  1.  1. ...,  1.  1.  1.]] [ 0.  1.  1. ...,  1.  0.  0.]


## 带有Laplace修正的伯努利分布朴素贝叶斯

In [4]:
class MyBernoulliNBwithLaplace():
    '''
    处理二值特征的朴素贝叶斯，带有拉普拉斯平滑。
    '''
    def __init__(self):
        '''
        初始化四个字典
        self.label_mapping     类标记 与 下标(int)
        self.probability_of_y  类标记 与 先验概率(float)
        self.probability_ofx_y 类标记 与 已知y的后验概率(dict)
        '''
        self.label_mapping = dict()
        self.probability_of_y = dict()
        self.probability_of_x_y = dict()
        
        
    def _clear(self):
        '''
        为了防止一个实例反复的调用fit方法，我们需要每次调用fit前，将之前学习到的参数删除掉
        '''
        self.label_mapping.clear()
        self.probability_of_y.clear()
        self.probability_of_x_y.clear()
    
    
    def fit(self, trainX, trainY):
        '''
        这里，我们要根据trainY内的类标记，针对每类，计算这类的先验概率，以及这类训练样本每个特征的均值和方差

        Parameters
        ----------
            trainX: np.ndarray, 训练样本的特征, 维度：(样本数, 特征数)
            trainY: np.ndarray, 训练样本的标记, 维度：(样本数, )
        '''
        
        # 先调用_clear
        self._clear()
        
        # 获取类标记
        labels = np.unique(trainY)
        
        # 添加类标记与下标的映射关系
        self.label_mapping = {label: index for index, label in enumerate(labels)}
        
        # 遍历每个类
        for label in labels:
            # 取出为label这类的所有训练样本，存为 x
            x = trainX[trainY == label, :]
            # 计算先验概率，用 x 的样本个数除以训练样本总个数，存储到 self.probability_of_y 中，键为 label，值为先验概率

#             Laplace
            self.probability_of_y[label] = (len(x) + 1) / (len(trainX) + len(labels))    
            self.probability_of_x_y[label] = (x.sum(axis=0) + 1) / (len(x) + len(labels)) # 二值匹配
#             Non_Laplace
#             self.probability_of_y[label] = (len(x) ) / (len(trainX))    
#             self.probability_of_x_y[label] = (x.sum(axis=0) ) / (len(x)) # 二值匹配

    def predict(self, testX):
        '''
        给定测试样本，预测测试样本的类标记，这里我们要实现化简后的公式

        Parameters
        ----------
            testX: np.ndarray, 测试的特征, 维度：(测试样本数, 特征数)
    
        Returns
        ----------
            prediction: np.ndarray, 预测结果, 维度：(测试样本数, )
        '''
        prediction = np.zeros(testX.shape[0], )
        
        for i in range(testX.shape[0]):
            # 枚举每一个样本
            
            mx = -1
            predict_label = None
            
            for label in self.label_mapping.keys():
                probability = 1
                for j in range(testX.shape[1]):
                    probability *= (self.probability_of_x_y[label][j] if testX[i][j] == 1 else 1-self.probability_of_x_y[label][j])
                
                if probability > mx:
                    mx = probability
                    predict_label = label
                
            prediction[i] = self.label_mapping[predict_label]
        return prediction

## 我自己实现的预测

In [5]:
cls = MyBernoulliNBwithLaplace()
cls.fit(trainX, trainY)
prediction = cls.predict(testX)
print(prediction)

[ 0.  1.  1. ...,  1.  0.  0.]


In [6]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

p = precision_score(testY, prediction)
a = accuracy_score(testY, prediction)
r = recall_score(testY, prediction)
f = f1_score(testY, prediction)
print(p,a,r,f)

0.884146341463 0.89283128168 0.826996197719 0.854616895874


## sklearn实现的预测

In [7]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(trainX, trainY)
prediction = clf.predict(testX)
print(prediction)

p = precision_score(testY, prediction)
a = accuracy_score(testY, prediction)
r = recall_score(testY, prediction)
f = f1_score(testY, prediction)

print(p,a,r,f)

[ 0.  1.  1. ...,  1.  0.  0.]
0.883817427386 0.887038377987 0.809885931559 0.845238095238
