# 朴素贝叶斯法
## 算法详情
>输入：训练数据$T={(x_1,y_1),(x_2,y_2),\dots,(x_N,y_N)}$，其中$x_i=(x_i^{(1)},x_i^{(2)},\dots,x_i^{(n)})$，$x_i^{(j)}$是第i个样本的第j个特征，$x_i^{(j)}\in{a_{j1},a_{j2},\dots,a_{js_j}}$，$a_{jl}$是第j个特征可能取的第l个值，$j=1,2,\dots,n,l=1,2,\dots,S_j,y_i\in{c_1,c_2,\dots,c_k}$；实例x；

>输出：实例x的分类。

>（1）计算先验概率及条件概率

$$P(Y=c_k)=\frac{\sum^{N}_{i=1}{I(y_i=c_k)}}{N},k=1,2,\dots,K$$

$$P(X^{(j)}=a_{jl}|Y=c_k)=\frac{\sum^N_{i=1}{I(x_i^{(j)}=a_{jl},y_i=c_k)}}{\sum^N_{(i=1)}{I(y_i=c_k)}}　　j=1,2,\dots,n;　l=1,2,\dots,S_j;　k=1,2,\dots,K$$


>(2)对于给定的实例$x=(x^{1}),x=(x^{2}),\dots,x=(x^{n})^T$，计算
$$P(Y=c_k)\prod^n_{j=1}{P(X^{(j)}=x^{(j)}|Y=c_k), k=1,2\dots,K}$$
>(3)确定实例x的类
$$y=argmax_{c_{k}}P(Y=c_k)\prod^n_{j=1}P(X^{(j)}=x^{(j)}|Y=c_k)$$


In [1]:
# %load naive_bayes.py

import pandas as pd
import numpy as np
import cv2
import random
import time

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

# 二值化
def binaryzation(img):
    cv_img = img.astype(np.uint8)
    cv2.threshold(cv_img,50,1,cv2.THRESH_BINARY_INV,cv_img)
    return cv_img

def Train(trainset,train_labels):
    prior_probability = np.zeros(class_num)                         # 先验概率
    conditional_probability = np.zeros((class_num,feature_len,2))   # 条件概率

    # 计算先验概率及条件概率
    for i in range(len(train_labels)):
        img = binaryzation(trainset[i])     # 图片二值化
        label = train_labels[i]

        prior_probability[label] += 1

        for j in range(feature_len):
            conditional_probability[label][j][img[j]] += 1

    # 将概率归到[1.10001]
    for i in range(class_num):
        for j in range(feature_len):

            # 经过二值化后图像只有0，1两种取值
            pix_0 = conditional_probability[i][j][0]
            pix_1 = conditional_probability[i][j][1]

            # 计算0，1像素点对应的条件概率
            probalility_0 = (float(pix_0)/float(pix_0+pix_1))*1000000 + 1
            probalility_1 = (float(pix_1)/float(pix_0+pix_1))*1000000 + 1

            conditional_probability[i][j][0] = probalility_0
            conditional_probability[i][j][1] = probalility_1

    return prior_probability,conditional_probability

# 计算概率
def calculate_probability(img,label):
    probability = int(prior_probability[label])

    for i in range(len(img)):
        probability *= int(conditional_probability[label][i][img[i]])

    return probability

def Predict(testset,prior_probability,conditional_probability):
    predict = []

    for img in testset:

        # 图像二值化
        img = binaryzation(img)

        max_label = 0
        max_probability = calculate_probability(img,0)

        for j in range(1,10):
            probability = calculate_probability(img,j)

            if max_probability < probability:
                max_label = j
                max_probability = probability

        predict.append(max_label)

    return np.array(predict)


class_num = 10
feature_len = 784

if __name__ == '__main__':

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv',header=0)
    data = raw_data.values

    imgs = data[0::,1::]
    labels = data[::,0]

    # 选取 2/3 数据作为训练集， 1/3 数据作为测试集
    train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=23323)
    # print(train_features.shape
    # print(train_features.shape

    time_2 = time.time()
    print('read data cost ',time_2 - time_1,' second','\n')

    print('Start training')
    prior_probability,conditional_probability = Train(train_features,train_labels)
    time_3 = time.time()
    print('training cost ',time_3 - time_2,' second','\n')

    print('Start predicting')
    test_predict = Predict(test_features,prior_probability,conditional_probability)
    time_4 = time.time()
    print('predicting cost ',time_4 - time_3,' second','\n')

    score = accuracy_score(test_labels,test_predict)
    print("The accruacy socre is ", score)



Start read data
read data cost  4.435253858566284  second 

Start training
training cost  18.93708300590515  second 

Start predicting
predicting cost  123.64107203483582  second 

The accruacy socre is  0.833694083694
