In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time

In [2]:
def loadData(filePath):
    data = pd.read_csv(filePath)
    threshold = 32
    if 'label' in data.columns:
        X = np.array(data.iloc[:,1:])
        # 8位 256 -> 1位 2 减小复杂度
        X[X<=threshold], X[X>threshold] = 0, 1
        y = np.array(data.iloc[:,0])
        return X, y
    else:
        X = np.array(data.iloc[:,:])
        # 8位 256 -> 1位 2 减小复杂度
        X[X<=threshold], X[X>threshold] = 0, 1
        return X

In [3]:
class Naive_Bayes:
    def __init__(self):
        pass
    def fit(self, X, y):
        tot_count = X.shape[0]
        pixel_count = X.shape[1]
        count_y = [(y==i).sum() for i in range(10)]
        # 先验概率(拉普拉斯平滑)
        self.Py = np.array([(1 + (y==i).sum()) / (10 + tot_count) for i in range(10)])
        # 条件概率(拉普拉斯平滑) Pxy[i][j][k][0] / Pxy[i][j][k][1] = P(X^j=a_{jk} | y=c_i)
        Pxy = np.array([[[[1,2 + count_y[i]] for k in range(2)] for j in range(pixel_count)] for i in range(10)])
        for i, xi in enumerate(X):
            yi = y[i]
            for j in range(pixel_count):
                Pxy[yi][j][xi[j]][0] += 1
        self.Pxy = np.array([[[[Pxy[i][j][k][0] / Pxy[i][j][k][1]] for k in range(2)] for j in range(pixel_count)] for i in range(10)])
    def predict(self, X):
        # 预测时为了防止连乘下溢，采用log相加
        y_pred = np.zeros(X.shape[0], dtype = np.int32)
        print('Start predict.')
        for i, xi in enumerate(X):
            if (i + 1) % 1000 == 0:
                print('processed %d/%d' % (i+1, X.shape[0]))
            #  由于取了log所以初始值为0而不是1
            y_prob = [0 for _ in range(10)]
            for j in range(10):
                y_prob[j] += np.log(self.Py[j])
                for k in range(xi.shape[0]):
                    y_prob[j] += np.log(self.Pxy[j][k][xi[k]])
            # 取后验概率最大的为分类结果
            y_pred[i] = np.array(y_prob).argmax()
        print('Predict finish.')
        return y_pred

In [4]:
trainFilePath = '../mnist/train.csv'
X, y = loadData(trainFilePath)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state= 42, stratify = y)
testFilePath = '../mnist/test.csv'
X_test = loadData(testFilePath)

In [5]:
start = time.time()
model = Naive_Bayes()
model.fit(X_train, y_train)
end = time.time()
print('training time :', end - start)

training time : 17.239861249923706


In [6]:
start = time.time()
y_pred = model.predict(X_valid)
end = time.time()
print('predicting time :', end - start)

Start predict.
processed 1000/8400
processed 2000/8400
processed 3000/8400
processed 4000/8400
processed 5000/8400
processed 6000/8400
processed 7000/8400
processed 8000/8400
Predict finish.
predicting time : 144.98640298843384


In [7]:
acc = (y_pred==y_valid).sum() / y_valid.shape[0]
print('acc = %f' % (acc))

acc = 0.834762


In [8]:
start = time.time()
y_pred = model.predict(X_test)
end = time.time()
print('predicting time :', end - start)

Start predict.
processed 1000/28000
processed 2000/28000
processed 3000/28000
processed 4000/28000
processed 5000/28000
processed 6000/28000
processed 7000/28000
processed 8000/28000
processed 9000/28000
processed 10000/28000
processed 11000/28000
processed 12000/28000
processed 13000/28000
processed 14000/28000
processed 15000/28000
processed 16000/28000
processed 17000/28000
processed 18000/28000
processed 19000/28000
processed 20000/28000
processed 21000/28000
processed 22000/28000
processed 23000/28000
processed 24000/28000
processed 25000/28000
processed 26000/28000
processed 27000/28000
processed 28000/28000
Predict finish.
predicting time : 497.43361926078796


In [9]:
ans = pd.DataFrame({'ImageId': list(range(1, y_pred.shape[0] + 1)), 'Label' : y_pred})
ans.to_csv('./result.csv', index = False)
# threshold = 64  kaggle准确率 83.539%
# threshold = 32  kaggle准确率 83.392%
# https://www.kaggle.com/competitions/digit-recognizer/overview