In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time

In [2]:
def loadData(filePath):
    data = pd.read_csv(filePath)
    threshold = 64
    if 'label' in data.columns:
        X = np.array(data.iloc[:,1:])
        # 8位 256 -> 1位 2 减小复杂度
        X[X<=threshold], X[X>threshold] = 0, 1
        y = np.array(data.iloc[:,0])
        return X, y
    else:
        X = np.array(data.iloc[:,:])
        # 8位 256 -> 1位 2 减小复杂度
        X[X<=threshold], X[X>threshold] = 0, 1
        return X

In [3]:
class Naive_Bayes:
    def __init__(self):
        pass
    def fit(self, X, y):
        tot_count = X.shape[0]
        pixel_count = X.shape[1]
        count_y = [(y==i).sum() for i in range(10)]
        # 先验概率(拉普拉斯平滑)
        self.Py = np.array([(1 + (y==i).sum()) / (10 + tot_count) for i in range(10)])
        # 条件概率(拉普拉斯平滑) Pxy[i][j][k][0] / Pxy[i][j][k][1] = P(X^j=a_{jk} | y=c_i)
        Pxy = np.array([[[[1,2 + count_y[i]] for k in range(2)] for j in range(pixel_count)] for i in range(10)])
        for i, xi in enumerate(X):
            yi = y[i]
            for j in range(pixel_count):
                Pxy[yi][j][xi[j]][0] += 1
        self.Pxy = np.array([[[[Pxy[i][j][k][0] / Pxy[i][j][k][1]] for k in range(2)] for j in range(pixel_count)] for i in range(10)])
    def predict(self, X):
        # 预测时为了防止连乘下溢，采用log相加
        y_pred = np.zeros(X.shape[0], dtype = np.int32)
        print('Start predict.')
        for i, xi in enumerate(X):
            if (i + 1) % 1000 == 0:
                print('processed %d/%d' % (i+1, X.shape[0]))
            #  由于取了log所以初始值为0而不是1
            y_prob = [0 for _ in range(10)]
            for j in range(10):
                y_prob[j] += np.log(self.Py[j])
                for k in range(xi.shape[0]):
                    y_prob[j] += np.log(self.Pxy[j][k][xi[k]])
            # 取后验概率最大的为分类结果
            y_pred[i] = np.array(y_prob).argmax()
        print('Predict finish.')
        return y_pred

In [4]:
trainFilePath = '../mnist/train.csv'
X, y = loadData(trainFilePath)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state= 42, stratify = y)
testFilePath = '../mnist/test.csv'
X_test = loadData(testFilePath)

In [5]:
start = time.time()
model = Naive_Bayes()
model.fit(X_train, y_train)
end = time.time()
print('training time :', end - start)

training time : 17.140862464904785


In [6]:
start = time.time()
y_pred = model.predict(X_valid)
end = time.time()
print('predicting time :', end - start)

Start predict.
processed 200/8400
processed 400/8400
processed 600/8400
processed 800/8400
processed 1000/8400
processed 1200/8400
processed 1400/8400
processed 1600/8400
processed 1800/8400
processed 2000/8400
processed 2200/8400
processed 2400/8400
processed 2600/8400
processed 2800/8400
processed 3000/8400
processed 3200/8400
processed 3400/8400
processed 3600/8400
processed 3800/8400
processed 4000/8400
processed 4200/8400
processed 4400/8400
processed 4600/8400
processed 4800/8400
processed 5000/8400
processed 5200/8400
processed 5400/8400
processed 5600/8400
processed 5800/8400
processed 6000/8400
processed 6200/8400
processed 6400/8400
processed 6600/8400
processed 6800/8400
processed 7000/8400
processed 7200/8400
processed 7400/8400
processed 7600/8400
processed 7800/8400
processed 8000/8400
processed 8200/8400
processed 8400/8400
Predict finish.
predicting time : 141.54728412628174


In [7]:
acc = (y_pred==y_valid).sum() / y_valid.shape[0]
print('acc = %f' % (acc))

acc = 0.834167


In [9]:
start = time.time()
y_pred = model.predict(X_test)
end = time.time()
print('predicting time :', end - start)

Start predict.
processed 200/28000
processed 400/28000
processed 600/28000
processed 800/28000
processed 1000/28000
processed 1200/28000
processed 1400/28000
processed 1600/28000
processed 1800/28000
processed 2000/28000
processed 2200/28000
processed 2400/28000
processed 2600/28000
processed 2800/28000
processed 3000/28000
processed 3200/28000
processed 3400/28000
processed 3600/28000
processed 3800/28000
processed 4000/28000
processed 4200/28000
processed 4400/28000
processed 4600/28000
processed 4800/28000
processed 5000/28000
processed 5200/28000
processed 5400/28000
processed 5600/28000
processed 5800/28000
processed 6000/28000
processed 6200/28000
processed 6400/28000
processed 6600/28000
processed 6800/28000
processed 7000/28000
processed 7200/28000
processed 7400/28000
processed 7600/28000
processed 7800/28000
processed 8000/28000
processed 8200/28000
processed 8400/28000
processed 8600/28000
processed 8800/28000
processed 9000/28000
processed 9200/28000
processed 9400/28000
pr

In [10]:
ans = pd.DataFrame({'ImageId': list(range(1, y_pred.shape[0] + 1)), 'Label' : y_pred})
ans.to_csv('./result.csv', index = False)
# kaggle准确率 83.539%
# https://www.kaggle.com/competitions/digit-recognizer/overview