# 逻辑回归手写数字分类
## 数据读取
数据集从 Kaggle 下载。训练集共有 42000 行，每行代表一张手写数字的灰度图：第一列是该图对应的数字标签（0-9），其余各列是像素值。每张灰度图都是 28x28 像素，也就是一张图一共有 784 个像素点，每个像素点的取值范围是 0-255。

In [20]:
import numpy as np

# Read the Kaggle training CSV into a float array, skipping the header row.
data = np.loadtxt("digit-recognizer/train.csv", delimiter=",", skiprows=1)

# Column 0 is used as the label vector; every remaining column is a pixel
# feature.
y, X = data[:, 0], data[:, 1:]

# Number of training samples.
m = y.shape[0]

# Report the feature-matrix shape and the label-vector shape, one per line.
print(X.shape, y.shape, sep="\n")

(42000, 784)
(42000,)


## 构建模型
使用逻辑回归的思路来解决多分类问题。这里使用 one-vs-all（一对多）的策略：为 0-9 的每个数字分别训练一个二分类器，预测时取输出值最大的那个类别。

In [21]:
# Batch gradient descent for one-vs-all logistic regression. The update rule
# has the same form as for linear regression; only the way the prediction is
# computed changes (a sigmoid of the linear score).


def _sigmoid(z):
    """Return the element-wise logistic function 1 / (1 + exp(-z))."""
    return 1.0 / (1.0 + np.exp(-1.0 * z))


# Append a constant column of ones so the bias b is learned as the last weight.
# The matrix is rebuilt from `data` (not from `X`) so that re-running this
# cell does not keep appending extra bias columns.
X = np.column_stack((data[:, 1:], np.ones((m, 1))))

# Weights w = (w; b): one column per digit class, one row per feature + bias.
w = np.zeros((X.shape[1], 10))

# Learning rate.
alpha = 0.00001

# Number of gradient-descent steps per classifier.
num_iters = 10000

# One-hot targets: temp_y[j, k] is 1.0 when sample j has label k, else 0.0.
temp_y = (y[:, np.newaxis] == np.arange(10)).astype(float)

# Train the ten binary classifiers independently, one digit at a time.
for i in range(10):
    print("now run ", i, "rows")

    target = temp_y[:, i]

    for step in range(num_iters):
        # Evaluate the hypothesis once per step and reuse it for the update.
        h = _sigmoid(np.dot(X, w[:, i]))
        w[:, i] = w[:, i] - alpha / m * np.dot(np.transpose(X), h - target)

    # Cross-entropy cost, computed from the hypothesis of the last step, i.e.
    # at the weights in effect just before the final update was applied.
    J = (-1.0) / m * (np.dot(target, np.log(h)) + np.dot(1 - target, np.log(1.0 - h)))

    print(J)

now run  0 rows
0.023736086806356492
now run  1 rows
0.022142318723784905
now run  2 rows
0.06744320593852289
now run  3 rows
0.08629708699389507
now run  4 rows
0.046429912092596205
now run  5 rows
0.08228636987124428
now run  6 rows
0.038354527112903235
now run  7 rows
0.044815802724220144
now run  8 rows
0.14650500057967106
now run  9 rows
0.10540511127190764


In [22]:
# Predict a digit for every row of the test set with the trained weights `w`.
import pandas as pd

test_data = np.loadtxt("digit-recognizer/test.csv", delimiter=",", skiprows=1)
test_m = test_data.shape[0]

# Append a constant column of ones so the last row of `w` acts as the bias.
test_data = np.column_stack((test_data, np.ones((test_m, 1))))

# Linear scores of all ten classifiers at once; column k belongs to digit k.
scores = np.dot(test_data, w)

# Per-class sigmoid outputs, one column per digit.
temp_predict = 1.0 / (1.0 + np.exp(-1 * scores))

# Pick the class with the largest raw score. The sigmoid is monotonic, so this
# is the class with the largest sigmoid output, but it stays well-defined when
# several outputs saturate to exactly 0.0 or 1.0 in floating point (argmax
# would then always fall back to the lowest tied class index).
predict = np.argmax(scores, axis=1)

# NOTE(review): this writes one bare label per line, without a header or an
# id column, over the file named sample_submission.csv. Confirm this is the
# format the submission target expects.
np.savetxt("digit-recognizer/sample_submission.csv", predict, fmt="%d", delimiter=",")