# 逻辑回归手写数字分类
## 数据读取
数据集从 Kaggle 下载。训练集共有 42000 行，每行对应一张手写数字的灰度图：第一列是该图的标签（数字 0–9），其余 784 列是像素值。每张灰度图为 28x28 像素，即一张图共有 784 个像素点，每个像素点的取值范围是 0–255。

In [43]:
import numpy as np

# Read the training CSV into a single float array, skipping the header row.
data = np.loadtxt("digit-recognizer/train.csv", delimiter=",", skiprows=1)

# Column 0 is split off as the label vector; every remaining column is kept
# as the feature matrix.
y, X = data[:, 0], data[:, 1:]

# Number of training examples.
m = y.shape[0]

# Report the feature-matrix and label-vector shapes, one per line.
print(X.shape, y.shape, sep="\n")

(42000, 784)
(42000,)


## 构建模型
使用逻辑回归的思路来解决分类问题。这里采用 One-vs-All（一对多）策略：为 0–9 每个数字分别训练一个二分类器，预测时取 10 个分类器中输出概率最大的类别。

In [44]:
# Batch gradient descent for one-vs-all logistic regression. The update rule
# has the same form as for linear regression; only the hypothesis differs
# (a sigmoid of the linear score instead of the raw score).

# Append a constant column of ones so the bias b is learned as the last
# entry of each weight vector, i.e. w = (w; b).
# NOTE: this rebinds X, so re-running the cell appends another column.
X = np.column_stack((X, np.ones((m, 1))))

# One weight column per digit class, sized from the augmented X so it always
# matches the feature count (including the bias column).
w = np.zeros((X.shape[1], 10))

# Learning rate.
alpha = 0.00001

# One-hot targets: temp_y[j, i] is 1 when example j has label i, else 0.
temp_y = (y[:, np.newaxis] == np.arange(10)).astype(float)

# Training stops for a class once the cost changes by no more than this
# amount between two consecutive iterations.
tolerance = 0.000001

# Train the 10 binary classifiers independently, one per digit.
for i in range(10):
    target = temp_y[:, i]
    temp = 0
    J = 1
    while abs(J - temp) > tolerance:
        temp = J
        # Linear score, computed once per iteration and shared by the cost
        # and the gradient (both use the weights from before the update).
        z = np.dot(X, w[:, i])
        # Numerically stable log-likelihood terms:
        #   log(sigmoid(z))     = -logaddexp(0, -z)
        #   log(1 - sigmoid(z)) = -logaddexp(0,  z)
        # Taking log() of a saturated sigmoid instead yields -inf, and
        # 0 * -inf = nan; a nan cost makes the convergence test false and
        # would end training silently.
        log_h = -np.logaddexp(0, -z)
        log_one_minus_h = -np.logaddexp(0, z)
        # Mean cross-entropy cost for class i.
        J = (-1.0) / m * (np.dot(target, log_h) + np.dot(1 - target, log_one_minus_h))
        # Sigmoid recovered from its log, which avoids overflow in exp(-z).
        h = np.exp(log_h)
        # Gradient step on the weights of class i.
        w[:, i] = w[:, i] - alpha / m * np.dot(X.T, h - target)


0.6931471805599463
0.8796608399317722
0.45647070100040654
0.14027632750149271
0.1152268608619793
0.10701136699276954
0.1016289865330499
0.09747044217778943
0.09392524656906065
0.09082819376124962
0.08808956378239859
0.08564534680605944
0.0834466276364876
0.08145519965210102
0.07964071341973224
0.07797871084130362
0.07644922830352552
0.07503578475563721
0.07372463532440188
0.07250421188538
0.07136469764250526
0.07029769933745954
0.06929599164447778
0.06835331566241813
0.06746421845283739
0.06662392407771252
0.0658282290657343
0.06507341700831505
0.06435618827146365
0.06367360175244412
0.06302302630941503
0.062402100016251405
0.06180869579118147
0.061240892250484326
0.060696948871444195
0.06017528472950352
0.059674460215851405
0.05919316125291408
0.05873018561336687
0.0582844310185871
0.057854884748870884
0.057440614543244756
0.05704076060362117
0.05665452854814898
0.05628118318327548
0.05592004298434244
0.05557047519132734
0.05523189144028508
0.05490374386266945
0.05458552159444504
0.05

In [61]:
# Predict a digit label for every row of the test CSV using the trained
# one-vs-all weights w, then write the labels to disk.
import pandas as pd
test_data = np.loadtxt("digit-recognizer/test.csv", delimiter=",", skiprows=1)
test_m = test_data.shape[0]
# Append the constant bias column, matching what the training cell did to X.
test_data = np.column_stack((test_data, np.ones((test_m, 1))))

# Linear score of every test row under each of the 10 classifiers, computed
# as a single matrix product instead of one product per class.
scores = np.dot(test_data, w)

# Per-class sigmoid outputs, computed via logaddexp so that large negative
# scores do not overflow inside exp().
temp_predict = np.exp(-np.logaddexp(0, -scores))

# The sigmoid is monotonic, so the arg-max of the raw scores selects the same
# class as the arg-max of the probabilities, but without the artificial ties
# that appear when several probabilities round to exactly 1.0 (or 0.0).
predict = np.argmax(scores, axis=1)

# One integer label per line, no header row.
# NOTE(review): this overwrites the sample submission file and writes neither
# a header nor an id column; confirm this is the format the competition expects.
np.savetxt("digit-recognizer/sample_submission.csv", predict, fmt="%d", delimiter=",")