In [2]:
import numpy as np
import math
from pathlib import Path
import struct
import matplotlib.pyplot as plt

In [2]:
# 使用numpy自带的tanh函数和softmax函数
def tanh(x):
    return np.tanh(x)

def soft_max(x):
    exp = np.exp(x - x.max())
    return exp / exp.sum()

In [3]:
dimensions = [28*28, 10]
activations = [tanh, soft_max]
# 第一层和第二层的分布，w的分布参考浙大胡老师的课程
distribution = [
    {'b': [0, 0]},
    {'b': [0, 0], 'w': [-math.sqrt(6/(dimensions[0]+dimensions[1])), math.sqrt(6/(dimensions[0]+dimensions[1]))]}
]

In [4]:
def init_parameters_b(layer):
    dist = distribution[layer]['b']
    # 给出dist[0]到dist[1]之间的随机数
    return np.random.rand(dimensions[layer]) * (dist[1] -dist[0]) + dist[0]

def init_parameters_w(layer):
    dist = distribution[layer]['w']
    return np.random.rand(dimensions[layer-1], dimensions[layer]) * (dist[1] -dist[0]) + dist[0]

def init_parameters():
    parameter = []
    for i in range(len(distribution)):
        layer_parameter = {}
        for j in distribution[i].keys():
            if j == 'b':
                layer_parameter['b'] = init_parameters_b(i)
                continue
            if j == 'w':
                layer_parameter['w'] = init_parameters_w(i)
                continue
        parameter.append(layer_parameter)
    return parameter

In [5]:
parameters = init_parameters()

In [6]:
def predict(img, parameters):
    layer0_in = img+parameters[0]['b']
    layer0_out = activations[0](layer0_in)

    layer1_in = np.dot(layer0_out, parameters[1]['w']) + parameters[1]['b']
    layer1_out = activations[1](layer1_in)
    return layer1_out

In [7]:
predict(np.random.rand(28*28), parameters).argmax()

8

In [3]:
dataset_path = Path('./../mnist')
train_img_path = dataset_path/'train-images.idx3-ubyte'
train_lab_path = dataset_path/'train-labels.idx1-ubyte'
test_img_path = dataset_path/'t10k-images.idx3-ubyte'
test_lab_path = dataset_path/'t10k-labels.idx1-ubyte'

In [4]:
# 以二进制文件读取
train_f = open(train_img_path, 'rb')

In [5]:
trian_num = 50000
valid_num = 10000
test_num = 10000

with open(train_img_path, 'rb') as f:
    struct.unpack('>4i', f.read(16))
    temp_img = np.fromfile(f, dtype=np.uint8).reshape(-1, 28*28)
    train_img = temp_img[:trian_num]
    valid_img = temp_img[trian_num:]

with open(test_img_path, 'rb') as f:
    struct.unpack('>4i', f.read(16))
    test_img = np.fromfile(f, dtype=np.uint8).reshape(-1, 28*28)


with open(train_lab_path, 'rb') as f:
    struct.unpack('>2i', f.read(8))
    temp_lab = np.fromfile(f, dtype=np.uint8)
    train_lab = temp_lab[:trian_num]
    valid_lab = temp_lab[trian_num:]

with open(test_lab_path, 'rb') as f:
    struct.unpack('>2i', f.read(8))
    test_lab = np.fromfile(f, dtype=np.uint8)

In [11]:
def show_train(index):
    print('label : {}'.format(train_lab[index]), end='\n ')
    plt.imshow(train_img[index].reshape(28, 28), cmap='gray')

def show_valid(index):
    print('label : {}'.format(valid_lab[index]), end='\n ')
    plt.imshow(valid_img[index].reshape(28, 28), cmap='gray')

def show_test(index):
    print('label : {}'.format(test_lab[index]), end='\n ')
    plt.imshow(test_img[index].reshape(28, 28), cmap='gray')

In [12]:
def d_soft_max(data):
    sm=soft_max(data)
    return np.diag(sm) - np.outer(sm, sm)

# def d_tanh(data):
#     return np.diag(1/(np.cosh(data))**2)

def d_tanh(data):
    return 1/(np.cosh(data))**2

In [13]:
differential = {soft_max: d_soft_max, tanh: d_tanh}

In [14]:
d_tanh([1, 2, 3, 4])
d_soft_max(np.array([1, 2, 3, 4]))

array([[ 0.03103085, -0.00279373, -0.00759413, -0.02064299],
       [-0.00279373,  0.07955019, -0.02064299, -0.05611347],
       [-0.00759413, -0.02064299,  0.18076935, -0.15253222],
       [-0.02064299, -0.05611347, -0.15253222,  0.22928869]])

In [15]:
h = 0.0000001
func = soft_max
input_len = 4

for i in range(input_len):
    test_input = np.random.rand(input_len)
    derivative = differential[func](test_input)
    value1 = func(test_input)
    test_input[i]+= h
    value2 = func(test_input)
    # print((value2 - value1)/h)
    # print(derivative[i])
    print(derivative[i] - (value2 - value1) / h)

[-4.87366553e-09  1.70216383e-09  1.35511823e-09  7.06160429e-10]
[ 1.44830090e-09 -4.89978003e-09  3.38688366e-10  1.16990049e-09]
[ 5.64259472e-10  1.09531345e-09 -3.58784968e-09  1.37316528e-09]
[ 1.37166211e-09  1.70217766e-09  1.30796536e-09 -4.65936087e-09]


In [16]:
h = 0.000001
func = tanh
input_len = 4

for i in range(input_len):
    test_input = np.random.rand(input_len)
    derivative = differential[func](test_input)
    value1 = func(test_input)
    test_input[i]+= h
    value2 = func(test_input)
    # print((value2 - value1)/h)
    # print(derivative[i])
    print(derivative[i] - (value2 - value1) / h)

[3.50129558e-07 8.15759640e-01 8.15759640e-01 8.15759640e-01]
[8.50046510e-01 3.29240547e-07 8.50046510e-01 8.50046510e-01]
[9.20867646e-01 9.20867646e-01 2.59112743e-07 9.20867646e-01]
[8.50246227e-01 8.50246227e-01 8.50246227e-01 3.29010209e-07]


In [17]:
# 单位阵
onehot = np.identity(dimensions[-1])

def sqr_loss(img, lab, parameters):
    y_pred = predict(img, parameters)
    y = onehot[lab]
    diff = y - y_pred
    return np.dot(diff, diff)

In [18]:
sqr_loss(train_img[0], train_lab[0], parameters)

1.0763839470062817

In [19]:
def grad_parameters(img, lab, parameters):
    layer0_in = img+parameters[0]['b']
    layer0_out = activations[0](layer0_in)

    layer1_in = np.dot(layer0_out, parameters[1]['w']) + parameters[1]['b']
    layer1_out = activations[1](layer1_in)
    
    diff = onehot[lab] - layer1_out
    act1 = np.dot(differential[activations[1]](layer1_in), diff)

    grad_b1 = -2 * act1
    grad_w1 = -2 * np.outer(layer0_out, act1)
    grad_b0 = -2 * differential[activations[0]](layer0_in) * np.dot(parameters[1]['w'], act1)

    return {'w1' : grad_w1, 'b1': grad_b1, 'b0': grad_b0}

In [20]:
grad_parameters(train_img[2], train_lab[2], init_parameters())

{'w1': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'b1': array([ 2.85451828e-03,  5.63163366e-03,  8.47206297e-06,  3.89114666e-02,
        -1.42770366e-01, -1.01433101e-03,  4.04320359e-02,  6.24767044e-03,
         3.17740885e-02,  1.79248118e-02]),
 'b0': array([ 9.10810212e-003, -7.35517704e-003,  1.20520036e-002,
         9.36193659e-003,  8.38669288e-003,  1.27927298e-003,
        -1.68559260e-004,  1.47535661e-003,  5.44590440e-004,
        -9.15104127e-003,  1.01025867e-002, -5.37917008e-003,
         1.18705116e-003, -2.18911404e-004,  8.63069876e-003,
         1.13569947e-002,  3.63299983e-004, -1.22092445e-002,
         1.27313198e-002, -5.91883432e-003, -5.72286023e-003,
        -9.45568986e-003,  1.51510854e-004,  1.03606468e-002,
         5.20801403e-004, -2.77877387e-00

In [21]:
# b1
h = 0.001
for i in range(10):
    img_i = np.random.randint(trian_num)
    test_parameters = init_parameters()
    derivative = grad_parameters(train_img[img_i], train_lab[img_i], test_parameters)['b1']
    value1 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
    test_parameters[1]['b'][i]+= h
    value2 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
    print(derivative[i] - (value2 - value1) / h)

-6.542451023741624e-05
5.6187045760380805e-05
-2.0339701125818288e-05
-1.1198830974029628e-05
-4.1904865633199395e-05
-1.3602850310785036e-05
-1.2816827052154756e-05
-6.7069330874818325e-06
-2.4191521874723787e-05
-5.106199042380355e-05


In [22]:
# w1
grad_list = []
h = 0.00001
for i in range(28*28):
    for j in range(10):
        img_i = np.random.randint(trian_num)
        test_parameters = init_parameters()
        derivative = grad_parameters(train_img[img_i], train_lab[img_i], test_parameters)['w1']
        value1 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
        test_parameters[1]['w'][i][j] += h
        value2 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
        grad_list.append(derivative[i][j] - (value2 - value1) / h)
np.abs(grad_list).max()

9.067388544725663e-07

In [23]:
# b0
grad_list = []
h = 0.00001
for i in range(28*28):
    img_i = np.random.randint(trian_num)
    test_parameters = init_parameters()
    derivative = grad_parameters(train_img[img_i], train_lab[img_i], test_parameters)['b0']
    value1 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
    test_parameters[0]['b'][i]+= h
    value2 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
    grad_list.append(derivative[i] - (value2 - value1) / h)
np.abs(grad_list).max()

1.3017149954296092e-08

In [24]:
# 精确度相关

def valid_loss(parameters):
    loss_accu = 0
    for img_i in range(valid_num):
        loss_accu += sqr_loss(valid_img[img_i], valid_lab[img_i], parameters)
    return loss_accu

def valid_accuracy(parameters):
    correct = [predict(valid_img[img_i], parameters).argmax() == valid_lab[img_i] for img_i in range(valid_num)]
    print('validation accuracy: {}'.format((correct.count(True)/len(correct)*100)), "%")

In [25]:
valid_loss(parameters)
valid_accuracy(parameters)

validation accuracy: 7.4399999999999995 %


In [26]:
# 100张图片的梯度平均值

batch_size = 100

def train_batch(currect_batch, parameters):
    grad_accu = grad_parameters(train_img[currect_batch*batch_size+0], train_lab[currect_batch*batch_size+0], test_parameters)
    for img_i in range(1, batch_size):
        grad_tmp = grad_parameters(train_img[currect_batch*batch_size+img_i], train_lab[currect_batch*batch_size+img_i], test_parameters)
        for key in grad_accu.keys():
            grad_accu[key] += grad_tmp[key]
    for key in grad_accu.keys():
        grad_accu[key] /= batch_size
    return grad_accu

import copy

def combine_parameters(parameters, grad, learn_rate):
    parameters_tmp = copy.deepcopy(parameters)
    parameters_tmp[0]['b'] -= learn_rate * grad['b0']
    parameters_tmp[1]['b'] -= learn_rate * grad['b1']
    parameters_tmp[1]['w'] -= learn_rate * grad['w1']
    return parameters_tmp

In [27]:
train_batch(0, parameters)

combine_parameters(parameters, train_batch(0, parameters), 1)

[{'b': array([-1.99318648e-03, -2.91135261e-03,  1.97094586e-04, -1.89776964e-03,
          3.89504426e-04, -1.65481944e-03, -2.12717188e-03,  1.01193842e-03,
         -1.31585455e-03,  5.74568002e-04, -1.93757586e-03, -5.31654312e-04,
          3.53767839e-03, -1.57614570e-04, -4.22137937e-03,  4.51447429e-03,
          4.79220495e-04,  1.79265761e-03,  2.68251432e-03, -6.31164029e-04,
         -8.20796943e-04, -2.47500806e-03,  3.62083541e-03, -3.42103842e-04,
          3.20702518e-03,  1.52163568e-03, -2.70001228e-03, -3.83205024e-03,
         -2.60459895e-03,  2.99791675e-03,  2.62556258e-03,  3.16773581e-03,
          2.90623759e-03,  3.98558645e-03, -3.73430071e-03, -2.84215809e-03,
          2.10717316e-03,  3.87085526e-03, -1.86337309e-03,  2.78907675e-03,
          1.90770021e-03,  1.82975558e-03, -1.29266742e-03,  3.90865283e-03,
          6.45706929e-04, -1.60497369e-04,  2.00986993e-03,  1.25713111e-03,
          6.44888493e-04,  2.81004331e-04, -4.72026790e-04, -1.21188712

In [28]:
### 训练过程

parameters = init_parameters()

for i in range(trian_num // batch_size):
    if i % 100 == 99:
        print("runing batch {}/{}".format(i+1, trian_num // batch_size))
    grad_tmp = train_batch(i, parameters)
    parameters = combine_parameters(parameters, grad_tmp, 0.01)
valid_accuracy(parameters)

runing batch 100/500
runing batch 200/500
runing batch 300/500
runing batch 400/500
runing batch 500/500
validation accuracy: 15.1 %


In [29]:
# 寻找较优learn_rate

def train_for_better():
    param = 0.1
    for j in range(300):
        parameters = init_parameters()
        for i in range(trian_num // batch_size):
            if i % 100 == 99: 
                print("runing batch {}/{}".format(i+1, trian_num // batch_size))
            grad_tmp = train_batch(i, parameters) 
            temp = param + 0.1
            parameters = combine_parameters(parameters, grad_tmp, temp)
        valid_accuracy(parameters)

In [178]:
train_for_better()

500
runing batch 500/500
validation accuracy: 44.17 %
runing batch 100/500
runing batch 200/500
runing batch 300/500
runing batch 400/500
runing batch 500/500
validation accuracy: 43.53 %
runing batch 100/500
runing batch 200/500
runing batch 300/500
runing batch 400/500
runing batch 500/500
validation accuracy: 43.89 %
runing batch 100/500
runing batch 200/500
runing batch 300/500
runing batch 400/500
runing batch 500/500
validation accuracy: 44.07 %
runing batch 100/500
runing batch 200/500
runing batch 300/500
runing batch 400/500
runing batch 500/500
validation accuracy: 43.79 %
runing batch 100/500
runing batch 200/500
runing batch 300/500
runing batch 400/500
runing batch 500/500
validation accuracy: 42.44 %
runing batch 100/500
runing batch 200/500
runing batch 300/500
runing batch 400/500
runing batch 500/500
validation accuracy: 43.34 %
runing batch 100/500
runing batch 200/500
runing batch 300/500
runing batch 400/500
runing batch 500/500
validation accuracy: 43.9199999999999