In [2]:
import random
import numpy as np

In [6]:
class Network(object):
    """Feed-forward neural network with sigmoid units, trained by mini-batch SGD.

    Parameters are kept in two parallel lists, ``biases`` and ``weights``,
    holding one array per non-input layer.  The module-level functions
    ``sigmoid`` and ``sigmoid_prime`` serve as the activation function and
    its derivative.
    """

    def __init__(self, sizes):
        """Build a network whose parameters are drawn from ``np.random.randn``.

        sizes: sequence of layer widths, input layer first.  For each pair of
            consecutive widths ``(x, y)`` a bias array of shape ``(y, 1)`` and
            a weight array of shape ``(y, x)`` is created.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output of the network for the input ``a``.

        ``a`` is presumably a column vector of shape ``(sizes[0], 1)`` so that
        it lines up with the first weight matrix -- confirm against callers.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train the network with stochastic gradient descent.

        training_data: iterable of ``(x, y)`` training pairs (a list, or a
            one-shot iterator such as a ``zip`` object).
        epochs: number of full passes over the training data.
        mini_batch_size: number of samples per gradient step.
        eta: learning rate.
        test_data: optional iterable of ``(x, label)`` pairs; when given and
            non-empty, the number of correctly classified samples is printed
            after every epoch, otherwise only a completion message is printed.
        """
        # Work on private lists: this accepts iterators (which have no len()
        # and could only be consumed once), and keeps the shuffle below from
        # reordering the caller's own list.
        training_data = list(training_data)
        if test_data is not None:
            test_data = list(test_data)
        n = len(training_data)
        n_test = len(test_data) if test_data else 0

        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k + mini_batch_size]
                for k in range(0, n, mini_batch_size)
            ]

            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)

            if test_data:
                print("Epoch %d :%d / %d\n" % (j, self.evaluate(test_data), n_test))
            else:
                print("Epoch %d complete\n" % (j))

    def update_mini_batch(self, mini_batch, eta):
        """Apply one gradient-descent step using the samples in ``mini_batch``.

        mini_batch: sized collection of ``(x, y)`` pairs.
        eta: learning rate; the summed gradient is scaled by
            ``eta / len(mini_batch)``.

        An empty mini-batch leaves the parameters untouched.
        """
        batch_size = len(mini_batch)
        if batch_size == 0:
            # Nothing to average over; also avoids dividing by zero below.
            return

        # Running sums of the per-sample gradients over the batch.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            # To sanity-check backprop, compare its result with
            # self.test_derivative(x, y) here.
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

        step = eta / batch_size
        self.weights = [w - step * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return the cost gradient for a single sample as ``(nabla_b, nabla_w)``.

        ``nabla_b`` and ``nabla_w`` are lists of arrays with the same shapes
        as ``self.biases`` and ``self.weights``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Forward pass, remembering every weighted input z and activation.
        activation = x
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Error of the output layer.
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        # Walk backwards; l counts layers from the end (l=2 is the
        # second-to-last layer), which is why negative indices are used.
        for l in range(2, self.num_layers):
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sigmoid_prime(zs[-l])
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
        return (nabla_b, nabla_w)

    def test_derivative(self, x, y, theta=0.01):
        """Estimate the single-sample gradient by central finite differences.

        Every weight and bias is perturbed by ``+theta`` and ``-theta`` in
        turn and the change in ``cost_value`` is divided by ``2 * theta``.
        Intended as a (slow) cross-check for ``backprop``.

        theta: perturbation size; defaults to 0.01.

        Returns ``(nabla_b, nabla_w)`` in the same layout as ``backprop``.
        """
        nabla_w = self._numeric_gradient(self.weights, x, y, theta)
        nabla_b = self._numeric_gradient(self.biases, x, y, theta)
        return (nabla_b, nabla_w)

    def _numeric_gradient(self, params, x, y, theta):
        """Central-difference gradient of the cost w.r.t. each entry of ``params``.

        ``params`` must be ``self.weights`` or ``self.biases``: entries are
        perturbed in place so that ``feedforward`` sees the change.
        """
        grads = [np.zeros(p.shape) for p in params]
        for layer, p in enumerate(params):
            for idx in np.ndindex(*p.shape):
                original = p[idx]
                try:
                    p[idx] = original + theta
                    cost_plus = self.cost_value(self.feedforward(x), y)
                    p[idx] = original - theta
                    cost_minus = self.cost_value(self.feedforward(x), y)
                finally:
                    # Always put the parameter back, even if the forward
                    # pass raised, so the network is never left perturbed.
                    p[idx] = original
                # cost_value yields a 1x1 array for column vectors; take the
                # plain scalar out explicitly instead of relying on NumPy's
                # deprecated implicit array-to-scalar conversion.
                diff = np.asarray(cost_plus - cost_minus).item()
                grads[layer][idx] = diff / (2.0 * theta)
        return grads

    def cost_value(self, a, y):
        """Return the quadratic cost ``0.5 * (a - y)^T (a - y)``.

        For column-vector inputs the result is a 1x1 array.
        """
        return (np.dot((a - y).transpose(), (a - y)) * 0.5)

    def evaluate(self, test_data):
        """Return how many samples in ``test_data`` are classified correctly.

        test_data: iterable of ``(x, label)`` pairs; the prediction for ``x``
            is the index of the largest output activation.
        """
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output, y):
        """Return the derivative of the quadratic cost w.r.t. the output activations."""
        return (output - y)
    
    

In [5]:
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))``, element-wise.

    z: a number or array-like.  Arrays give an array of the same shape;
       a scalar gives a scalar.

    The exponential is only ever taken of a non-positive value, so large
    negative inputs do not overflow (the direct formula evaluates
    ``exp(-z)``, which overflows and emits a RuntimeWarning there).
    """
    z = np.asarray(z)
    # exp(-|z|) lies in [0, 1] for every finite z.
    e = np.exp(-np.abs(z))
    # For z >= 0: 1 / (1 + exp(-z)).  For z < 0 the algebraically equal
    # form exp(z) / (1 + exp(z)) is used.
    result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of higher rank unchanged.
    return result[()]
def sigmoid_prime(z):
    """Return the derivative of the sigmoid function evaluated at ``z``.

    Uses the identity ``sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))``.
    """
    # Evaluate the sigmoid once and reuse it rather than computing it twice.
    s = sigmoid(z)
    return s * (1 - s)