In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
#load
df = np.load('mnist.npz') 
#split
x_test  = df['x_test']
y_test  = df['y_test']
x_train = df['x_train']
y_train = df['y_train']
x_train = x_train/255
x_test  = x_test/255
x_train = x_train.reshape(-1,784)
x_test  = x_test.reshape(-1,784)
bias_x_train  = -np.ones((60000,1))
x_train = np.concatenate((x_train, bias_x_train), axis = 1)
bias_x_test  = -np.ones((10000,1))
x_test  = np.concatenate((x_test, bias_x_test), axis = 1)
N = 300

In [3]:
X = x_train[:20000]
y = y_train[:20000]

In [4]:
class layers:
    def forward(self, X, w_1, w_2):
        bias_h  = -np.ones((1,))
        v_1 = np.matmul(w_1, X) # data*weights+bias
        v_1_concat = np.concatenate((v_1, bias_h), axis = 0)
        h_1 = self.activation(v_1_concat)
        v_2 = np.matmul(w_2, h_1)
        return v_1_concat, h_1, v_2, v_1
    
    def forward_2(self, X, w_1, w_2):
        bias_h  = -np.ones((1,))
        v_1 = np.matmul(w_1, X) # data*weights+bias
        v_1_concat = np.concatenate((v_1, bias_h), axis = 0)
        h_1 = self.activation_2(v_1_concat)
        v_2 = np.matmul(w_2, h_1)
        return v_1_concat, h_1, v_2, v_1

    def activation(self, x):   # used step here
        y = (np.exp(2*x) - 1)/(np.exp(2*x) + 1)
        return y
    
    def activation_2(self, x):   # used step here
        y = np.maximum(0, x)
        return y
    
    def activation_3(self, x):   # used step here
        y = 1/(1 + np.exp(-x))
        return y
    
    def derivative(self,x):
        y = 1 - ((np.exp(2*x) - 1)/(np.exp(2*x) + 1))**2
        return y
    
    def derivative_2(self,x):
        y = x > 0
        return y
    
    def derivative_3(self,x):
        y = (1/(1 + np.exp(-x)))*(1 - 1/(1 + np.exp(-x)))
        return y
    
    def predict(self, X, w_1, w_2):     
        z = self.forward(X, w_1, w_2)[2] #forward on the data
        y = self.activation(z) #apply the activation function
        return y
    
    def predict_2(self, X, w_1, w_2):     
        z = self.forward_2(X, w_1, w_2)[2] #forward on the data
        y = self.activation_3(z) #apply the activation function
        return y

    def gradient(self, X, y, w_1, w_2, w_1_concat, w_2_concat, N, l_c):
        
        h_1_reshaped       = np.reshape(self.forward(X, w_1_concat, w_2_concat)[1],(N+1, 1))
        delta_w_2          = self.error(X, y)*self.derivative(self.forward(X, w_1_concat, w_2_concat)[2])
        delta_w_2_reshaped = np.reshape(delta_w_2,(10, 1))
        update_w_2         = np.matmul(delta_w_2_reshaped, np.transpose(h_1_reshaped))

        delta_w_1          = self.derivative(self.forward(X, w_1_concat, w_2_concat)[3]) * np.matmul(np.transpose(w_2), delta_w_2)
        delta_w_1_reshaped = np.reshape(delta_w_1,(N, 1))
        X                  = np.reshape(X, (785, 1))
        update_w_1         = np.matmul(delta_w_1_reshaped, np.transpose(X))

        w_1_concat += l_c*update_w_1
        w_2_concat += l_c*update_w_2
        w_1 = w_1_concat[:,:-1]
        w_2 = w_2_concat[:,:-1]
        
        return w_1, w_2, w_1_concat, w_2_concat
    
    def gradient_2(self, X, y, w_1, w_2, w_1_concat, w_2_concat, N, l_c):
        
        h_1_reshaped       = np.reshape(self.forward_2(X, w_1_concat, w_2_concat)[1],(N+1,1))
        delta_w_2          = self.error_2(X, y)*self.derivative_3(self.forward_2(X, w_1_concat, w_2_concat)[2])
        delta_w_2_reshaped = np.reshape(delta_w_2,(10,1))
        update_w_2         = np.matmul(delta_w_2_reshaped, np.transpose(h_1_reshaped))

        delta_w_1          = self.derivative_2(self.forward_2(X, w_1_concat, w_2_concat)[3]) * np.dot(delta_w_2, w_2)
        delta_w_1_reshaped = np.reshape(delta_w_1,(N,1))
        X                  = np.reshape(X, (785,1))
        update_w_1         = np.matmul(delta_w_1_reshaped, np.transpose(X))

        w_1_concat += l_c*update_w_1
        w_2_concat += l_c*update_w_2
        w_1 = w_1_concat[:,:-1]
        w_2 = w_2_concat[:,:-1]
            
        return w_1, w_2, w_1_concat, w_2_concat
                       
    def pre_assign(self, y):
        y_label=-np.ones((10,))
        y_label[y]=1
        return y_label
    
    def pre_assign_2(self, y):
        y_label = np.zeros((10,))
        y_label[y] = 1
        return y_label
    
    def error(self, X, y):
        dif = self.pre_assign(y) - self.predict(X, w_1_concat, w_2_concat)
        return dif
    
    def error_2(self, X, y):
        dif = self.pre_assign_2(y) - self.predict_2(X, w_1_concat, w_2_concat)
        return dif
    
    def error_reg(self, X, y, w_1_concat, w_2_concat, coef):
        dif = self.pre_assign_2(y) - self.predict_2(X, w_1_concat, w_2_concat) + coef* np.sum(np.square(w_2_concat))
        return dif

In [5]:
N_list   = [300, 500, 1000]
l_c_list = [0.01, 0.05, 0.09]
for n in N_list:
    N = n
    for case in range(1,3):
        for i in l_c_list:
            l_c = i
            if case == 1:
                w_1 = np.random.normal(scale=0.01, size = (N, 784))
                b_1 = np.zeros((N,1))
                w_2 = np.random.normal(scale=0.01, size = (10, N))
                b_2 = np.zeros((10,1))

                w_1_concat = np.concatenate((w_1, b_1), axis = 1)
                w_2_concat = np.concatenate((w_2, b_2), axis = 1)
                
                start = time.time()
                
                X = x_train[:10000]
                y = y_train[:10000]
                
                for j in range(5):
                    for i in range(len(X)):
                        w_1, w_2, w_1_concat, w_2_concat = layers().gradient(X[i], y[i], w_1, w_2, w_1_concat, w_2_concat, N, l_c)
  
                err = 0
                predictions = np.zeros((len(y),10))
                results = np.zeros(len(y))

                for i in range(len(X)):
                    predictions[i] = layers().predict(X[i], w_1_concat, w_2_concat)
                    results[i]     = np.argmax(predictions[i])
                    err += np.sum(np.square(predictions[i] - layers().pre_assign(y[i]))/2, axis = 0)
                err /= 10000
                accuracy_train = 100*(1 - np.sum(y != results)/len(y))
                end = time.time()
                time_passed = end - start
                print('Case 1 \nN = %.d\nLearning Rate     = %.2f \nTraining Error    = %.4f \nTraining Accuracy = %.4f \nCompletation Time = %.2f seconds\n' % (N, l_c, err, accuracy_train, time_passed))
                
                X = x_test
                y = y_test
                
                err = 0
                predictions = np.zeros((len(y),10))
                results = np.zeros(len(y))
                start = time.time()
                for i in range(len(X)):
                    predictions[i] = layers().predict(X[i], w_1_concat, w_2_concat)
                    results[i]     = np.argmax(predictions[i])
                    err += np.sum(np.square(predictions[i] - layers().pre_assign(y[i]))/2, axis = 0)
                err /= 10000
                accuracy_train = 100*(1 - np.sum(y != results)/len(y))
                end = time.time()
                time_passed = end - start
                print('Case 1 \nN = %.d\nLearning Rate     = %.2f \nTesting Error    = %.4f \nTesting Accuracy = %.4f \nCompletation Time = %.2f seconds\n' % (N, l_c, err, accuracy_train, time_passed))

            else:
                w_1 = np.random.normal(scale=0.01, size = (N, 784))
                b_1 = np.zeros((N,1))
                w_2 = np.random.normal(scale=0.01, size = (10, N))
                b_2 = np.zeros((10,1))

                w_1_concat = np.concatenate((w_1, b_1), axis = 1)
                w_2_concat = np.concatenate((w_2, b_2), axis = 1)
                
                start = time.time()
                
                X = x_train[:10000]
                y = y_train[:10000]
                
                for i in range(5):
                    for i in range(len(X)):
                        w_1, w_2, w_1_concat, w_2_concat = layers().gradient_2(X[i], y[i], w_1, w_2, w_1_concat, w_2_concat, N, l_c)
                err = 0
                predictions = np.zeros((len(y),10))
                results = np.zeros(len(y))

                for i in range(len(X)):
                    predictions[i] = layers().predict_2(X[i], w_1_concat, w_2_concat)
                    results[i]     = np.argmax(predictions[i])
                    err += np.sum(np.square(predictions[i] - layers().pre_assign_2(y[i]))/2, axis = 0)
                err /= 10000
                accuracy_train = 100*(1 - np.sum(y != results)/len(y))
                end = time.time()
                time_passed = end - start
                print('Case 2 \nN = %.d\nLearning Rate     = %.2f \nTraining Error    = %.4f \nTraining Accuracy = %.4f \nCompletation Time = %.2f seconds\n' % (N, l_c, err, accuracy_train, time_passed))
                
                X = x_test
                y = y_test
                
                err = 0
                predictions = np.zeros((len(y),10))
                results = np.zeros(len(y))
                start = time.time()
                for i in range(len(X)):
                    predictions[i] = layers().predict(X[i], w_1_concat, w_2_concat)
                    results[i]     = np.argmax(predictions[i])
                    err += np.sum(np.square(predictions[i] - layers().pre_assign(y[i]))/2, axis = 0)
                err /= 10000
                accuracy_train = 100*(1 - np.sum(y != results)/len(y))
                end = time.time()
                time_passed = end - start
                print('Case 2 \nN = %.d\nLearning Rate     = %.2f \nTesting Error    = %.4f \nTesting Accuracy = %.4f \nCompletation Time = %.2f seconds\n' % (N, l_c, err, accuracy_train, time_passed))

Case 1 
N = 300
Learning Rate     = 0.01 
Training Error    = 0.1882 
Training Accuracy = 94.8300 
Completation Time = 157.69 seconds

Case 1 
N = 300
Learning Rate     = 0.01 
Testing Error    = 0.2647 
Testing Accuracy = 92.7600 
Completation Time = 1.08 seconds

Case 1 
N = 300
Learning Rate     = 0.05 
Training Error    = 0.7236 
Training Accuracy = 79.5700 
Completation Time = 131.58 seconds

Case 1 
N = 300
Learning Rate     = 0.05 
Testing Error    = 0.7639 
Testing Accuracy = 78.3800 
Completation Time = 0.99 seconds

Case 1 
N = 300
Learning Rate     = 0.09 
Training Error    = 1.6337 
Training Accuracy = 29.0500 
Completation Time = 124.18 seconds

Case 1 
N = 300
Learning Rate     = 0.09 
Testing Error    = 1.6550 
Testing Accuracy = 28.3300 
Completation Time = 0.92 seconds

Case 2 
N = 300
Learning Rate     = 0.01 
Training Error    = 0.0632 
Training Accuracy = 93.0000 
Completation Time = 116.40 seconds

Case 2 
N = 300
Learning Rate     = 0.01 
Testing Error    = 1.8886

KeyboardInterrupt: 

In [5]:
class layers_2:
    def forward(self, X, w_1, w_2):
        bias_h  = -np.ones((1, X.shape[1]))
        v_1 = np.matmul(w_1, X) # data*weights+bias
        v_1_concat = np.concatenate((v_1, bias_h), axis = 0)
        h_1 = self.activation(v_1_concat)
        v_2 = np.matmul(w_2, h_1)
        return v_1_concat, h_1, v_2, v_1

    def activation(self, x):   # used step here
        y = (np.exp(2*x) - 1)/(np.exp(2*x) + 1)
        return y
    
    def derivative(self,x):
        y = 1 - ((np.exp(2*x) - 1)/(np.exp(2*x) + 1))**2
        return y
    
    def predict(self, X, w_1, w_2):     
        z = self.forward(X, w_1, w_2)[2] #forward on the data
        y = self.activation(z) #apply the activation function
        return y

    def gradient(self, X, y, w_1, w_2, w_1_concat, w_2_concat, N, l_c, batch_size):
        
        h_1                = self.forward(X, w_1_concat, w_2_concat)[1]
        delta_w_2          = self.error(X, y)*self.derivative(self.forward(X, w_1_concat, w_2_concat)[2])
        update_w_2         = np.matmul(delta_w_2, np.transpose(h_1))/batch_size

        delta_w_1          = self.derivative(self.forward(X, w_1_concat, w_2_concat)[3]) * np.matmul(np.transpose(w_2), delta_w_2)
        update_w_1         = np.matmul(delta_w_1, np.transpose(X))/batch_size

        w_1_concat += l_c*update_w_1
        w_2_concat += l_c*update_w_2
        w_1 = w_1_concat[:,:-1]
        w_2 = w_2_concat[:,:-1]
        
        return w_1, w_2, w_1_concat, w_2_concat
                       
    def pre_assign(self, X, y):
        y_label=-np.ones((10, X.shape[1]))
        for i in range(len(y)):
            y_label[:,i][y[i]] = 1
        return y_label
    
    def pre_assign_2(self, y):
        y_label=-np.ones((10, ))
        y_label[y] = 1
        return y_label
    
    def error(self, X, y):
        dif = self.pre_assign(X, y) - self.predict(X, w_1_concat, w_2_concat)
        return dif

In [14]:
batch_size_list = [10, 50, 100]
l_c = 0.01
case = 1
for batch_size in batch_size_list:
    w_1 = np.random.normal(scale=0.01, size = (N, 784))
    b_1 = np.zeros((N,1))
    w_2 = np.random.normal(scale=0.01, size = (10, N))
    b_2 = np.zeros((10,1))

    w_1_concat = np.concatenate((w_1, b_1), axis = 1)
    w_2_concat = np.concatenate((w_2, b_2), axis = 1)
    start= time.time()
    for i in range(100):
        for i in range(len(X)//batch_size):
            a = np.transpose(X[batch_size*i:batch_size*(i+1)])
            b = np.transpose(y[batch_size*i:batch_size*(i+1)])
            w_1, w_2, w_1_concat, w_2_concat = layers_2().gradient(a, b , w_1, w_2, w_1_concat, w_2_concat, N, l_c, batch_size)
    err = 0
    predictions = np.zeros((len(y),10))
    results = np.zeros(len(y))

    for i in range(len(X)):
        a = np.reshape(X[i], (785, 1))
        predictions[i] = np.reshape(layers_2().predict(a, w_1_concat, w_2_concat), (10,))
        results[i]     = np.argmax(predictions[i])
        err += np.sum(np.square(predictions[i] - layers_2().pre_assign_2(y[i]))/2, axis = 0)

    err /= 10000
    accuracy_train = 100*(1 - np.sum(y != results)/len(y))
    end = time.time()
    time_passed = end - start
    print('Case 1 \nMini Batch Size = %.d\nN = %.d\nLearning Rate     = %.2f \nTraining Error    = %.4f \nTraining Accuracy = %.4f \nCompletation Time = %.2f seconds\n' % (batch_size, N, l_c, err, accuracy_train, time_passed))

Case 1 
Mini Batch Size = 10
N = 300
Learning Rate     = 0.01 
Training Error    = 0.0476 
Training Accuracy = 98.1889 
Completation Time = 313.00 seconds



KeyboardInterrupt: 

In [8]:
class layers_3:
    def forward(self, X, w_1, w_2):
        bias_h  = -np.ones((1, X.shape[1]))
        v_1 = np.matmul(w_1, X) # data*weights+bias
        v_1_concat = np.concatenate((v_1, bias_h), axis = 0)
        h_1 = self.activation(v_1_concat)
        v_2 = np.matmul(w_2, h_1)
        return v_1_concat, h_1, v_2, v_1

    def activation(self, x):   # used step here
        y = (np.exp(2*x) - 1)/(np.exp(2*x) + 1)
        return y
    
    def derivative(self,x):
        y = 1 - ((np.exp(2*x) - 1)/(np.exp(2*x) + 1))**2
        return y
    
    def predict(self, X, w_1, w_2):     
        z = self.forward(X, w_1, w_2)[2] #forward on the data
        y = self.activation(z) #apply the activation function
        return y

    def gradient(self, X, y, w_1, w_2, w_1_concat, w_2_concat, N, l_c, coef):
        
        h_1                = self.forward(X, w_1_concat, w_2_concat)[1]
        delta_w_2          = self.error(X, y)*self.derivative(self.forward(X, w_1_concat, w_2_concat)[2])
        update_w_2         = np.matmul(delta_w_2, np.transpose(h_1))

        delta_w_1          = self.derivative(self.forward(X, w_1_concat, w_2_concat)[3]) * np.matmul(np.transpose(w_2), delta_w_2)
        update_w_1         = np.matmul(delta_w_1, np.transpose(X))

        w_1_concat += l_c*update_w_1
        w_2_concat += l_c*update_w_2
        w_1 = w_1_concat[:,:-1]
        w_2 = w_2_concat[:,:-1]
        
        return w_1, w_2, w_1_concat, w_2_concat
                       
    def pre_assign(self, X, y):
        y_label=-np.ones((10, X.shape[1]))
        for i in range(len(y)):
            y_label[:,i][y[i]] = 1
        return y_label
    
    def pre_assign_2(self, y):
        y_label=-np.ones((10, ))
        y_label[y] = 1
        return y_label
    
    def error(self, X, y):
        dif = self.pre_assign(X, y) - self.predict(X, w_1_concat, w_2_concat) + coef* np.sum(np.square(w_2_concat))
        return dif

In [9]:
batch_size = 10
l_c = 0.01
case = 1
lambda_list = [0.001, 0.01]
start= time.time()
for coef in lambda_list:
    w_1 = np.random.normal(scale=0.01, size = (N, 784))
    b_1 = np.zeros((N,1))
    w_2 = np.random.normal(scale=0.01, size = (10, N))
    b_2 = np.zeros((10,1))

    w_1_concat = np.concatenate((w_1, b_1), axis = 1)
    w_2_concat = np.concatenate((w_2, b_2), axis = 1)
    for i in range(1):
        for i in range(len(X)//10):
            a = np.transpose(X[10*i:10*(i+1)])
            b = np.transpose(y[10*i:10*(i+1)])
            w_1, w_2, w_1_concat, w_2_concat = layers_3().gradient(a, b , w_1, w_2, w_1_concat, w_2_concat, N, l_c, coef )
    err = 0
    predictions = np.zeros((len(y),10))
    results = np.zeros(len(y))

    for i in range(len(X)):
        a = np.reshape(X[i], (785, 1))
        predictions[i] = np.reshape(layers_3().predict(a, w_1_concat, w_2_concat), (10,))
        results[i]     = np.argmax(predictions[i])
        err += np.sum(np.square(predictions[i] - layers_3().pre_assign_2(y[i]))/2, axis = 0)

    err /= 10000
    accuracy_train = 100*(1 - np.sum(y != results)/len(y))
    end = time.time()
    time_passed = end - start
    print('Case 1 \nMini Batch Size = %.d\nLambda = %.3f\nN = %.d\nLearning Rate     = %.2f \nTraining Error    = %.4f \nTraining Accuracy = %.4f \nCompletation Time = %.2f seconds\n' % (batch_size, coef, N, l_c, err, accuracy_train, time_passed))

Case 1 
Mini Batch Size = 10
Lambda = 0.001
N = 300
Learning Rate     = 0.01 
Training Error    = 0.3703 
Training Accuracy = 88.7889 
Completation Time = 3.40 seconds

Case 1 
Mini Batch Size = 10
Lambda = 0.010
N = 300
Learning Rate     = 0.01 
Training Error    = 0.3981 
Training Accuracy = 89.1222 
Completation Time = 6.81 seconds

