In this exercise, you will implement the backpropagation algorithm for neural networks and apply it to the task of hand-written digit recognition. 
## Neural Networks
- **Visualizing the data**
- **Model representation**
- **Feedforward and cost function**
- **Regularized cost function**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy.optimize as op

In [2]:
# Neural network model
# Load the training examples and the layer weights from the exercise's .mat files.
data = sio.loadmat("ex4data1.mat")
weights = sio.loadmat("ex4weights.mat")
x = data['X']
y = data['y'].ravel()             # flatten labels to 1-D without hard-coding the sample count
Theta1 = weights['Theta1']        # weights or parameters of layer1
Theta2 = weights['Theta2']        # weights or parameters of layer2
print(x.shape, y.shape)

# Prepend a column of ones so the input layer carries a bias unit
# (one extra column on top of the feature columns of x).
X = np.hstack((np.ones((x.shape[0], 1)), x))
print(X.shape, Theta1.shape, Theta2.shape)

(5000, 400) (5000,)
(5000, 401) (25, 401) (10, 26)


In [3]:
# Sigmoid function
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [4]:
# Feedforward and cost function
m = X.shape[0]
K = 10                # total number of possible labels

# Recode the labels as one-hot row vectors containing only 0s and 1s.
# The labels are 1-based (MATLAB convention), so label k goes to column k - 1
# and label 10 lands in the last column. Encoding the labels this way keeps
# row i of Y, of the hypothesis h and of X all referring to the same example,
# instead of shifting the prediction rows to compensate for the index offset.
Y = np.zeros((m, K))
Y[np.arange(m), y.astype(int) - 1] = 1

z2 = np.dot(X, Theta1.T)
a2 = sigmoid(z2)
a2 = np.hstack((np.ones((a2.shape[0], 1)), a2))   # hidden layer activations plus a bias unit
z3 = np.dot(a2, Theta2.T)
a3 = sigmoid(z3)            # layer3 (output layer) has K units
h = a3                      # hypothesis: row i holds the K output activations for example i

# Unregularized cross-entropy cost averaged over the m examples.
costJ = (1/m) * np.sum(-Y * np.log(h) - (1-Y) * np.log(1-h))
print(costJ)

0.28762916516131887


In [5]:
# Method 2: accumulate the same cost with explicit loops over examples and labels
J = 0
for i in range(m):
    tmp = 0
    for k in range(K):
        tmp = tmp + (-Y[i,k]*np.log(h[i,k]) - (1-Y[i,k])*np.log(1-h[i,k]))
    J = J+tmp
J = J/m      # average over the actual number of examples rather than a hard-coded count
print(J)

0.2876291651613191


In [6]:
# Regularized cost function
lam = 1
# Sum of squared weights; note that the first column of each theta is excluded
# from regularization.
regJ = np.sum(np.square(Theta1[:, 1:])) + np.sum(np.square(Theta2[:, 1:]))
regJ = costJ + (lam/(2*m))*regJ
print(regJ)

0.3837698590909236


## Back propagation
- **Sigmoid gradient**
- **Random initialization**
- **Back propagation gradient**
- **Gradient checking**
- **Regularized Neural Networks**

In [8]:
# Sigmoid gradient function
def sigmoidGradient(z):
    """Return the derivative of the sigmoid at z, i.e. g(z) * (1 - g(z)), element-wise."""
    g = sigmoid(z)   # evaluate the sigmoid once and reuse it
    return g * (1 - g)

In [9]:
# Random initialization
def randInitializeWeights(L_in, L_out):
    """Return a random (L_out, L_in + 1) weight matrix.

    Entries are drawn uniformly from [-0.12, 0.12); the extra column holds
    the weights of the bias unit.
    """
    epsilon_init = 0.12
    unit_samples = np.random.rand(L_out, L_in + 1)
    return unit_samples * 2 * epsilon_init - epsilon_init

In [10]:
# Backpropagation gradient
# Output-layer error: activation minus one-hot target. The target is applied
# directly from y (1-based label k -> column k - 1) onto a copy of a3, so that
# row i of delta3 belongs to the same example as row i of a2, z2 and X.
delta3 = a3.copy()
delta3[np.arange(m), y.astype(int) - 1] -= 1

# Hidden-layer error: propagate through Theta2 without its bias column,
# since the bias unit receives no error signal.
delta2 = np.dot(delta3, Theta2[:, 1:]) * sigmoidGradient(z2)

# Average the accumulated gradients over the m examples, then add the
# regularization term to every column except the bias column.
theta2_grad = np.dot(delta3.T, a2) / m
theta2_grad[:, 1:] += (lam/m)*Theta2[:, 1:]   # Regularized Neural Networks
theta1_grad = np.dot(delta2.T, X) / m
theta1_grad[:, 1:] += (lam/m)*Theta1[:, 1:]   # Regularized Neural Networks
print(delta3.shape, delta2.shape, theta2_grad.shape, theta1_grad.shape)

(5000, 10) (5000, 25) (10, 26) (25, 401)


In [11]:
# Unrolling parameters
def unrollingParams(Theta1, Theta2):
    """Flatten both weight matrices and join them into one 1-D parameter vector.

    Theta1's entries (row-major) come first, followed by Theta2's.
    """
    return np.concatenate((Theta1.ravel(), Theta2.ravel()))

In [12]:
# Gradient checking
#def computeNumericalGradient():
