In [1]:
import numpy as np
import time
from sklearn.preprocessing import OneHotEncoder
from scipy.io import loadmat
from scipy.optimize import minimize

In [2]:
def load_data(filename):
    try:
        return loadmat(filename)
    except TypeError:
        print("Not a valid filename argument: " + filename)

In [3]:
def sigmoid(x, derive=False):
    if derive:
        return x * (1 - x)

    return 1/(1 + np.exp((-x)))

In [4]:
def sigmoid_gradient(z):
    sig_z = sigmoid(z)
    return np.multiply(sig_z, (1 - sig_z))

In [5]:
def forward_prop(X, theta_list):

    m = X.shape[0]
    a_list = []
    z_list = []
    
    #for thera in theta_list:
    #     print(thera.shape)
    
    for idx, thera in enumerate(theta_list):
        if idx == 0:
            #print("***** forward if ********")
            a_list.append(np.insert(X, 0, values=np.ones(m), axis=1))
            #print(a_list[0].shape)
            #print(theta_list[0].T.shape)
            #print((a_list[0] * (theta_list[0].T)).shape)
            z_list.append(a_list[0] * (theta_list[0].T))
            #print("***** forward if end********")
        else:
            a_list.append(np.insert(sigmoid(z_list[idx-1]), 0, values=np.ones(m), axis=1))
            #print("***** forward else ********")
            #print(z_list[idx-1].shape)
            #print(a_list[idx].shape)
            #print(theta_list[idx].T.shape)
            #print("***** forward else end********")
            temp = a_list[idx] * theta_list[idx].T
            z_list.append(a_list[idx] * theta_list[idx].T)

    h = sigmoid(z_list[len(z_list)-1])

    return a_list, z_list, h

In [6]:
def back_prop(params, input_size, hidden_layers, num_labels, X, y, learning_rate, regularize = False):

    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    
    theta_list = []
    startCount = 0
    for idx, val in enumerate(hidden_layers):
        if idx == 0:
            startCount = val * (input_size + 1)
            theta_list.append(np.matrix(np.reshape(params[:startCount], (val, (input_size + 1)))))
        if idx != 0:
            tempCount = startCount
            startCount += (val * (hidden_layers[idx-1] + 1))
            theta_list.append(np.matrix(np.reshape(params[tempCount:startCount], (val, (hidden_layers[idx-1] + 1)))))
        if idx == (len(hidden_layers)-1):
            theta_list.append(np.matrix(np.reshape(params[startCount:], (num_labels, (val + 1)))))


    a_list, z_list, h = forward_prop(X, theta_list)

    delta_list = []
    for theta in theta_list:
        delta_list.append(np.zeros(theta.shape))
        
    J = cost(X, y, h, theta_list, learning_rate, regularize)
    
    #print("***** a,z values ********")
    #for aa in a_list:
        #print(aa.shape)
        
    #for zz in z_list:
        #print(zz.shape)
    #print("***** a,z values end ********")
    
    d_list = []
    d_list.append(h - y)
    
    for idx, theta in reversed(list(enumerate(theta_list))):
        if idx != 0:
            #print(idx)
            z_list[len(z_list)-1-idx] = np.insert(z_list[len(z_list)-1-idx], 0, values=np.ones(1), axis=1)
            #print(((theta.T) * (d_list[len(d_list)-1].T)).T.shape)
            #print(sigmoid_gradient(z_list[len(z_list)-1-idx]).shape)
            #print(z_list[len(z_list)-1-idx].shape)
            d_list.append(np.multiply(( (theta.T) * (d_list[len(d_list)-1].T) ).T, sigmoid_gradient(z_list[len(z_list)-1-idx])))

    #print(len(d_list))
    d_list.reverse()

    #print(len(d_list))
    for idx, delta in enumerate(delta_list):
        #print(idx)
        if idx == (len(delta_list) - 1):
            #print((d_list[idx].T).shape)
            #print(a_list[idx].shape)
            delta_list[idx] += (d_list[idx].T) * a_list[idx]
        else:
            #print((d_list[idx][:, 1:].T).shape)
            #print(a_list[idx].shape)
            delta_list[idx] += (d_list[idx][:, 1:].T) * a_list[idx]
            
        delta[idx] = delta[idx] / m


   
    if regularize:
        for idx, delta in enumerate(delta_list):
            delta_list[idx][:, 1:] = delta_list[idx][:, 1:] + (theta_list[idx][:, 1:] * learning_rate) / m

    
    grad = np.concatenate((np.ravel(delta_list[0]), np.ravel(delta_list[1])))

    return J, grad

In [7]:
def cost(X, y, h, theta_list, learning_rate, regularize=False):

    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)

    J = (np.multiply(-y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))).sum() / m

        
    if regularize:
        regularization_value = 0.0
        for theta in theta_list:
            regularization_value += np.sum(np.power(theta[:, 1:], 2))
        J += (float(learning_rate) / (2 * m)) * regularization_value
        

    return J

In [8]:
def run_net2():

    input_size = 400
    hidden_layers = [25]
    num_labels = 10
    learning_rate = 1

    data = load_data('data/ex3data1.mat')
    X = data['X']  
    y = data['y']  

    print(X.shape, y.shape)

    encoder = OneHotEncoder(sparse=False)
    y_encoded = encoder.fit_transform(y)
    
    print(y_encoded.shape)
    
    total_param_count = 0
    for idx, val in enumerate(hidden_layers):
        if idx == 0:
            total_param_count += val * (input_size + 1)
        if idx != 0:
            total_param_count += val * (hidden_layers[idx-1] + 1)
        if idx == (len(hidden_layers)-1):
            total_param_count += num_labels * (val + 1)
    

    params = (np.random.random(size=total_param_count) - 0.5) * 0.25
    
    print("Running the backpropagation algorithm...")
    start_time = time.time()

    fmin = minimize(fun=back_prop, x0=params, args=(input_size, hidden_layers, num_labels, X, y_encoded, learning_rate),
                    method='TNC', jac=True, options={'maxiter': 250})
   

    print("Result: ", fmin)


In [9]:
np.random.seed(3)
run_net2()

(5000, 400) (5000, 1)
(5000, 10)
Running the backpropagation algorithm...


  J = (np.multiply(-y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))).sum() / m
  J = (np.multiply(-y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))).sum() / m


Result:       fun: 2.567584339495157
     jac: array([8.85592837e-03, 0.00000000e+00, 0.00000000e+00, ...,
       3.07608117e+02, 3.07629241e+02, 3.07517753e+02])
 message: 'Linear search failed'
    nfev: 144
     nit: 6
  status: 4
 success: False
       x: array([ 0.01280054,  0.05203696, -0.05227382, ..., -0.14104095,
       -0.07289643, -0.04257702])
