In [48]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io #Used to load the OCTAVE *.mat files
import scipy.misc #Used to show matrix as an image
import matplotlib.cm as cm #Used to display images in a specific colormap
import random #To pick random images to display
import math
import itertools
from scipy.special import expit #Vectorized sigmoid function
import array

In [49]:
dataFile = "data/ex4data1.mat"
data = scipy.io.loadmat(dataFile)

X = data['X']
y = data['y']

X = np.insert(X, 0, 1, axis = 1)


In [50]:
# taken from ex3
def reshapeRow(row):
    """
    @param {row} 1 x 401 matrix since an image of a digit is 20x20 + 1 that was added as a bias
    Function that takes in the pixel intensity values and puts it into a 20x20 square 
    """ 
    # the [1:] is used to take everything after the 1st index 
    
    return row[1:].reshape(20,20).T

def displayData(indiciesToDisplay = None):
    """
    Function that selects 100 random examples for the 5000 we have and organizes
    them into a 10x10 matrix
    """
    width = 20
    height = 20
    numRows = 10
    numCols = 10
    
    if not indiciesToDisplay:
        indiciesToDisplay = random.sample(range(X.shape[0]), numRows * numCols)

    
    bigPicture = np.zeros((height * numRows, width * numCols))
    
    iRow = 0
    iCol = 0

    for i in indiciesToDisplay :
        if iCol == numCols:
            iCol = 0
            iRow += 1
 
        curImg = reshapeRow(X[i])
        bigPicture[iRow * height :iRow * height + curImg.shape[0], 
                    iCol * width : iCol * width + curImg.shape[1]] = curImg
        iCol += 1 
    fig = plt.figure( figsize = (6,6) )
    img = scipy.misc.toimage( bigPicture )
    plt.imshow(img,cmap = cm.Greys_r)

#displayData()

In [51]:
thetaFile = "data/ex4weights.mat"
thetas1 = scipy.io.loadmat(thetaFile)

# Theta1.shape = 25 x 401
Theta1 = thetas1['Theta1']

# Theta2.shape = 10 x 26
Theta2 = thetas1['Theta2']

thetaUnrolled = np.r_[Theta1.ravel(), Theta2.ravel()]

inputSize = 400
hiddenSize = 25
outputSize = 10


# m = 5000
m = X.shape[0]
# n = 401
n = X.shape[1]

aVals = [None] * 3
zVals = [None] * 2

In [94]:
# the following 2 functions are needed because fmin_cg passes in X as an unrolled vector
def reshape(X):
    return np.array(X).reshape((m,n))

def flatten(X):
    return np.array(X.flatten()).reshape((X.shape[0]*(X.shape[1]),1))


# used to flatted D1, D2, taken from kaleko github
def flattenParams(thetas_list):
    input_layer_size = 400
    hidden_layer_size = 25
    output_layer_size = 10
    """
    Hand this function a list of theta matrices, and it will flatten it
    into one long (n,1) shaped numpy array
    """
    flattened_list = [ mytheta.flatten() for mytheta in thetas_list ]
    combined = list(itertools.chain.from_iterable(flattened_list))
    assert len(combined) == (input_layer_size+1)*hidden_layer_size + \
                            (hidden_layer_size+1)*output_layer_size
    return np.array(combined).reshape((len(combined),1))

def sigmoid(X, theta):
    return expit(np.dot(X,theta))

def costFunction(thetas, X, y, lmbda):
    
    X = reshape(X)
    theta1 = thetas[0:(hiddenSize*(inputSize+1))].reshape(hiddenSize,(inputSize+1))
    theta2 = thetas[(hiddenSize*(inputSize+1)):].reshape(10,(hiddenSize+1))
    
    m = X.shape[0]

    a1 = X # 5000 x 401
    z2 = theta1.dot(X.T)
    
    a2 = expit(z2) # 25 x 5000
    a2 = np.insert(a2,0,1,axis=0)
       
    z3 = theta2.dot(a2)
    a3 = expit(z3)
    
    aVals[0] = a1
    aVals[1] = a2
    aVals[2] = a3
    
    zVals[0] = z2
    zVals[1] = z3
     
    tempY = pd.get_dummies(y.ravel()).values
    
    J = (-1/m)*np.sum(np.log(a3.T)*(tempY)+np.log(1-a3).T*(1-tempY)) + \
    (lmbda/(2*m))*(np.sum(np.square(theta1[:,1:])) + np.sum(np.square(theta2[:,1:])))  
    
    return J

In [95]:
def testCost():
    J = costFunction(thetaUnrolled,X,y,1)
    print("Expected value is ~ 0.383\nActual value %.9f"%J)
testCost()

Expected value is ~ 0.383
Actual value 0.383769859


In [96]:
def gradientSigmoid(z):
    return expit(z)*(1-expit(z))

def initRandomThetas():
    epsilon = 0.12
    theta1Size = (hiddenSize, inputSize + 1)
    theta2Size = (outputSize, hiddenSize + 1)
    
    
    theta1 = np.random.rand(*theta1Size).ravel()*2*epsilon - epsilon
    theta2 = np.random.rand(*theta2Size).ravel()*2*epsilon - epsilon
    
    
    thetas = np.r_[Theta1.ravel(), Theta2.ravel()]
    return thetas

temp = initRandomThetas()


In [97]:
def backProp(thetas,X,y, lmbda):
    X = reshape(X)
    m = X.shape[0]
    
    theta1 = thetas[0:(hiddenSize*(inputSize+1))].reshape(hiddenSize,(inputSize+1))
    theta2 = thetas[(hiddenSize*(inputSize+1)):].reshape(10,(hiddenSize+1))
    
    a1 = aVals[0]
    a2 = aVals[1].T
    a3 = aVals[2].T
    
    z2 = zVals[0].T
    z3 = zVals[1].T
    
        
    D1 = np.zeros((theta1.shape))
    D2 = np.zeros((theta2.shape))
    
    for i in range(m):
        a1Cur = X[i]
        a2Cur = a2[:,1:][i]
        z2Cur = z2[i]
        a3Cur = a3[i]
        z3Cur = z3[i]
    
        a1Cur = a1Cur.reshape(len(a1Cur),1)
        a2Cur = a2Cur.reshape(len(a2Cur),1)
        z2Cur = z2Cur.reshape(len(z2Cur),1)
        a3Cur = a3Cur.reshape(len(a3Cur),1)
        z3Cur = z3Cur.reshape(len(z3Cur),1)
        
        yT = np.zeros((10,1))
        yT[y[i] - 1] = 1
                
        delta3 = a3Cur - yT
        delta2 = theta2[:,1:].T.dot(delta3)*gradientSigmoid(z2Cur)
        
        
        a2Cur = np.insert(a2Cur, 0, 1, axis=0)

        D1 = D1 + delta2.dot(a1Cur.T)
                
        delta3 = delta3.reshape(len(delta3),1)
        D2 = D2 + delta3.dot(a2Cur.T)
        
    D1 = D1/m
    D2 = D2/m
    
    D1[:,1:] = D1[:,1:] + (float(lmbda)/m)*theta1[:,1:]
    D2[:,1:] = D2[:,1:] + (float(lmbda)/m)*theta2[:,1:]
    
    return flattenParams([D1, D2]).flatten()
       
def checkGradient(thetas,D,X,y,lmbda):
    epsilon = 0.0001
    Ds = np.r_[D[0].ravel(),D[1].ravel()]
    xT = flatten(X)
    n_elems = len(thetas) 
    thetas = thetas.reshape(len(thetas),1)
    #Pick ten random elements, compute numerical gradient, compare to respective D's
    for i in range(10):
        x = int(np.random.rand()*n_elems)
        epsvec = np.zeros((n_elems,1))
        epsvec[x] = epsilon
        cost_high = costFunction(thetas + epsvec, X, y, lmbda)
        cost_low  = costFunction(thetas - epsvec, X, y, lmbda)
        mygrad = (cost_high - cost_low) / float(2*epsilon)
        print ("Element: %d. Numerical Gradient = %f. BackProp Gradient = %f."%(x,mygrad,Ds[x]))
        
Ds = backProp(np.r_[Theta1.ravel(),Theta2.ravel()],X,y,0)
checkGradient(thetaUnrolled,[D1,D2],X,y,0)
        


Element: 920. Numerical Gradient = 0.000264. BackProp Gradient = 0.000264.
Element: 4770. Numerical Gradient = -0.000000. BackProp Gradient = -0.000000.
Element: 4938. Numerical Gradient = 0.000057. BackProp Gradient = 0.000057.
Element: 3793. Numerical Gradient = 0.000121. BackProp Gradient = 0.000121.
Element: 75. Numerical Gradient = 0.000121. BackProp Gradient = 0.000121.
Element: 7287. Numerical Gradient = 0.000037. BackProp Gradient = 0.000037.
Element: 7465. Numerical Gradient = 0.000189. BackProp Gradient = 0.000189.
Element: 3939. Numerical Gradient = 0.000107. BackProp Gradient = 0.000107.
Element: 684. Numerical Gradient = -0.000009. BackProp Gradient = -0.000009.
Element: 6539. Numerical Gradient = 0.000049. BackProp Gradient = 0.000049.


In [98]:
import scipy.optimize
import time
def trainNN(lmbda):
    generatedThetas = initRandomThetas()
    generatedThetas = generatedThetas.reshape(len(generatedThetas),1)
    
    result = scipy.optimize.fmin_cg(costFunction, fprime=backProp, x0=generatedThetas, \
                               args=(X,y,lmbda),maxiter=50,disp=True,full_output=True)
    return result[0]

start = time.time()
trainThetas = trainNN(0)
end = time.time()
print("Total time is %.2f seconds" %(end - start))

         Current function value: 0.020620
         Iterations: 50
         Function evaluations: 149
         Gradient evaluations: 149
Total time is 57.17 seconds


In [108]:
def predictNN(row, index):
    classes = np.arange(1,11) + [10]
    a3 = aVals[2].T
    a3Row = a3[index]
    rVal = classes[np.argmax(a3Row)]
    
    #print(rVal)

    return rVal - 10
     
def computeAccuracy(thetas, X, y):
    m = X.shape[0]
    numCorrect = 0
    costFunction(thetas, X, y, 0)
    for i in range(m):
        if int(predictNN(X[i],i) == int(y[i])):
            numCorrect += 1
    print("Training set accuracy is %.2f" %(100*(numCorrect/m)))
computeAccuracy(trainThetas,X,y)

Training set accuracy is 99.96
