In [25]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io #Used to load the OCTAVE *.mat files
import scipy.misc #Used to show matrix as an image
import matplotlib.cm as cm #Used to display images in a specific colormap
import random #To pick random images to display
import math
import itertools
from scipy.special import expit #Vectorized sigmoid function
import array


In [26]:
dataFile = "data/ex4data1.mat"
data = scipy.io.loadmat(dataFile)


# X has 5000 examples and 400 features making it 5000x400
X = data['X']
y = data['y']

# training set / test set code
# shuffling data set because later on pd.get_dummies needs to identify 10 unique elements but can't if the data only contains 
# classifications for the first 6 digits
combined = np.append(X, y, axis=1)
np.random.shuffle(combined)

yComb = combined[:,400]
yComb = yComb.reshape(len(yComb),1)

Xcomb = combined[:,0:400]
Xcomb = Xcomb.reshape(len(Xcomb),400)

# will be using 70% of data to train hence 3500 x 400
Xtrain = Xcomb[0:3500,:]
Xtrain = np.insert(Xtrain, 0, 1, axis=1)
yTrain = yComb[0:3500]

# then the test set will be 1500x400
Xtest = Xcomb[3500:5001,:]
yTest = yComb[3500:5000]

#end of train/test set code

X = np.insert(X, 0, 1, axis = 1)



In [27]:
# taken from ex3
def reshapeRow(row):
    """
    @param {row} 1 x 401 matrix since an image of a digit is 20x20 + 1 that was added as a bias
    Function that takes in the pixel intensity values and puts it into a 20x20 square 
    """ 
    # the [1:] is used to take everything after the 1st index 
    
    return row[1:].reshape(20,20).T

def displayData(indiciesToDisplay = None):
    """
    Function that selects 100 random examples for the 5000 we have and organizes
    them into a 10x10 matrix
    """
    width = 20
    height = 20
    numRows = 10
    numCols = 10
    
    if not indiciesToDisplay:
        indiciesToDisplay = random.sample(range(X.shape[0]), numRows * numCols)

    
    bigPicture = np.zeros((height * numRows, width * numCols))
    
    iRow = 0
    iCol = 0

    for i in indiciesToDisplay :
        if iCol == numCols:
            iCol = 0
            iRow += 1
 
        curImg = reshapeRow(X[i])
        bigPicture[iRow * height :iRow * height + curImg.shape[0], 
                    iCol * width : iCol * width + curImg.shape[1]] = curImg
        iCol += 1 
    fig = plt.figure( figsize = (6,6) )
    img = scipy.misc.toimage( bigPicture )
    plt.imshow(img,cmap = cm.Greys_r)

#displayData()

In [28]:
thetaFile = "data/ex4weights.mat"
thetas1 = scipy.io.loadmat(thetaFile)

# Theta1.shape = 25 x 401
Theta1 = thetas1['Theta1']

# Theta2.shape = 10 x 26
Theta2 = thetas1['Theta2']

thetaUnrolled = np.r_[Theta1.ravel(), Theta2.ravel()]

inputSize = 400
hiddenSize = 25
outputSize = 10


# m = 5000
m = X.shape[0]
# n = 401
n = X.shape[1]

aVals = [None] * 3
zVals = [None] * 2

In [29]:
# the following 2 functions are needed because fmin_cg passes in X as an unrolled vector
def reshape(X, m, n):
    return np.array(X).reshape((m,n))

def flatten(X):
    return np.array(X.flatten()).reshape((X.shape[0]*(X.shape[1]),1))


# used to flatted D1, D2, taken from kaleko github
def flattenParams(thetas_list):
    input_layer_size = 400
    hidden_layer_size = 25
    output_layer_size = 10
    """
    Hand this function a list of theta matrices, and it will flatten it
    into one long (n,1) shaped numpy array
    """
    flattened_list = [ mytheta.flatten() for mytheta in thetas_list ]
    combined = list(itertools.chain.from_iterable(flattened_list))
    assert len(combined) == (input_layer_size+1)*hidden_layer_size + \
                            (hidden_layer_size+1)*output_layer_size
    return np.array(combined).reshape((len(combined),1))

def sigmoid(X, theta):
    return expit(np.dot(X,theta))

def costFunction(thetas, X, y, lmbda):
    
    X = reshape(X,5000,401)
    theta1 = thetas[0:(hiddenSize*(inputSize+1))].reshape(hiddenSize,(inputSize+1))
    theta2 = thetas[(hiddenSize*(inputSize+1)):].reshape(10,(hiddenSize+1))
    
    m = X.shape[0]

    a1 = X # 5000 x 401
    z2 = theta1.dot(X.T)
    
    a2 = expit(z2) # 25 x 5000
    a2 = np.insert(a2,0,1,axis=0)
       
    z3 = theta2.dot(a2)
    a3 = expit(z3)
    
    aVals[0] = a1
    aVals[1] = a2
    aVals[2] = a3
    
    zVals[0] = z2
    zVals[1] = z3
     
    tempY = pd.get_dummies(y.ravel()).values
    
    J = (-1/m)*np.sum(np.log(a3.T)*(tempY)+np.log(1-a3).T*(1-tempY)) + \
    (lmbda/(2*m))*(np.sum(np.square(theta1[:,1:])) + np.sum(np.square(theta2[:,1:])))  
    
    return J

In [30]:
def testCost():
    J = costFunction(thetaUnrolled,X,y,1)
    #print("Expected value for training set is ~ 1.83\nActual value %.9f"%J)

    #print("Expected value is ~ 0.383\nActual value %.9f"%J)
testCost()



In [31]:
def gradientSigmoid(z):
    return expit(z)*(1-expit(z))

def initRandomThetas():
    epsilon = 0.12
    theta1Size = (hiddenSize, inputSize + 1)
    theta2Size = (outputSize, hiddenSize + 1)
    
    
    theta1 = np.random.rand(*theta1Size).ravel()*2*epsilon - epsilon
    theta2 = np.random.rand(*theta2Size).ravel()*2*epsilon - epsilon
    
    
    thetas = np.r_[Theta1.ravel(), Theta2.ravel()]
    return thetas

temp = initRandomThetas()


In [34]:

def backProp(thetas,X,y, lmbda):
    X = reshape(X,5000,401)
    m = X.shape[0]
    
    theta1 = thetas[0:(hiddenSize*(inputSize+1))].reshape(hiddenSize,(inputSize+1))
    theta2 = thetas[(hiddenSize*(inputSize+1)):].reshape(10,(hiddenSize+1))
    
    a1 = aVals[0]
    a2 = aVals[1].T
    a3 = aVals[2].T
    
    z2 = zVals[0].T
    z3 = zVals[1].T
    
        
    D1 = np.zeros((theta1.shape))
    D2 = np.zeros((theta2.shape))
    
    for i in range(m):
        a1Cur = X[i]
        a2Cur = a2[:,1:][i]
        z2Cur = z2[i]
        a3Cur = a3[i]
        z3Cur = z3[i]
    
        a1Cur = a1Cur.reshape(len(a1Cur),1)
        a2Cur = a2Cur.reshape(len(a2Cur),1)
        z2Cur = z2Cur.reshape(len(z2Cur),1)
        a3Cur = a3Cur.reshape(len(a3Cur),1)
        z3Cur = z3Cur.reshape(len(z3Cur),1)
        
        yT = np.zeros((10,1))
        yT[int(y[i]) - 1] = 1
                
        delta3 = a3Cur - yT
        delta2 = theta2[:,1:].T.dot(delta3)*gradientSigmoid(z2Cur)
        
        
        a2Cur = np.insert(a2Cur, 0, 1, axis=0)

        D1 = D1 + delta2.dot(a1Cur.T)
                
        delta3 = delta3.reshape(len(delta3),1)
        D2 = D2 + delta3.dot(a2Cur.T)
        
    D1 = D1/m
    D2 = D2/m
    
    D1[:,1:] = D1[:,1:] + (float(lmbda)/m)*theta1[:,1:]
    D2[:,1:] = D2[:,1:] + (float(lmbda)/m)*theta2[:,1:]
    
    return flattenParams([D1, D2]).flatten()
       
def checkGradient(thetas,D,X,y,lmbda):
    epsilon = 0.0001
    xT = flatten(X)
    n_elems = len(thetas) 
    thetas = thetas.reshape(len(thetas),1)
    #Pick ten random elements, compute numerical gradient, compare to respective D's
    for i in range(10):
        x = int(np.random.rand()*X.shape[0])
        epsvec = np.zeros((n_elems,1))
        epsvec[x] = epsilon
        cost_high = costFunction(thetas + epsvec, X, y, lmbda)
        cost_low  = costFunction(thetas - epsvec, X, y, lmbda)
        mygrad = (cost_high - cost_low) / float(2*epsilon)
        print ("Element: %d. Numerical Gradient = %f. BackProp Gradient = %f."%(x,mygrad,D[x]))
        

Ds = backProp(np.r_[Theta1.ravel(),Theta2.ravel()],X,y,0)
checkGradient(thetaUnrolled,Ds,X,y,0)
        


Element: 2111. Numerical Gradient = -0.000089. BackProp Gradient = -0.000089.
Element: 2628. Numerical Gradient = 0.000064. BackProp Gradient = 0.000064.
Element: 4135. Numerical Gradient = 0.000145. BackProp Gradient = 0.000145.
Element: 2346. Numerical Gradient = -0.000000. BackProp Gradient = -0.000000.
Element: 579. Numerical Gradient = -0.000004. BackProp Gradient = -0.000004.
Element: 3898. Numerical Gradient = 0.000070. BackProp Gradient = 0.000070.
Element: 276. Numerical Gradient = 0.000115. BackProp Gradient = 0.000115.
Element: 4837. Numerical Gradient = -0.000001. BackProp Gradient = -0.000001.
Element: 4442. Numerical Gradient = -0.000001. BackProp Gradient = -0.000001.
Element: 1530. Numerical Gradient = -0.000215. BackProp Gradient = -0.000215.


In [35]:
import scipy.optimize
import time
def trainNN(lmbda):
    generatedThetas = initRandomThetas()
    generatedThetas = generatedThetas.reshape(len(generatedThetas),1)
    
    result = scipy.optimize.fmin_cg(costFunction, fprime=backProp, x0=generatedThetas, \
                               args=(X,y,lmbda),maxiter=50,disp=True,full_output=True)
    return result[0]

start = time.time()
trainThetas = trainNN(0)
end = time.time()
print("Total time is %.2f seconds" %(end - start))

         Current function value: 0.021647
         Iterations: 50
         Function evaluations: 149
         Gradient evaluations: 149
Total time is 49.29 seconds


In [37]:
def predictNN(row, index):
    classes = np.arange(1,11) 
    a3 = aVals[2].T
    a3Row = a3[index]
    rVal = classes[np.argmax(a3Row)]
    return rVal 
    

def propogateForward(row,thetas):
    theta1 = thetas[0:(hiddenSize*(inputSize+1))].reshape(hiddenSize,(inputSize+1))
    theta2 = thetas[(hiddenSize*(inputSize+1)):].reshape(10,(hiddenSize+1))
    
    #row = np.insert(row,0,1)
    z2 = theta1.dot(row)
    a2 = expit(z2)
    a2 = np.insert(a2,0,1) #Add the bias unit

    
    
    z3 = theta2.dot(a2)
    a3 = expit(z3)
    print(a3)
    classes = np.arange(1,11) 
    rVal = classes[np.argmax(a3)]

    return rVal    
    
def computeAccuracy(thetas, X, y):
    m = X.shape[0]
    numCorrect = 0
    #costFunction(thetas, Xtest, yTest, 0)
    for i in range(m):
        if int(propogateForward(X[i],thetas) == int(y[i])):
            numCorrect += 1
    print("Training set accuracy is %.2f" %(100*(numCorrect/m)))
#computeAccuracy(trainThetas,Xtest,yTest)
propogateForward(X[0],trainThetas)

[2.37623443e-08 1.85503877e-05 8.21986268e-05 3.31721182e-12
 8.35657125e-05 5.14301574e-06 2.03117635e-05 3.37015530e-08
 9.27134629e-04 9.99989807e-01]


10

In [48]:
# redoing nn but using pytorch
import torch
import torch.tensor as tensor
import torch.nn as nn
from torch.autograd import Variable
import torch.optim
import torchvision.models as models
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import sys




dataFile = "data/ex4data1.mat"
data = scipy.io.loadmat(dataFile)

# X has 5000 examples and 400 features making it 5000x400
X_torch = data['X']
y_torch = data['y']


inputSize = 400
hiddenSize = 25
outputSize = 10

X_torch = torch.tensor(X_torch, dtype = torch.float)
X_torch = Variable(X_torch, requires_grad = False)

y_torch = torch.tensor(y_torch, dtype = torch.float)
y_torch = Variable(y_torch, requires_grad = True)

m_torch = X_torch.shape[0]

class Model(nn.Module):
    def __init__(self):
        super(Model,self).__init__()
        self.hidden = nn.Linear(inputSize,hiddenSize)
        self.output = nn.Linear(hiddenSize,outputSize)
       
    
    def forward(self,x):
        xT = F.relu(self.hidden(torch.t(x)))
        y_pred = self.output(xT)
        return y_pred

nNet = Model()
nNet.parameters()

print(nNet)


modelSeq = torch.nn.Sequential(
                    torch.nn.Linear(inputSize, hiddenSize),
                    torch.nn.ReLU(),
                    torch.nn.Linear(hiddenSize, outputSize))


loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(modelSeq.parameters(), lr=0.0003)

classes = np.arange(1,11) 
#rVal = classes[np.argmax(hi.detach().numpy())]
#print(rVal)

def train():
    for i in range(m_torch):
        optimizer.zero_grad()

        y_pred = modelSeq(X_torch[i])
        loss = loss_fn(y_pred,y_torch[i])
        loss.backward()
        optimizer.step()

        if (i == m_torch - 1):
            print("#%d - %.9f" %(i, loss.item()))

            
            
            
start = time.time()
for i in range(100):
    sys.stdout.write("Iteration %d "%i)
    sys.stdout.flush()
    train()
end = time.time()
print("Training Completed- Total time is %f seconds" %(end-start))
        

Model(
  (hidden): Linear(in_features=400, out_features=25, bias=True)
  (output): Linear(in_features=25, out_features=10, bias=True)
)
Iteration 0 #4999 - 2.184286594
Iteration 1 #4999 - 2.175432444
Iteration 2 #4999 - 1.160023808
Iteration 3 #4999 - 0.414475501
Iteration 4 #4999 - 0.161665350
Iteration 5 #4999 - 0.078226291
Iteration 6 #4999 - 0.042344250
Iteration 7 #4999 - 0.021165064
Iteration 8 #4999 - 0.011189546
Iteration 9 #4999 - 0.007218057
Iteration 10 #4999 - 0.003128105
Iteration 11 #4999 - 0.001192358
Iteration 12 #4999 - 0.000356788
Iteration 13 #4999 - 0.000977896
Iteration 14 #4999 - 0.003494910
Iteration 15 #4999 - 0.007234833
Iteration 16 #4999 - 0.009599572
Iteration 17 #4999 - 0.016272668
Iteration 18 #4999 - 0.027236119
Iteration 19 #4999 - 0.037949860
Iteration 20 #4999 - 0.057277553
Iteration 21 #4999 - 0.091280200
Iteration 22 #4999 - 0.089603834
Iteration 23 #4999 - 0.106563091
Iteration 24 #4999 - 0.110936083
Iteration 25 #4999 - 0.113570169
Iteration 26 #49

In [56]:
count = 0
classes = np.arange(1,11) 

for i in range(m_torch):
    expected = modelSeq(X_torch[i])

    index = classes[np.argmax(expected.detach().numpy())]
    if (index == y_torch[i]):
        count += 1
        
print(count/m * 100)



12.32
