In [6]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io #Used to load the OCTAVE *.mat files
import scipy.misc #Used to show matrix as an image
import matplotlib.cm as cm #Used to display images in a specific colormap
import random #To pick random images to display
import math
import itertools
from scipy.special import expit #Vectorized sigmoid function
import array


In [7]:
dataFile = "data/ex4data1.mat"
data = scipy.io.loadmat(dataFile)


# X has 5000 examples and 400 features making it 5000x400
X = data['X']
y = data['y']

# training set / test set code
# shuffling data set because later on pd.get_dummies needs to identify 10 unique elements but can't if the data only contains 
# classifications for the first 6 digits
combined = np.append(X, y, axis=1)
np.random.shuffle(combined)

yComb = combined[:,400]
yComb = yComb.reshape(len(yComb),1)

Xcomb = combined[:,0:400]
Xcomb = Xcomb.reshape(len(Xcomb),400)

# will be using 70% of data to train hence 3500 x 400
Xtrain = Xcomb[0:3500,:]
Xtrain = np.insert(Xtrain, 0, 1, axis=1)
yTrain = yComb[0:3500]

# then the test set will be 1500x400
Xtest = Xcomb[3500:5001,:]
yTest = yComb[3500:5000]

#end of train/test set code

X = np.insert(X, 0, 1, axis = 1)



In [8]:
# taken from ex3
def reshapeRow(row):
    """
    @param {row} 1 x 401 matrix since an image of a digit is 20x20 + 1 that was added as a bias
    Function that takes in the pixel intensity values and puts it into a 20x20 square 
    """ 
    # the [1:] is used to take everything after the 1st index 
    
    return row[1:].reshape(20,20).T

def displayData(indiciesToDisplay = None):
    """
    Function that selects 100 random examples for the 5000 we have and organizes
    them into a 10x10 matrix
    """
    width = 20
    height = 20
    numRows = 10
    numCols = 10
    
    if not indiciesToDisplay:
        indiciesToDisplay = random.sample(range(X.shape[0]), numRows * numCols)

    
    bigPicture = np.zeros((height * numRows, width * numCols))
    
    iRow = 0
    iCol = 0

    for i in indiciesToDisplay :
        if iCol == numCols:
            iCol = 0
            iRow += 1
 
        curImg = reshapeRow(X[i])
        bigPicture[iRow * height :iRow * height + curImg.shape[0], 
                    iCol * width : iCol * width + curImg.shape[1]] = curImg
        iCol += 1 
    fig = plt.figure( figsize = (6,6) )
    img = scipy.misc.toimage( bigPicture )
    plt.imshow(img,cmap = cm.Greys_r)

#displayData()

In [9]:
thetaFile = "data/ex4weights.mat"
thetas1 = scipy.io.loadmat(thetaFile)

# Theta1.shape = 25 x 401
Theta1 = thetas1['Theta1']

# Theta2.shape = 10 x 26
Theta2 = thetas1['Theta2']

thetaUnrolled = np.r_[Theta1.ravel(), Theta2.ravel()]

inputSize = 400
hiddenSize = 25
outputSize = 10


# m = 5000
m = X.shape[0]
# n = 401
n = X.shape[1]

aVals = [None] * 3
zVals = [None] * 2

In [10]:
# the following 2 functions are needed because fmin_cg passes in X as an unrolled vector
def reshape(X, m, n):
    return np.array(X).reshape((m,n))

def flatten(X):
    return np.array(X.flatten()).reshape((X.shape[0]*(X.shape[1]),1))


# used to flatted D1, D2, taken from kaleko github
def flattenParams(thetas_list):
    input_layer_size = 400
    hidden_layer_size = 25
    output_layer_size = 10
    """
    Hand this function a list of theta matrices, and it will flatten it
    into one long (n,1) shaped numpy array
    """
    flattened_list = [ mytheta.flatten() for mytheta in thetas_list ]
    combined = list(itertools.chain.from_iterable(flattened_list))
    assert len(combined) == (input_layer_size+1)*hidden_layer_size + \
                            (hidden_layer_size+1)*output_layer_size
    return np.array(combined).reshape((len(combined),1))

def sigmoid(X, theta):
    return expit(np.dot(X,theta))

def costFunction(thetas, X, y, lmbda):
    
    X = reshape(X,3500,401)
    theta1 = thetas[0:(hiddenSize*(inputSize+1))].reshape(hiddenSize,(inputSize+1))
    theta2 = thetas[(hiddenSize*(inputSize+1)):].reshape(10,(hiddenSize+1))
    
    m = X.shape[0]

    a1 = X # 5000 x 401
    z2 = theta1.dot(X.T)
    
    a2 = expit(z2) # 25 x 5000
    a2 = np.insert(a2,0,1,axis=0)
       
    z3 = theta2.dot(a2)
    a3 = expit(z3)
    
    aVals[0] = a1
    aVals[1] = a2
    aVals[2] = a3
    
    zVals[0] = z2
    zVals[1] = z3
     
    tempY = pd.get_dummies(y.ravel()).values
    
    J = (-1/m)*np.sum(np.log(a3.T)*(tempY)+np.log(1-a3).T*(1-tempY)) + \
    (lmbda/(2*m))*(np.sum(np.square(theta1[:,1:])) + np.sum(np.square(theta2[:,1:])))  
    
    return J

In [11]:
def testCost():
    J = costFunction(thetaUnrolled,Xtrain,yTrain,1)
    #print("Expected value for training set is ~ 1.83\nActual value %.9f"%J)

    #print("Expected value is ~ 0.383\nActual value %.9f"%J)
testCost()



In [219]:
def gradientSigmoid(z):
    return expit(z)*(1-expit(z))

def initRandomThetas():
    epsilon = 0.12
    theta1Size = (hiddenSize, inputSize + 1)
    theta2Size = (outputSize, hiddenSize + 1)
    
    
    theta1 = np.random.rand(*theta1Size).ravel()*2*epsilon - epsilon
    theta2 = np.random.rand(*theta2Size).ravel()*2*epsilon - epsilon
    
    
    thetas = np.r_[Theta1.ravel(), Theta2.ravel()]
    return thetas

temp = initRandomThetas()


(10285,)


In [13]:

def backProp(thetas,X,y, lmbda):
    X = reshape(X,3500,401)
    m = X.shape[0]
    
    theta1 = thetas[0:(hiddenSize*(inputSize+1))].reshape(hiddenSize,(inputSize+1))
    theta2 = thetas[(hiddenSize*(inputSize+1)):].reshape(10,(hiddenSize+1))
    
    a1 = aVals[0]
    a2 = aVals[1].T
    a3 = aVals[2].T
    
    z2 = zVals[0].T
    z3 = zVals[1].T
    
        
    D1 = np.zeros((theta1.shape))
    D2 = np.zeros((theta2.shape))
    
    for i in range(m):
        a1Cur = X[i]
        a2Cur = a2[:,1:][i]
        z2Cur = z2[i]
        a3Cur = a3[i]
        z3Cur = z3[i]
    
        a1Cur = a1Cur.reshape(len(a1Cur),1)
        a2Cur = a2Cur.reshape(len(a2Cur),1)
        z2Cur = z2Cur.reshape(len(z2Cur),1)
        a3Cur = a3Cur.reshape(len(a3Cur),1)
        z3Cur = z3Cur.reshape(len(z3Cur),1)
        
        yT = np.zeros((10,1))
        yT[int(y[i]) - 1] = 1
                
        delta3 = a3Cur - yT
        delta2 = theta2[:,1:].T.dot(delta3)*gradientSigmoid(z2Cur)
        
        
        a2Cur = np.insert(a2Cur, 0, 1, axis=0)

        D1 = D1 + delta2.dot(a1Cur.T)
                
        delta3 = delta3.reshape(len(delta3),1)
        D2 = D2 + delta3.dot(a2Cur.T)
        
    D1 = D1/m
    D2 = D2/m
    
    D1[:,1:] = D1[:,1:] + (float(lmbda)/m)*theta1[:,1:]
    D2[:,1:] = D2[:,1:] + (float(lmbda)/m)*theta2[:,1:]
    
    return flattenParams([D1, D2]).flatten()
       
def checkGradient(thetas,D,X,y,lmbda):
    epsilon = 0.0001
    xT = flatten(X)
    n_elems = len(thetas) 
    thetas = thetas.reshape(len(thetas),1)
    #Pick ten random elements, compute numerical gradient, compare to respective D's
    for i in range(10):
        x = int(np.random.rand()*X.shape[0])
        epsvec = np.zeros((n_elems,1))
        epsvec[x] = epsilon
        cost_high = costFunction(thetas + epsvec, X, y, lmbda)
        cost_low  = costFunction(thetas - epsvec, X, y, lmbda)
        mygrad = (cost_high - cost_low) / float(2*epsilon)
        print ("Element: %d. Numerical Gradient = %f. BackProp Gradient = %f."%(x,mygrad,D[x]))
        

Ds = backProp(np.r_[Theta1.ravel(),Theta2.ravel()],Xtrain,yTrain,0)
checkGradient(thetaUnrolled,Ds,Xtrain,yTrain,0)
        


Element: 1404. Numerical Gradient = 0.000001. BackProp Gradient = 0.000001.
Element: 730. Numerical Gradient = 0.000249. BackProp Gradient = 0.000249.
Element: 1836. Numerical Gradient = -0.000410. BackProp Gradient = -0.000410.
Element: 1223. Numerical Gradient = 0.000000. BackProp Gradient = 0.000000.
Element: 3158. Numerical Gradient = -0.000157. BackProp Gradient = -0.000157.
Element: 969. Numerical Gradient = 0.000078. BackProp Gradient = 0.000078.
Element: 1342. Numerical Gradient = 0.000112. BackProp Gradient = 0.000112.
Element: 2933. Numerical Gradient = 0.000089. BackProp Gradient = 0.000089.
Element: 2960. Numerical Gradient = -0.000011. BackProp Gradient = -0.000011.
Element: 832. Numerical Gradient = -0.000001. BackProp Gradient = -0.000001.


In [14]:
import scipy.optimize
import time
def trainNN(lmbda):
    generatedThetas = initRandomThetas()
    generatedThetas = generatedThetas.reshape(len(generatedThetas),1)
    
    result = scipy.optimize.fmin_cg(costFunction, fprime=backProp, x0=generatedThetas, \
                               args=(Xtrain,yTrain,lmbda),maxiter=50,disp=True,full_output=True)
    return result[0]

start = time.time()
trainThetas = trainNN(0)
end = time.time()
print("Total time is %.2f seconds" %(end - start))

         Current function value: 0.029367
         Iterations: 50
         Function evaluations: 133
         Gradient evaluations: 133
Total time is 34.87 seconds


In [444]:
def predictNN(row, index):
    classes = np.arange(1,11) 
    a3 = aVals[2].T
    a3Row = a3[index]
    rVal = classes[np.argmax(a3Row)]
    return rVal 
    

def propogateForward(row,thetas):
    theta1 = thetas[0:(hiddenSize*(inputSize+1))].reshape(hiddenSize,(inputSize+1))
    theta2 = thetas[(hiddenSize*(inputSize+1)):].reshape(10,(hiddenSize+1))
    
    row = np.insert(row,0,1)
    z2 = theta1.dot(row)
    a2 = expit(z2)
    a2 = np.insert(a2,0,1) #Add the bias unit

    
    
    z3 = theta2.dot(a2)
    a3 = expit(z3)
    classes = np.arange(1,11) 
    rVal = classes[np.argmax(a3)]

    return rVal    
    
def computeAccuracy(thetas, X, y):
    m = X.shape[0]
    numCorrect = 0
    #costFunction(thetas, Xtest, yTest, 0)
    for i in range(m):
        if int(propogateForward(X[i],thetas) == int(y[i])):
            numCorrect += 1
    print("Training set accuracy is %.2f" %(100*(numCorrect/m)))
computeAccuracy(trainThetas,Xtest,yTest)


[1.14406686e-08 2.77085260e-06 2.77506390e-05 7.56977843e-11
 9.69643953e-05 1.15203435e-05 2.34320505e-04 3.76855427e-08
 7.98772151e-04 9.99989299e-01]


10

In [442]:
# redoing nn but using pytorch
import torch
import torch.tensor as tensor
import torch.nn as nn
from torch.autograd import Variable
import torch.optim
import torchvision.models as models
from torch.nn.parameter import Parameter
import torch.nn.functional as F



dataFile = "data/ex4data1.mat"
data = scipy.io.loadmat(dataFile)

# X has 5000 examples and 400 features making it 5000x400
X = data['X']
y = data['y']

#X = np.insert(X, 0, 1, axis = 1)

inputSize = 400
hiddenSize = 25
outputSize = 10

X = torch.tensor(X, dtype = torch.float)
X = Variable(X, requires_grad = False)

y = torch.tensor(y, dtype = torch.float)
y = Variable(y, requires_grad = True)

W1 = Variable(torch.randn(inputSize + 1, hiddenSize).type(torch.float), requires_grad = True)
W2 = Variable(torch.randn(hiddenSize + 1, outputSize).type(torch.float), requires_grad = True)

class Model(nn.Module):
    def __init__(self):
        super(Model,self).__init__()
        self.hidden = nn.Linear(inputSize,hiddenSize)
        self.output = nn.Linear(hiddenSize,outputSize)
        
        self.W1 = Parameter(torch.randn(inputSize,hiddenSize))
        self.W2 = Parameter(torch.randn(hiddenSize,outputSize))
        
    def forward(self,row):
        a2 = F.linear(torch.t(row),torch.t(self.W1))
        a3 = F.linear(a2,torch.t(self.W2))
        print(a3)
        return a3
    
    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

nNet = Model()

input = X[0]
input = tensor(input.numpy().reshape(len(input),1))
out = nNet(input)



target = torch.randn(10)

params = list(nNet.parameters())

target = target.view(10,-1)

criterion = nn.MSELoss()

loss = criterion(out,target)



#nNet.zero_grad()

print('bias.grad before backward')
print(nNet.hidden.bias.grad)

loss.backward()

print('bias.grad after backward')
print(nNet.hidden.bias.grad)














#w1 = params[0].detach().numpy()
#w2 = params[1].detach().numpy()

#w1 = np.insert(w1, 0, 1, axis = 0)
#w2 = np.insert(w2, 0, 1, axis = 0)
#print(w1.shape)
#print(w2.shape)

#weights = np.r_[w1.ravel(),w2.ravel()]
#computeAccuracy(weights,X,y)


tensor([[-36.7358,  20.5157, -25.3529,  -9.4248, -46.0723,  -2.0947,  27.9001,
           0.6748,  32.1753, -69.0018]], grad_fn=<MmBackward>)
bias.grad before backward
None
bias.grad after backward
None


In [187]:
class Neural_Network(nn.Module):
    def __init__(self,):
        super(Neural_Network, self).__init__()
        self.inputSize = 400
        self.hiddenSize = 25
        self.outputSize = 10
        
        self.W1 = torch.randn(self.inputSize + 1, self.hiddenSize)
        self.W2 = torch.randn(self.hiddenSize + 1, self.outputSize)
        
        self.D1 = torch.randn(self.inputSize + 1, self.hiddenSize)
        self.D2 = torch.randn(self.hiddenSize + 1, self.outputSize)

    
    def forward(self, row):
            
        self.z2 = torch.matmul(row,self.W1)
        self.a2 = expit(self.z2)
        self.a2 = np.insert(self.a2, 0, 1, axis = 0)
        
        self.z3 = torch.matmul(self.a2, self.W2)
        self.a3 = expit(self.z3)
              
        classes = np.arange(1,11) 
        rVal = torch.tensor(classes[np.argmax(self.a3.numpy())])
        return rVal.item()
    
    def sigmoid(self, s):
        return 1 / (1 + torch.exp(-s))
    
    def sigmoidPrime(self, s):
        return s * (1-s)
    
    def backward(self, X, y, out, index):
        
        yT = np.zeros((10,1))
        yT[int(y[index]) - 1] = 1
        yT = torch.tensor(yT, dtype=torch.float)
        
        
        self.delta3 = out - yT

        self.delta2 = torch.matmul(self.W2[:1,:],self.delta3) * self.sigmoidPrime(self.z2)

        xTemp = X.numpy().reshape(len(X),1)
        xTemp = torch.tensor(xTemp, dtype = torch.float)
        self.D1 += torch.matmul(xTemp, self.delta2)
        
        
        tempZ2 = torch.tensor(self.z2.numpy().reshape(len(self.z2),1), dtype = torch.float)
        self.D2[1:,:] += torch.matmul(tempZ2, torch.t(self.delta3))
    
    def train(self, X, y, index):
        out = self.forward(X)
        self.backward(X, y, out, index)
    
    def saveWeights(self, model):
        self.D1 = self.D1/5000
        self.D1 = self.D2/5000
        torch.save(model, "NN")
        
    def predict(self):
        print ("Predicted data based on trained weights: ")
        print ("Input (scaled): \n" + str(X))
        print ("Output: \n" + str(self.forward(X)))
        

X.forward()
X.backward()
    
NN = Neural_Network()

tW1 = NN.W1.numpy()
tW2 = NN.W2.numpy()
t_theta = np.r_[tW1.ravel(),tW2.ravel()]

computeAccuracy(t_theta,X,y)
X_t = np.insert(X, 0, 1, axis = 1)
for i in range(X_t.shape[0]):  # trains the NN 1,000 times
    #print ("#" + str(i) + " Loss: " + str(torch.mean((y[i] - NN(X_t[i]))**2).detach().item()))  # mean sum squared loss
    NN.train(X_t[i], y, i)
print("Training Done")
    
NN.saveWeights(NN)


AttributeError: 'Tensor' object has no attribute 'forward'

In [181]:


computeAccuracy(t_theta,X,y)

Training set accuracy is 16.86
