In [2]:
"""
A simple NumPy implementation of an XOR gate, written to illustrate the
backpropagation algorithm.

"""

import numpy as np



In [3]:
# Truth table of the XOR gate: every row of X is one input pair and the
# matching row of Y holds the desired output for that pair.
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])
Y = np.array([0, 1, 1, 0]).reshape(-1, 1)  # column vector: one target per row of X

# A single print with a newline separator emits the same text as four
# consecutive print calls.
print("Input", X, "\nOutput", Y, sep="\n")

Input
[[0 0]
 [0 1]
 [1 0]
 [1 1]]

Output
[[0]
 [1]
 [1]
 [0]]


In [4]:
# Problem dimensions, read from the input matrix.
m, n = X.shape[0], X.shape[1]  # m training examples, n input features

# Hyper-parameters.
hidden_s = 2  # number of units in the hidden layer
l_r = 1       # learning rate used in the weight updates

# The weights the model learns start out as matrices of random values drawn
# uniformly from [0, 1); backpropagation then updates them in place.
#
# In general a weight matrix has shape
#     (size of current layer + 1, size of next layer)
# where the extra row holds the weights applied to the bias unit.
#
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights (and therefore the trained result) differ from run to run.
theta1 = np.random.random((n + 1, hidden_s))
theta2 = np.random.random((hidden_s + 1, 1))

In [5]:
def sigmoid(z):
    """
    Logistic sigmoid activation, applied element-wise.

    Maps any real-valued input (scalar or array) into the open interval
    (0, 1), which lets the linear output of a layer be read as a
    probability.

    z: scalar or numpy array of pre-activation values
    returns: 1 / (1 + exp(-z)), with the same shape as z
    """
    decay = np.exp(-z)
    return 1 / (decay + 1)

In [6]:
def sigmoid_grad(z):
    """
    Derivative of the sigmoid function evaluated at z (element-wise).

    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)), so the
    sigmoid itself only has to be evaluated once.
    """
    activation = sigmoid(z)
    return activation * (1 - activation)

In [7]:
def forward_propagate(X, theta1, theta2):
    """
    Run one forward pass through the two-layer network.

    X:      input matrix, one training example per row
    theta1: weights between the input layer (plus bias) and the hidden layer
    theta2: weights between the hidden layer (plus bias) and the output layer

    Returns the tuple (a1, z1, a2, z3, h3):
        a1 - X with a leading column of ones (the bias unit)
        z1 - a1 multiplied by theta1 (hidden layer pre-activation)
        a2 - sigmoid(z1) with a leading column of ones
        z3 - a2 multiplied by theta2 (output layer pre-activation)
        h3 - sigmoid(z3), the network's prediction
    """
    # One bias entry per training example; reused for both layers.
    bias = np.ones(X.shape[0])

    # Input layer: prepend the bias column, then apply the first weights.
    inputs_with_bias = np.column_stack((bias, X))
    hidden_pre = inputs_with_bias.dot(theta1)

    # Hidden layer: activate, prepend the bias column, apply the second weights.
    hidden_with_bias = np.column_stack((bias, sigmoid(hidden_pre)))
    output_pre = hidden_with_bias.dot(theta2)

    # Output layer: squash the result into (0, 1) to obtain the prediction.
    prediction = sigmoid(output_pre)

    return inputs_with_bias, hidden_pre, hidden_with_bias, output_pre, prediction


In [9]:
# Performing backpropagation
#
# Every iteration runs a forward pass over all training examples, measures
# the error at each layer, and adjusts both weight matrices so as to reduce
# that error.
n_iterations = 1000

for epoch in range(n_iterations):
    # Forward pass with the current weights.
    a1, z1, a2, z3, hyp = forward_propagate(X, theta1, theta2)

    # Output layer error: target minus prediction. It is used as-is, with no
    # sigmoid-gradient factor applied at this layer.
    # NOTE(review): that matches the gradient of a cross-entropy loss with a
    # sigmoid output -- presumably the intended loss; confirm.
    output_delta = Y - hyp

    # Hidden layer error: push the output error back through theta2 (skipping
    # the first row, which belongs to the bias unit), then scale element-wise
    # by the sigmoid gradient at the hidden layer's pre-activation z1.
    hidden_error = np.dot(output_delta, theta2[1:].T)
    hidden_delta = hidden_error * sigmoid_grad(z1)

    # Weight update. Because the error is (target - prediction), the step is
    # added rather than subtracted. theta2 is only modified after the hidden
    # error above has been computed from its previous value.
    theta2 += l_r * np.dot(a2.T, output_delta)
    theta1 += l_r * np.dot(a1.T, hidden_delta)



In [None]:
# Run forward propagation once more so that the outputs reflect the final,
# updated weights.
a1, z1, a2, z3, hyp = forward_propagate(X, theta1=theta1, theta2=theta2)

In [11]:
# Show the model's prediction for each row of X, under a heading.
print("\nPredicted Output", hyp, sep="\n")


Predicted Output
[[ 0.02060586]
 [ 0.98041979]
 [ 0.98041576]
 [ 0.01658563]]
