In [2]:
# load "Iris_3class.csv" to Google Colab
from google.colab import files
uploaded = files.upload()

Saving Iris_3class.csv to Iris_3class.csv


In [3]:
import pandas as pd
raw_data = pd.read_csv("Iris_3class.csv", header = None)
raw_data.values.shape

(150, 5)

In [4]:
import numpy as np
raw_data = raw_data.values

X_train = raw_data[:,:4]
y_train = raw_data[:,4:5].astype(int)
print(X_train.shape, y_train.shape)
print(X_train.dtype, y_train.dtype)

(150, 4) (150, 1)
float64 int64


In [5]:
#Convert array to one-hot encoding
def to_one_hot(Y):
    n_col = np.amax(Y) + 1
    binarized = np.zeros((len(Y), n_col))
    for i in range(len(Y)):
        binarized[i, Y[i]] = 1.
    return binarized

In [6]:
y_train = to_one_hot(y_train)
print(X_train.shape, y_train.shape)
print(X_train.dtype, y_train.dtype)

(150, 4) (150, 3)
float64 float64


In [7]:
#verify one-hot encoding
y_train[0:5,:]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [26]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_der(x):
    return sigmoid(x) *(1-sigmoid (x))

def softmax(A):
    expA = np.exp(A)
    return expA / expA.sum(axis=1, keepdims=True)

def cross_entropy(predictions):
    predictions = np.clip(predictions, 1e-9, 1. - 1e-9)
    N = predictions.shape[0]
    ce = -np.sum(y_train*np.log(predictions+1e-9))/len(predictions)
    return ce


def ann(W, X_train, y_train):

  num_hidden = 5
  num_features = 4
  num_outputs = 3

  #Weights
  w0 = W[:20].reshape(num_features, num_hidden)
  w1 = W[20:].reshape(num_hidden, num_outputs)

  #Feed forward
  layer0 = X_train
  layer1 = sigmoid(np.dot(layer0, w0))
  layer2 = np.dot(layer1, w1)
  
  # softmax
  output = softmax(layer2)
 
  #Back propagation using gradient descent
  
  #cross-entropy loss
  error = cross_entropy(output)

  #initialize gradients to zero 
  dw0 = np.zeros(w0.shape)
  dw1 = np.zeros(w1.shape)

  #determine gradients
  dw1 += np.dot((layer1.T),(output-y_train))
  dw0 += np.dot(X_train.T, (np.dot((output-y_train),w1.T)*(sigmoid_der(np.dot(layer0,w0)))))

  #combine gradients into one vector
  dW = np.array(list(dw0.flatten()) + list(dw1.flatten()))

  return (error, dW, output)


In [27]:
num_hidden = 5
num_features = 4
num_outputs = 3
  
#initialize weights
w0 = 2*np.random.random((num_features, num_hidden)) - 1
w1 = 2*np.random.random((num_hidden, num_outputs)) - 1

#combine weights into a single vector
W = np.array(list(w0.flatten()) + list(w1.flatten()))

#train network
n = 0.001
iterations = 100000
errors = []
for i in range(iterations):
  (error, dW, y_pred) = ann(W, X_train, y_train)
  W += -dW * n
  errors.append(error)

In [28]:
#examine predictions on training data
np.round(y_pred, 1)

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. 

# Calculating numerical gradients

In [35]:

num_hidden = 5
num_features = 4
num_outputs = 3

#initialize weights
w0 = 2*np.random.random((num_features, num_hidden)) - 1
w1 = 2*np.random.random((num_hidden, num_outputs)) - 1

#combine weights
W = np.array(list(w0.flatten()) + list(w1.flatten()))

#compute gradients analytically
(error, dW, y_pred) = ann(W, X_train, y_train)
dW /= len(y_train)

#compute gradients numerically
dW_num = np.zeros((len(W),1))

e = 0.00001 #pertubation

for ind in range(len(W)):
  #reset gradients
  We1 = np.array(list(w0.flatten()) + list(w1.flatten()))
  We2 = np.array(list(w0.flatten()) + list(w1.flatten()))
  
  #increment slightly
  We1[ind] = We1[ind] + e
  We2[ind] = We2[ind] - e
  
  #compute errors
  (error_e1, dW_e1, y_pred1) = ann(We1, X_train, y_train)
  (error_e2, dW_e2, y_pred2) = ann(We2, X_train, y_train)
  
  #obtain numerical gradients
  grad_num = (error_e1 - error_e2) / (2*e)
  
  #display difference between numerical and analytic gradients
  print(round(abs(grad_num - dW[ind]), 4), grad_num, dW[ind])


0.0 0.13895969956934096 0.13895970022060414
0.0 -0.14704432197421013 -0.14704432308584345
0.0 0.0009934284728174703 0.000993428496217484
0.0 0.25069613155404014 0.2506961335868969
0.0 -0.009733861250360576 -0.00973386129440734
0.0 0.12675473807233217 0.12675473863713752
0.0 -0.07437539086829759 -0.07437539138055017
0.0 -8.800257189633952e-05 -8.800256905476275e-05
0.0 0.153993958895704 0.1539939599874585
0.0 -0.007766261722608191 -0.007766261756314029
0.0 -0.02530927269983607 -0.025309272792825264
0.0 -0.08174086862933194 -0.08174086931949247
0.0 0.0017020655773691826 0.0017020656024929867
0.0 0.07433419665359153 0.07433419769773117
0.0 -0.0005191210017407855 -0.0005191209924669629
0.0 -0.021658947935154767 -0.02165894805317724
0.0 -0.019032733178381278 -0.019032733359357006
0.0 0.000619593742978708 0.000619593744920776
0.0 0.0025857247010208084 0.0025857249443658583
0.0 0.000491270724101156 0.0004912707326097312
0.0 -0.055566978263144044 -0.055566978518321646
0.0 -0.005915586820393059