In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import tensorflow
import os
import matplotlib.pyplot as plt

In [2]:

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
os.chdir('/content/drive/MyDrive/Neural networks from scratch/')

Read the csv file that contains iris dataset

In [4]:
data = pd.read_csv('iris/iris.data', names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 's_class'])

In [5]:
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,s_class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


Create a new column called class that contains only numeric observations for each class

In [6]:
n_class = {'Iris-setosa': 1, 'Iris-versicolor': 2, 'Iris-virginica': 3}
data['class'] = data['s_class'].replace(n_class)

  data['class'] = data['s_class'].replace(n_class)


In [7]:
df = data[['sepal_length',	'sepal_width',	'petal_length',	'petal_width',	'class']].values

There are all functions used in this notebook

In [8]:
#Function to split the data into train ans test set

def split_train_test(df, size, class_index):

  n = np.round(df.shape[0]*size).astype('int')

  np.random.shuffle(df)

  X_train = df[:n, :class_index]

  X_test = df[n:, :class_index]

  y_train = df[:n, class_index]

  y_test = df[n:, class_index]

  print(f'Train dataset have {X_train.shape[0]} rows and test have {X_test.shape[0]} rows.')

  return X_train, X_test, y_train, y_test

# For the Weights we use gorot normal

def gorot_normal(n_weights, fan_in, fan_out, seed):

  np.random.seed(seed)

  limit = np.sqrt(6 / (fan_in + fan_out))

  random_weights = np.random.uniform(-limit, limit, n_weights)

  random_weights[:, 0] = np.abs(random_weights[:, 0])

  return random_weights

# Activation function ReLu

def ReLu(x):

  x = np.where(x < 0, 0, x)

  return x

# Derivative of activation function ReLu

def relu_derivative(x):

  x = np.where(x <= 0, 0, 1)

  return x

# Softmax function

def softmax(x):

  denom = np.sum(np.exp(x), axis=1).reshape(-1, 1)

  r = np.multiply(np.exp(x), 1/denom)

  return r

def categorical_cross_entropy(y_pred, y_one_hot):

  category = np.multiply(y_pred, y_one_hot)

  j = -np.log(np.sum(category, axis=1).reshape(1, -1))

  return np.sum(j)

def first_nn_training(X, y, lr, weights1, weights2):

  w1 = weights1

  w2 = weights2

  diff_w1 = []

  diff_w2 = []

  accuracy = []

  p_break = 1

  i = 1

  while (p_break > 1e-04):

    # We obtain the outputs of the respective layers

    # FORWARD PROPAGATION

    # First layer with ReLu activation, it means we multiply X values by the
    # weights and the result pass through a ReLu.

    a_1 = ReLu(np.dot(X, w1))

    # Same process as first layer but instead of use X, we are using the result
    # of the first layer multiplied by the respective weights

    a_2 = softmax(np.dot(a_1, w2))

    # We obtain the loss function, in this case categorical corssentropy

    cce = categorical_cross_entropy(a_2, y)

    # Accuracy

    diff = np.argmax(a_2, axis = 1)-np.argmax(y, axis = 1)
    acc = diff[diff == 0].shape[0]/diff.shape[0]

    # Calculation of the gradients, this is the product between the
    # derivative of the loss function and the softmax activation in the output
    # layer

    # BACKPROAGATION AND UPDATED WEIGHTS

    g = a_2 - y

    # This variable help us to compare the difference between weights and
    # configure a limit for stop the iterations.

    pre_w2 = w2

    # Next we calculate the weights gradient, this will be the input for update
    # the weights and it is simply multiply the gradient by the output of the
    # previous layer following the chain rule of differentiation.

    # Output layer weights gradient

    w2_g = np.dot(a_1.T, g)

    # The learning rate (lr) help us to control the influence of the gradient in the
    # updating process of the weights.

    w2 = w2 - lr*w2_g

    # Calculate again the gradient for propagate for the others layers,
    # again following the chain rule of differentiation we just multiply the
    # gradient by the original weights of the output layer.

    g = np.dot(g, pre_w2.T)

    # Gradient of the hidden layer

    g = np.multiply(relu_derivative(a_1), g)

    pre_w1 = w1

    # weights 1 gradient the last step in the chain rule in this
    # specific neural network

    w1_g = np.dot(X_train.T, g)

    # Update the weights

    w1 = w1 - lr*w1_g

    print(f'Accuracy in step {i + 1} is {acc}')

    # In here we calculate variables to stop the process

    diff_w1.append(np.average(np.abs(pre_w1 - w1)))

    diff_w2.append(np.average(np.abs(pre_w2 - w2)))

    p_break = (np.average(np.abs(pre_w1 - w1))+np.average(np.abs(pre_w2 - w2)))/2

    accuracy.append(acc)

    print(f'Average difference between weights for second layer {np.average(np.abs(pre_w2 - w2))} and fisrt layer {np.average(np.abs(pre_w1 - w1))}')

    i += 1

  return w1, w2, diff_w1, diff_w2, accuracy

def first_nn_test(X, y, weights1, weights2):

  # this function apply the updated weights for the test dataset

  a_1 = ReLu(np.dot(X, w1))

  a_2 = softmax(np.dot(a_1, w2))

  diff = np.argmax(a_2, axis = 1)-np.argmax(y, axis = 1)
  acc = diff[diff == 0].shape[0]/diff.shape[0]

  pred = np.argmax(a_2, axis = 1)+1

  print(f'Accuracy in test set is {acc*100}%')

  return pred



Take train and test data sets from the iris dataset

In [9]:
X_train, X_test, y_train, y_test = split_train_test(df, 0.8, 4)

Train dataset have 120 rows and test have 30 rows.


Truth values to one-hot encoding

In [10]:
# y to one-hot

unique_values = np.unique(y_train)

y_train_one_hot = (y_train[:, None] == unique_values).astype(int)

unique_values = np.unique(y_test)

y_test_one_hot = (y_test[:, None] == unique_values).astype(int)

In [11]:
# Neural network will have 1 hidden layer with 20 nodes and output layer with 3 the classes of the dataset

nodes_first = 20
nodes_second = 3
seed = 123456
lr = 0.0001

In [12]:
# First add the ones column to the df for the bias term.

ones = np.ones((X_train.shape[0], 1))

X_train = np.hstack((ones, X_train))

ones = np.ones((X_test.shape[0], 1))

X_test = np.hstack((ones, X_test))

In [13]:
# Calculate the random weights.

weights_1 = gorot_normal((X_train.shape[1], nodes_first+1), nodes_first, nodes_second, seed)

weights_2 = gorot_normal((nodes_first+1, nodes_second), nodes_first, nodes_second, seed)

In [14]:
w1, w2, diff_w1, diff_w2, accuracy = first_nn_training(X_train, y_train_one_hot, lr, weights_1, weights_2)

Accuracy in step 2 is 0.31666666666666665
Average difference between weights for second layer 0.005138695208360651 and fisrt layer 0.003838482345149462
Accuracy in step 3 is 0.31666666666666665
Average difference between weights for second layer 0.004527723623510301 and fisrt layer 0.0034031193060633084
Accuracy in step 4 is 0.31666666666666665
Average difference between weights for second layer 0.0038548958795456027 and fisrt layer 0.002811818331818252
Accuracy in step 5 is 0.31666666666666665
Average difference between weights for second layer 0.003139648376174368 and fisrt layer 0.0021123420435464326
Accuracy in step 6 is 0.31666666666666665
Average difference between weights for second layer 0.002486840528827372 and fisrt layer 0.0014390701600807195
Accuracy in step 7 is 0.31666666666666665
Average difference between weights for second layer 0.0020085837124157553 and fisrt layer 0.0010805649863360042
Accuracy in step 8 is 0.31666666666666665
Average difference between weights for s

In [15]:
first_nn_test(X_test, y_test_one_hot, w1, w2)

Accuracy in test set is 100.0%


array([2, 1, 3, 3, 1, 1, 2, 2, 3, 1, 3, 1, 1, 2, 2, 3, 3, 3, 2, 2, 3, 3,
       1, 3, 1, 1, 1, 1, 1, 3])