# Deep Learning for Computer Vision:  Multi-class neural network


Let's start by importing some libraries.

In [None]:
import math
import plotly.express as px
import pandas as pd
from sklearn import datasets
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Let's make up our 2D data for our three classes.

In [None]:
# Build a noisy, XOR-like 2-D toy data set with THREE classes (0, 1, 2).
n_samples = 5000

# Base data: one noisy blob (sigma = 0.20) around each corner of the unit
# square.  The two "XOR-true" corners (1,0) and (0,1) share class 0, the
# corner (0,0) is class 1 and the corner (1,1) is class 2.
corners = np.random.randint(0, 2, size=(n_samples, 2))
labels = np.where(corners[:, 0] != corners[:, 1], 0,
                  np.where(corners[:, 0] == 0, 1, 2))
points = 1.0 * corners + 0.20 * np.random.normal(size=(n_samples, 2))

# Overwrite randomly chosen rows with two extra blobs: class 1 around
# (1.5, 1.5) and class 2 around (0.5, -0.75).  Rows are drawn with replacement
# (0.25 * n_samples draws per blob), so duplicates collapse to one overwrite.
# randint's upper bound is exclusive, hence `n_samples` (not n_samples - 1) so
# that every row, including the last one, can be selected.
n_draws = int(0.25 * n_samples)
for centre, label in (((1.5, 1.5), 1), ((0.5, -0.75), 2)):
    rows = np.unique(np.random.randint(n_samples, size=n_draws))
    points[rows] = np.array(centre) + 0.20 * np.random.normal(size=(rows.size, 2))
    labels[rows] = label

# All columns are stored as floats; the label column is cast to int at training time.
data = pd.DataFrame({'x1': points[:, 0], 'x2': points[:, 1], 'y': labels.astype(float)})

# Now let's normalize this data (zero mean, unit sample standard deviation per feature).
for col in ('x1', 'x2'):
    data[col] = (data[col] - data[col].mean()) / data[col].std()

data.head()

Let's massage this data into a numpy format.

In [None]:
# Split the frame into features (every column except the last) and the
# target variable (the last column).
cols = data.shape[1]
features = data.iloc[:, :cols - 1]
target = data.iloc[:, cols - 1:]

# Downstream code works on numpy matrices, so convert both pieces.
X = np.matrix(features.values)
y = np.matrix(target.values)
print(X)
print(y)

Let's make a sloppy plotting function for our three-class data.

In [None]:
# Sloppy function for plotting our data
def plot_data(X, y_predict):
    """Scatter-plot 2-D points, one colour per class label (0, 1 and 2).

    Args:
        X: 2-D array or matrix; column 0 is plotted on the x axis and
            column 1 on the y axis.
        y_predict: one class label per row of ``X``; a flat sequence and a
            single-column array/matrix are both accepted.
    """
    # Work on plain ndarrays so row selection yields 1-D coordinate vectors.
    points = np.asarray(X)
    labels = np.asarray(y_predict).ravel()

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

    for label in (0, 1, 2):
        rows = np.flatnonzero(labels == label)
        ax.plot(points[rows, 0], points[rows, 1],
                marker='o', linestyle='', ms=5, label=str(label))

    ax.legend(loc=2)  # loc=2 is the upper-left corner
    ax.set_xlabel('x1')
    ax.set_ylabel('x2')
    ax.set_title('Tricky 3 Class Classification')
    ax.set_aspect('equal', adjustable='box')

    plt.show()

Now let's plot it.

In [None]:
# Show the generated points, coloured by their true class label.
plot_data(X, y)

# Neural Network Modules from scratch

## Layer classes

RELU as activation function

In [None]:
# Global learning rate; Linear.backwards reads it for its parameter update step.
L_RATE = 0.01

# Common layer interface: a link to the preceding layer plus the state that
# the forward pass caches for back-propagation.
class Module:
    """Base class for layers; subclasses override ``forward`` and ``backwards``."""

    def __init__(self):
        # Parameter slots (left as None by parameter-free layers).
        self.W_mat = None
        self.b_vec = None
        # Result of the most recent forward call, kept for backprop.
        self.output = None
        # Preceding layer; the network is a backward-linked list of layers.
        self.prev = None

    def forward(self, *input):
        """Compute the layer output; must be implemented by subclasses."""
        raise NotImplementedError

    def backwards(self, *input):
        """Propagate a gradient to the previous layer; must be implemented by subclasses."""
        raise NotImplementedError

# linear (i.e. linear transformation) layer
class Linear:
    """Fully connected layer computing ``input @ W_mat + b_vec``.

    Attributes:
        prev: preceding layer in the chain (ignored when ``is_input`` is True).
        is_input: True when this is the first layer, i.e. it consumes the raw
            batch instead of the output of ``prev``.
        input: batch seen by the latest ``forward`` call (needed by backprop).
        output: result of the latest ``forward`` call.
        W_mat: weights, shape ``(input_size, output_size)``.
        b_vec: biases, shape ``(1, output_size)``.
    """

    def __init__(self, input_size, output_size, is_input=False):
        self.prev = None  # previous network (linked list of layers)
        self.output = None  # output of forward call for backprop.
        self.is_input = is_input
        self.input = None

        # Biases start at a constant 0.5.
        self.b_vec = np.zeros((1, output_size)) + 0.5
        # Weights are drawn uniformly from [-1, 1).
        self.W_mat = np.random.rand(input_size, output_size) * 2 - 1

    def forward(self, input, loss):
        """Run the forward pass for a batch.

        Args:
            input: array of shape ``(batch_size, input_size)``; for a
                non-input layer it is first passed through ``self.prev``.
            loss: loss object; this layer appends its weight matrix to
                ``loss.weights_array`` so the loss can add an L2 penalty.

        Returns:
            Array of shape ``(batch_size, output_size)``.
        """
        if not self.is_input:
            input = self.prev.forward(input, loss)

        batch = np.asarray(input)
        self.input = batch
        # One matrix product for the whole batch instead of a per-row loop.
        self.output = batch @ self.W_mat + self.b_vec
        loss.weights_array.append(self.W_mat)

        return self.output

    def backwards(self, gradient):
        """Back-propagate ``gradient`` and take one gradient-descent step.

        Args:
            gradient: derivative of the loss w.r.t. this layer's output,
                shape ``(batch_size, output_size)`` (ndarray or matrix).

        Returns:
            Whatever the first layer of the chain returns; the input layer
            itself returns the gradient it received.
        """
        grad = np.asarray(gradient)
        batch_size = grad.shape[0]

        # Batch-averaged parameter gradients.  dW keeps its full
        # (input_size, output_size) shape: averaging it over axis 0 would mix
        # the gradients of different input features into one row.
        dW = self.input.T @ grad / batch_size
        db = np.mean(grad, axis=0).reshape(1, -1)

        # Gradient w.r.t. the layer input; must use the weights BEFORE the update.
        new_grad = grad @ self.W_mat.T

        # In-place updates keep references held elsewhere pointing at live weights.
        self.W_mat -= L_RATE * dW
        self.b_vec -= L_RATE * db

        if self.is_input:
            return gradient

        return self.prev.backwards(new_grad)

# ReLu non-linearity Layer
class RELU(Module):
    """Rectified linear unit: ``max(x, 0)`` applied element-wise."""

    def __init__(self):
        super().__init__()

    def forward(self, input, loss):
        """Pull the batch through the previous layer, then clamp negatives to 0."""
        upstream = self.prev.forward(input, loss)
        self.output = np.maximum(upstream, 0)
        return self.output

    def backwards(self, gradient):
        """Let the gradient through only where the cached output was positive."""
        active = np.where(self.output > 0, 1, 0)
        return self.prev.backwards(np.multiply(active, gradient))

# sigmoid non-linearity TO BE USED AS ACTIVATION FCN
class Sigmoid(Module):
    """Logistic sigmoid activation: ``1 / (1 + exp(-x))`` element-wise."""

    def __init__(self):
        super().__init__()

    def forward(self, input, loss=None):
        """Pull the batch through the previous layer and apply the sigmoid.

        Args:
            input: batch handed to the previous layer.
            loss: loss object threaded through the chain.  The other layers in
                this file take ``forward(input, loss)``, so it is forwarded
                when given; when omitted the previous layer is called with
                ``input`` only, as before.

        Returns:
            The activated batch (also cached in ``self.output``).
        """
        if loss is None:
            upstream = self.prev.forward(input)
        else:
            upstream = self.prev.forward(input, loss)
        self.output = 1 / (1 + np.exp(-upstream))
        return self.output

    def backwards(self, gradient):
        """Scale the gradient by the sigmoid derivative ``s * (1 - s)`` and pass it on."""
        local_slope = self.output * (1 - self.output)
        return self.prev.backwards(np.multiply(local_slope, gradient))

    

"""DEALING WITH LOSS"""

# generic loss layer for loss functions
class Loss:
    """Base class for loss layers placed at the end of the layer chain."""

    def __init__(self):
        # Final network layer feeding this loss.
        self.prev = None

    def __call__(self, input):
        """Attach ``input`` as the preceding layer and return this loss object."""
        self.prev = input
        return self

    def forward(self, input, labels):
        """Compute the loss for a batch; must be implemented by subclasses."""
        raise NotImplementedError

    def backwards(self):
        """Start back-propagation; must be implemented by subclasses."""
        raise NotImplementedError

class SoftMaxLoss(Loss):
    """Softmax + cross-entropy loss with an optional L2 penalty on the weights.

    Attributes:
        labels: labels passed to the latest ``forward`` call.
        output: after ``forward``: ``softmax - one_hot(labels)``, i.e. the
            gradient fed to ``backwards``; after ``predict``: the class
            probabilities.  Stored as ``np.matrix``.
        mean_loss: batch-averaged loss of the latest ``forward`` call.
        weights_array: weight matrices appended by the layers during a
            forward pass (consumed and reset on every call).
        alpha: L2 coefficient.

    NOTE: the L2 term is only added to the reported loss value; the gradient
    produced by ``backwards`` contains the cross-entropy part alone.
    """

    def __init__(self):
        super().__init__()
        self.labels = None
        self.output = None
        self.mean_loss = None
        self.weights_array = list()
        self.alpha = 0.0

    @staticmethod
    def _softmax(logits):
        """Return ``(probabilities, log_sum_exp)`` for each row of ``logits``.

        The row maximum is subtracted before exponentiating so that large
        logits cannot overflow; the result is mathematically unchanged.
        """
        row_max = logits.max(axis=1, keepdims=True)
        exp_shifted = np.exp(logits - row_max)
        row_sum = exp_shifted.sum(axis=1, keepdims=True)
        log_sum_exp = (np.log(row_sum) + row_max).ravel()
        return exp_shifted / row_sum, log_sum_exp

    def _pop_l2_penalty(self):
        """Return ``alpha/2 * sum ||W||^2`` over the collected weights and clear them."""
        squared_norms = sum(np.linalg.norm(w) ** 2 for w in self.weights_array)
        self.weights_array = list()
        return (self.alpha / 2) * squared_norms

    def forward(self, input, labels):
        """Run the network on a batch and return its mean loss.

        Args:
            input: batch of shape ``(batch_size, input_size)``.
            labels: one integer class index per sample (flat or single column).

        Returns:
            Scalar mean of the per-sample cross-entropy plus the L2 penalty.
        """
        logits = np.asarray(self.prev.forward(input, self), dtype=float)
        self.labels = labels

        probs, log_sum_exp = self._softmax(logits)

        rows = np.arange(logits.shape[0])
        targets = np.asarray(labels).ravel().astype(int)
        true_logits = logits[rows, targets]

        # Gradient of the cross-entropy w.r.t. the logits: softmax - one_hot.
        probs[rows, targets] -= 1
        # Kept as np.matrix, the type this attribute has always had.
        self.output = np.matrix(probs)

        # -z_true + log(sum(exp(z))) is the per-sample cross-entropy.
        total_loss = -true_logits + log_sum_exp + self._pop_l2_penalty()

        # Average Loss over batch size
        self.mean_loss = np.average(total_loss, axis=0)
        return self.mean_loss

    def backwards(self):
        """Back-propagate the gradient cached by the latest ``forward`` call."""
        return self.prev.backwards(self.output)

    def predict(self, input):
        """Return the class probabilities for ``input`` as an ``np.matrix``."""
        logits = np.asarray(self.prev.forward(input, self), dtype=float)
        # The layers appended their weights during the forward pass; drop them
        # so they do not leak into the L2 term of the next training step.
        self.weights_array = list()

        probs, _ = self._softmax(logits)
        self.output = np.matrix(probs)
        return self.output


# Softmax where x_arr is the 2-D array to softmax on (one row per sample),
# and y_i is the class (column) whose probability is returned.
def softmax(x_arr, y_i):
    """Return the softmax probability of column ``y_i`` for every row of ``x_arr``.

    The row-wise maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but prevents overflow for large scores.
    """
    shift = np.asarray(x_arr).max(axis=1).reshape(-1, 1)
    x_exp = np.exp(x_arr - shift)
    return (x_exp[:, y_i] / x_exp.sum(1))






## MLP as own class

In [None]:
# https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
def unison_shuffled_copies(a, b):
    """Return copies of ``a`` and ``b`` reordered by one shared random permutation.

    The permutation has length ``len(a)``, so row ``i`` of both results comes
    from the same original position.
    """
    order = np.random.permutation(len(a))
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

## overall neural network class as asked for in part (a)
class MLP:
    """Multi-layer perceptron built as a backward-linked chain of layers.

    Layers are appended with ``add_layer``; the final ``'Loss'`` layer is the
    entry point for both the forward pass and back-propagation.
    """

    def __init__(self):
        self.l_final = None      # last non-loss layer added so far
        self.l_output = None     # not used by this class
        self.predict = None      # not used by this class
        self.loss = None         # SoftMaxLoss terminating the chain
        self.weight_mats = None  # not used by this class

    def add_layer(self, layer_type, dim_in, dim_out):
        """Append a layer to the network.

        Args:
            layer_type: ``'Hidden'`` (Linear followed by RELU), ``'Output'``
                (Linear only) or ``'Loss'`` (SoftMaxLoss; the dims are ignored).
            dim_in: input width of the Linear layer.
            dim_out: output width of the Linear layer.

        Raises:
            ValueError: if ``layer_type`` is not one of the three known kinds.
        """
        # The very first Linear layer reads the raw batch instead of a
        # predecessor, which it signals through ``is_input``.
        is_first = self.l_final is None

        if layer_type == 'Hidden':
            linear = Linear(dim_in, dim_out, is_input=is_first)
            if not is_first:
                linear.prev = self.l_final
            activation = RELU()
            activation.prev = linear
            self.l_final = activation
        elif layer_type == 'Output':
            linear = Linear(dim_in, dim_out, is_input=is_first)
            if not is_first:
                linear.prev = self.l_final
            self.l_final = linear
        elif layer_type == 'Loss':
            loss = SoftMaxLoss()
            loss.prev = self.l_final
            self.loss = loss
        else:
            raise ValueError(
                f"unknown layer_type {layer_type!r}; expected 'Hidden', 'Output' or 'Loss'")

    @staticmethod
    def _minibatches(data, labels, bsize):
        """Yield consecutive ``(data, labels)`` slices of at most ``bsize`` rows."""
        for start in range(0, data.shape[0], bsize):
            yield data[start:start + bsize], labels[start:start + bsize]

    def forward(self, input, labels):
        """Run a forward pass and return the mean loss of the batch."""
        return self.loss.forward(input, labels)

    def backwards(self):
        """Back-propagate the latest forward pass; the layers update their parameters."""
        return self.loss.backwards()

    def train(self, data, labels, epochs=100, bsize=8, alpha=0.0, early_stop=False, patience=2):
        """Train with mini-batch gradient descent.

        Args:
            data: samples, one per row.
            labels: integer-castable class labels, one per sample.
            epochs: number of passes over the data.
            bsize: mini-batch size.
            alpha: L2 coefficient handed to the loss.
            early_stop, patience: accepted for interface compatibility but
                not used by this method.

        Returns:
            List starting with ``np.inf`` followed by the mean training loss
            of every epoch.
        """
        self.loss.alpha = alpha

        data = np.array(data)
        labels = labels.astype(int)
        n_samples = data.shape[0]

        report_every = max(epochs // 5, 1)
        mean_last_epoch_loss = [np.inf]

        for epoch in range(epochs):
            # Reshuffle every epoch so the mini-batches differ between epochs.
            shuffled_data, shuffled_labels = unison_shuffled_copies(data, labels)

            loss_sum = 0.0
            for mb_data, mb_labels in self._minibatches(shuffled_data, shuffled_labels, bsize):
                # Weight each batch mean by its size so a shorter final batch
                # does not distort the epoch average.
                loss_sum += self.forward(mb_data, mb_labels) * mb_data.shape[0]
                # Back-propagate and apply the gradient-descent update.
                self.backwards()

            mean_epoch_loss = loss_sum / n_samples
            if epoch % report_every == 0:
                print(f"Epoch: {epoch + 1}/{epochs} -- Mean Loss: {mean_epoch_loss}")

            mean_last_epoch_loss.append(mean_epoch_loss)
        return mean_last_epoch_loss

    def train_with_val(self, X_train, y_train, X_val, y_val, epochs=100, bsize=8, alpha=0.0, patience=2):
        """Train on the training set and track the loss on a validation set.

        The validation data is only passed forward; it never triggers a
        parameter update.  Training stops early once the validation loss is
        worse than it was ``patience`` recorded epochs ago.

        Args:
            X_train, y_train: training samples and integer-castable labels.
            X_val, y_val: validation samples and integer-castable labels.
            epochs: maximum number of passes over the training data.
            bsize: mini-batch size.
            alpha: L2 coefficient handed to the loss.
            patience: how many recorded epochs to look back when deciding to stop.

        Returns:
            List starting with ``np.inf`` followed by the mean validation loss
            of every completed epoch.

        Raises:
            ValueError: if the validation set is empty.
        """
        self.loss.alpha = alpha

        X_train = np.array(X_train)
        X_val = np.array(X_val)
        y_train = y_train.astype(int)
        y_val = y_val.astype(int)

        n_val = X_val.shape[0]
        if n_val == 0:
            raise ValueError("validation set must contain at least one sample")

        report_every = max(epochs // 3, 1)
        mean_last_epoch_loss = [np.inf]

        for epoch in range(epochs):
            # Reshuffle every epoch so the mini-batches differ between epochs.
            X_train_shuf, y_train_shuf = unison_shuffled_copies(X_train, y_train)

            for mb_data, mb_labels in self._minibatches(X_train_shuf, y_train_shuf, bsize):
                self.forward(mb_data, mb_labels)
                # Back-propagate and apply the gradient-descent update.
                self.backwards()

            # Validation: forward passes only, so the model never learns from it.
            loss_sum = 0.0
            for mb_data, mb_labels in self._minibatches(X_val, y_val, bsize):
                loss_sum += self.forward(mb_data, mb_labels) * mb_data.shape[0]
            mean_epoch_loss = loss_sum / n_val

            if epoch % report_every == 0:
                print(f"Epoch: {epoch + 1}/{epochs} -- Mean Loss: {mean_epoch_loss}")

            # Early stopping
            if epoch > patience and (mean_epoch_loss > mean_last_epoch_loss[-patience]):
                break

            mean_last_epoch_loss.append(mean_epoch_loss)
        return mean_last_epoch_loss

    def predict_in(self, data):
        """Return the most probable class index for every row of ``data``."""
        outputs = self.loss.predict(data)
        prediction = np.argmax(outputs, axis=1)

        return prediction





In [None]:

def plot_loss(loss):
    """Plot the mean loss per epoch.

    ``loss`` is a per-epoch history whose first entry is a placeholder; that
    entry is skipped and the remaining values are plotted against 1, 2, ...
    """
    history = np.array(loss)[1:]
    epochs_axis = np.arange(1, len(loss))

    plt.title("Mean Loss by Epoch")
    plt.xlabel("Epoch")
    plt.ylabel("Mean Loss")
    plt.plot(epochs_axis, history, color="purple")
    plt.show()


## Run Network

In [None]:
# One hidden layer with a single hidden unit; no L2 penalty (alpha=0).
L_RATE = 0.005
NN = MLP()
for layer_type, n_in, n_out in (('Hidden', 2, 1), ('Output', 1, 3), ('Loss', 3, 3)):
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=100, bsize=8, alpha=0.0)
plot_loss(loss)


## Decision regions of trained classifier for 3 labels (1 hidden unit, 1 hidden layer, regularization = 0)

In [None]:

def plot_decision_regions(NN, lo=-2.2, hi=2.2, step=0.01):
    """Plot the class predicted by ``NN`` on a dense grid of 2-D points.

    Args:
        NN: trained network exposing ``predict_in(points)``.
        lo: lower bound of the grid on both axes.
        hi: upper bound (exclusive) of the grid on both axes.
        step: spacing between neighbouring grid points.
    """
    # Densely generate points in plane
    axis1 = np.arange(lo, hi, step)
    axis2 = np.arange(lo, hi, step)
    data = np.array(np.meshgrid(axis1, axis2)).T.reshape(-1, 2)

    # Classify every grid point.  Force the labels into one column so the
    # concatenation works whether predict_in returns a flat array or a column.
    predictions = np.asarray(NN.predict_in(data)).reshape(-1, 1)

    # PLOT
    table = np.concatenate((data, predictions), axis=1)
    df_sampled_points = pd.DataFrame(table, columns=['x1', 'x2', 'label'])

    fig = px.scatter(df_sampled_points, x="x1", y="x2", color="label",
                     title="Decision region")
    # Lock the y axis to the x axis so the plot has a 1:1 aspect ratio.
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.show()

In [None]:
# Draw the decision regions of the network currently bound to `NN`.
plot_decision_regions(NN)

In [None]:


# One hidden layer, 8 hidden units, no L2 penalty (alpha=0).
L_RATE = 0.005
NN = MLP()
for layer_type, n_in, n_out in (('Hidden', 2, 8), ('Output', 8, 3), ('Loss', 3, 3)):
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=200, bsize=20, alpha=0.0)
plot_loss(loss)
plot_decision_regions(NN)


In [None]:

# One hidden layer, 16 hidden units, no L2 penalty (alpha=0).
L_RATE = 0.005
NN = MLP()
for layer_type, n_in, n_out in (('Hidden', 2, 16), ('Output', 16, 3), ('Loss', 3, 3)):
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=200, bsize=20, alpha=0.0)
plot_loss(loss)
plot_decision_regions(NN)

## 3 Hidden Layers, no regularization

In [None]:
# Three hidden layers of 3 units each, no L2 penalty (alpha=0).
L_RATE = 0.004
NN = MLP()
layer_specs = (
    ('Hidden', 2, 3),
    ('Hidden', 3, 3),
    ('Hidden', 3, 3),
    ('Output', 3, 3),
    ('Loss', 3, 3),
)
for layer_type, n_in, n_out in layer_specs:
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=200, bsize=20, alpha=0.0)
plot_loss(loss)
plot_decision_regions(NN)


In [None]:

# Three hidden layers of 8 units each, no L2 penalty (alpha=0).
L_RATE = 0.004
NN = MLP()
layer_specs = (
    ('Hidden', 2, 8),
    ('Hidden', 8, 8),
    ('Hidden', 8, 8),
    ('Output', 8, 3),
    ('Loss', 3, 3),
)
for layer_type, n_in, n_out in layer_specs:
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=200, bsize=4, alpha=0.0)
plot_loss(loss)
plot_decision_regions(NN)

In [None]:

# Three hidden layers of 16 units each, no L2 penalty (alpha=0).
L_RATE = 0.004
NN = MLP()
layer_specs = (
    ('Hidden', 2, 16),
    ('Hidden', 16, 16),
    ('Hidden', 16, 16),
    ('Output', 16, 3),
    ('Loss', 3, 3),
)
for layer_type, n_in, n_out in layer_specs:
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=150, bsize=8, alpha=0.0)
plot_loss(loss)
plot_decision_regions(NN)

## 1 Hidden Layer, 0.005 regularization 

In [None]:
# One hidden layer, 3 hidden units, L2 coefficient alpha=0.005.
L_RATE = 0.002
NN = MLP()
for layer_type, n_in, n_out in (('Hidden', 2, 3), ('Output', 3, 3), ('Loss', 3, 3)):
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=100, bsize=8, alpha=0.005)
plot_loss(loss)
plot_decision_regions(NN)


In [None]:


# One hidden layer, 8 hidden units, L2 coefficient alpha=0.005.
L_RATE = 0.002
NN = MLP()
for layer_type, n_in, n_out in (('Hidden', 2, 8), ('Output', 8, 3), ('Loss', 3, 3)):
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=100, bsize=8, alpha=0.005)
plot_loss(loss)
plot_decision_regions(NN)


In [None]:

# One hidden layer, 16 hidden units, L2 coefficient alpha=0.005.
L_RATE = 0.002
NN = MLP()
for layer_type, n_in, n_out in (('Hidden', 2, 16), ('Output', 16, 3), ('Loss', 3, 3)):
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=50, bsize=8, alpha=0.005)
plot_loss(loss)
plot_decision_regions(NN)

## 3 Hidden Layers, 0.005 regularization

In [None]:
# Three hidden layers of 3 units each, L2 coefficient alpha=0.005.
L_RATE = 0.002
NN = MLP()
layer_specs = (
    ('Hidden', 2, 3),
    ('Hidden', 3, 3),
    ('Hidden', 3, 3),
    ('Output', 3, 3),
    ('Loss', 3, 3),
)
for layer_type, n_in, n_out in layer_specs:
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=100, bsize=8, alpha=0.005)
plot_loss(loss)
plot_decision_regions(NN)


In [None]:


# Three hidden layers of 8 units each.
# NOTE(review): alpha is 0.004 here although the section heading says 0.005 — confirm which is intended.
L_RATE = 0.002
NN = MLP()
layer_specs = (
    ('Hidden', 2, 8),
    ('Hidden', 8, 8),
    ('Hidden', 8, 8),
    ('Output', 8, 3),
    ('Loss', 3, 3),
)
for layer_type, n_in, n_out in layer_specs:
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=100, bsize=8, alpha=0.004)
plot_loss(loss)
plot_decision_regions(NN)


In [None]:

# Three hidden layers of 16 units each.
# NOTE(review): alpha is 0.004 here although the section heading says 0.005 — confirm which is intended.
L_RATE = 0.002
NN = MLP()
layer_specs = (
    ('Hidden', 2, 16),
    ('Hidden', 16, 16),
    ('Hidden', 16, 16),
    ('Output', 16, 3),
    ('Loss', 3, 3),
)
for layer_type, n_in, n_out in layer_specs:
    NN.add_layer(layer_type, dim_in=n_in, dim_out=n_out)

loss = NN.train(X, y, epochs=100, bsize=8, alpha=0.004)
plot_loss(loss)
plot_decision_regions(NN)

# Testing network on Iris dataset

Features: Sepal Length, Sepal Width, Petal Length, and Petal Width (all four columns of the dataset are used)

Classes: (3 species of iris) Setosa, Versicolour, and Virginica

Link: https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html


In [None]:
def accuracy(model, X_test, y_test):
    """Return the fraction of samples in ``X_test`` that ``model`` classifies correctly.

    Args:
        model: object exposing ``predict_in(X)`` that returns one label per row.
        X_test: samples to classify.
        y_test: true labels; a flat sequence or a single column.

    Raises:
        ValueError: if the number of predictions and labels differ.
    """
    # Flatten both sides to 1-D.  Comparing a (1, n) row with an (n, 1) column
    # would silently broadcast to an (n, n) grid and give a wrong score.
    predictions = np.asarray(model.predict_in(X_test)).ravel()
    targets = np.asarray(y_test).ravel()
    if predictions.shape != targets.shape:
        raise ValueError(
            f"got {predictions.size} predictions for {targets.size} labels")

    # src: https://stackoverflow.com/questions/20402109/calculating-percentage-error-by-comparing-two-arrays
    error = np.mean(predictions != targets)
    return 1 - error

In [None]:
from sklearn.model_selection import train_test_split

# Load Data
iris = datasets.load_iris()
X = iris.data[:, :4]  # first four feature columns
y = iris.target

# Train / test split: hold out 15% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# Train / validation split: carve the validation set out of the remaining
# TRAINING data only.  Re-splitting the full (X, y) with the same random_state
# would reproduce the test split, making the validation set equal to the test set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42, stratify=y_train)

In [None]:
# Three hidden layers of 16 units each on the 4 iris features.
L_RATE = 0.0001
Custom_NN = MLP()
Custom_NN.add_layer('Hidden', dim_in=4, dim_out=16)
Custom_NN.add_layer('Hidden', dim_in=16, dim_out=16)
Custom_NN.add_layer('Hidden', dim_in=16, dim_out=16)
Custom_NN.add_layer('Output', dim_in=16, dim_out=3)
Custom_NN.add_layer('Loss', dim_in=3, dim_out=3)

# Early stopping is monitored on the validation split; the test split stays
# untouched so the accuracy reported afterwards is measured on unseen data.
loss = Custom_NN.train_with_val(X_train, y_train, X_val, y_val, epochs=200, bsize=1, alpha=0.01, patience=2)
plot_loss(loss)



In [None]:
# Show predicted labels next to the true labels, then the test accuracy.
test_predictions = Custom_NN.predict_in(X_test).squeeze()
print(test_predictions)
print(y_test)
test_accuracy = accuracy(Custom_NN, X_test, y_test)
print("\n\nAccuracy of model on test data is: ", test_accuracy)