<a href="https://colab.research.google.com/github/Aggraj/Deep-Learning-CS-6910/blob/main/Hyperparameter_sweep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import numpy as np
from sklearn.metrics import mean_squared_error

class Feedforwardneuralnetwork:
    """Fully connected feed-forward classifier with a softmax output layer.

    Data layout: samples are stored as columns, i.e. inputs have shape
    ``(n_inputs, n_samples)`` and one-hot targets ``(n_outputs, n_samples)``.
    The single exception is ``train_val_labels`` in :meth:`train_model`, which
    is expected as ``(n_samples, n_outputs)``.

    Attributes:
        weights: list with one ``(fan_out, fan_in)`` array per layer.
        biases: list with one ``(fan_out, 1)`` array per layer.
        intermidiate_inputs: pre-activations of the last forward pass.
        post_outputs: activations of the last forward pass (last entry is the
            softmax output).
    """

    _ACTIVATIONS = ('sigmoid', 'tanh', 'relu', 'leaky_relu')
    _LOSSES = ('cross_entropy', 'square_loss')
    _OPTIMIZERS = ('gd', 'mgd', 'ngd', 'rmsprop', 'adam', 'nadam')

    # Optimiser hyper-parameters (fixed, as in the original implementation).
    _MOMENTUM = 0.9        # gamma for 'mgd' / 'ngd'
    _RMS_DECAY = 0.999     # beta for 'rmsprop'
    _BETA1 = 0.9           # first-moment decay for 'adam' / 'nadam'
    _BETA2 = 0.999         # second-moment decay for 'adam' / 'nadam'
    _EPS = 1e-8            # added inside the square root to avoid division by zero

    def __init__(self, n_inputs, n_hidden, n_outputs, activation, loss_function):
        """Create the network and randomly initialise its parameters.

        Args:
            n_inputs: number of input features.
            n_hidden: sequence with the width of every hidden layer (may be empty).
            n_outputs: number of output classes.
            activation: hidden activation, one of 'sigmoid', 'tanh', 'relu',
                'leaky_relu'.
            loss_function: 'cross_entropy' or 'square_loss'.

        Raises:
            ValueError: if ``activation`` or ``loss_function`` is not supported.
        """
        if activation not in self._ACTIVATIONS:
            raise ValueError('Unknown activation: {!r}'.format(activation))
        if loss_function not in self._LOSSES:
            raise ValueError('Unknown loss function: {!r}'.format(loss_function))

        self.loss_function = loss_function
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.n_hidden = list(n_hidden)
        self.activation = activation

        layers = [self.n_inputs] + self.n_hidden + [self.n_outputs]
        fan_pairs = list(zip(layers[:-1], layers[1:]))
        # Gaussian weights scaled by 2 / sqrt(fan_in + fan_out); built once
        # (the previous code rebuilt the whole list on every loop iteration).
        self.weights = [np.random.randn(y, x) * 2 / np.sqrt(x + y) for x, y in fan_pairs]
        self.biases = [np.random.randn(y, 1) for _, y in fan_pairs]

    # ------------------------------------------------------------------
    # Activations and their derivatives
    # ------------------------------------------------------------------
    def sigmoid(self, x):
        """Logistic function, written as exp(-log(1 + e^-x)) so it cannot overflow."""
        return np.exp(-np.logaddexp(0, -x))

    def d_sigmoid(self, x):
        """Derivative of the logistic function evaluated at pre-activation ``x``."""
        s = self.sigmoid(x)
        return s * (1 - s)

    def tanh(self, x):
        """Hyperbolic tangent."""
        return np.tanh(x)

    def d_tanh(self, x):
        """Derivative of tanh evaluated at pre-activation ``x``."""
        return (1 - (np.tanh(x))**2)

    def relu(self, x):
        """Rectified linear unit; returns a new array and leaves ``x`` untouched."""
        return np.maximum(0, x)

    def d_relu(self, x):
        """Derivative of ReLU: 1 where ``x > 0`` and 0 elsewhere."""
        return np.greater(x, 0).astype(int)

    def leaky_relu(self, x):
        """Leaky ReLU with negative slope 0.01."""
        return np.where(x > 0, x, x * 0.01)

    def d_leaky_relu(self, x):
        """Derivative of leaky ReLU: 1 where ``x > 0`` and 0.01 elsewhere."""
        return np.where(x > 0, 1, 1 * 0.01)

    def softmax(self, x):
        """Column-wise softmax.

        The per-column maximum is subtracted before exponentiating; this does
        not change the result but prevents overflow for large logits.
        """
        shifted = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def _activate(self, z):
        """Apply the configured hidden activation to pre-activation ``z``."""
        if self.activation not in self._ACTIVATIONS:
            raise ValueError('Unknown activation: {!r}'.format(self.activation))
        return getattr(self, self.activation)(z)

    def _activation_gradient(self, z):
        """Derivative of the configured hidden activation at pre-activation ``z``."""
        if self.activation not in self._ACTIVATIONS:
            raise ValueError('Unknown activation: {!r}'.format(self.activation))
        return getattr(self, 'd_' + self.activation)(z)

    # ------------------------------------------------------------------
    # Forward / backward passes
    # ------------------------------------------------------------------
    def forward_propagation(self, input):
        """Run a forward pass and cache the per-layer values for back-propagation.

        Args:
            input: array of shape ``(n_inputs, n_samples)``.

        Returns:
            Softmax class probabilities of shape ``(n_outputs, n_samples)``.
        """
        self.intermidiate_inputs = []
        self.post_outputs = []

        last = len(self.weights) - 1
        signal = input
        for k, (W, b) in enumerate(zip(self.weights, self.biases)):
            pre_activation = np.matmul(W, signal) + b
            # Hidden layers use the configured activation, the last layer softmax.
            signal = self.softmax(pre_activation) if k == last else self._activate(pre_activation)
            self.intermidiate_inputs.append(pre_activation)
            self.post_outputs.append(signal)

        return self.post_outputs[-1]

    def back_propagation(self, train_images, train_labels):
        """Compute batch-averaged gradients for the most recent forward pass.

        ``forward_propagation`` must have been called on ``train_images`` with
        the current parameters immediately before.

        The gradients correspond to the batch mean of
        ``-sum(y * log(p))`` for 'cross_entropy' and of
        ``0.5 * sum((p - y) ** 2)`` for 'square_loss'.

        Args:
            train_images: batch inputs, shape ``(n_inputs, n_samples)``.
            train_labels: one-hot targets, shape ``(n_outputs, n_samples)``.

        Returns:
            ``(g_weights, g_biases)``: lists of arrays shaped like
            ``self.weights`` / ``self.biases``.
        """
        n_layers = len(self.weights)
        # Samples are columns, so the batch size is shape[1] (shape[0] is the
        # feature count and was used here by mistake before).
        n_samples = train_images.shape[1]
        g_weights = [None] * n_layers
        g_biases = [None] * n_layers

        probs = self.post_outputs[-1]
        error = probs - train_labels
        if self.loss_function == 'cross_entropy':
            # Softmax combined with cross-entropy collapses to (p - y).
            g_a = error
        elif self.loss_function == 'square_loss':
            # Chain rule through the full softmax Jacobian:
            # dL/da_i = p_i * (e_i - sum_j e_j p_j).
            g_a = probs * (error - np.sum(error * probs, axis=0, keepdims=True))
        else:
            raise ValueError('Unknown loss function: {!r}'.format(self.loss_function))

        for k in reversed(range(n_layers)):
            layer_input = train_images if k == 0 else self.post_outputs[k - 1]
            # The 1/n_samples average is applied exactly once, on the parameter
            # gradients, and not on the signals propagated between layers.
            g_weights[k] = np.matmul(g_a, layer_input.T) / n_samples
            g_biases[k] = np.sum(g_a, axis=1, keepdims=True) / n_samples
            if k > 0:
                g_h = np.matmul(self.weights[k].T, g_a)
                g_a = np.multiply(g_h, self._activation_gradient(self.intermidiate_inputs[k - 1]))
        return g_weights, g_biases

    # ------------------------------------------------------------------
    # Training helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _new_optimizer_state(params):
        """Return zero-initialised per-layer optimiser buffers for ``params``."""
        return {
            name: [np.zeros_like(p, dtype=float) for p in params]
            for name in ('velocity', 'sq_avg', 'm', 'v')
        }

    def _parameter_deltas(self, opt, grads, state, learning_rate, step):
        """Turn gradients into the amounts to subtract from each parameter.

        Args:
            opt: optimiser name (already validated by the caller).
            grads: list of per-layer gradients.
            state: buffers from :meth:`_new_optimizer_state`; updated in place.
            learning_rate: base learning rate.
            step: 1-based count of parameter updates, used for bias correction.

        Returns:
            List of per-layer update arrays.
        """
        deltas = []
        for k, g in enumerate(grads):
            if opt == 'gd':
                delta = learning_rate * g
            elif opt in ('mgd', 'ngd'):
                delta = self._MOMENTUM * state['velocity'][k] + learning_rate * g
                state['velocity'][k] = delta
            elif opt == 'rmsprop':
                state['sq_avg'][k] = (self._RMS_DECAY * state['sq_avg'][k]
                                      + (1 - self._RMS_DECAY) * np.power(g, 2))
                delta = learning_rate * g / np.sqrt(state['sq_avg'][k] + self._EPS)
            else:  # 'adam' or 'nadam'
                state['m'][k] = self._BETA1 * state['m'][k] + (1 - self._BETA1) * g
                state['v'][k] = self._BETA2 * state['v'][k] + (1 - self._BETA2) * np.power(g, 2)
                m_hat = state['m'][k] / (1 - self._BETA1 ** step)
                v_hat = state['v'][k] / (1 - self._BETA2 ** step)
                if opt == 'nadam':
                    # Nesterov correction: blend the corrected momentum with the
                    # bias-corrected current gradient (both use beta1).
                    m_hat = self._BETA1 * m_hat + (1 - self._BETA1) * g / (1 - self._BETA1 ** step)
                delta = learning_rate * m_hat / np.sqrt(v_hat + self._EPS)
            deltas.append(delta)
        return deltas

    def _loss(self, probs, targets):
        """Loss of predictions ``probs`` against one-hot ``targets`` (both ``(n_outputs, n)``).

        'cross_entropy' returns the per-sample mean of ``-sum(y * log(p))``
        (probabilities are clipped away from zero before the log);
        'square_loss' returns the mean squared error over all entries.
        """
        if self.loss_function == 'cross_entropy':
            safe = np.clip(probs, 1e-12, None)
            return -np.sum(np.multiply(targets, np.log(safe))) / targets.shape[1]
        return np.mean((targets - probs) ** 2)

    @staticmethod
    def _log_metrics(metrics):
        """Send ``metrics`` to Weights & Biases when a run is active.

        ``wandb`` is looked up in the module globals at call time; logging is
        skipped when it has not been imported or no run has been started, so
        the model can also be trained outside a sweep.
        """
        tracker = globals().get('wandb')
        if tracker is not None and getattr(tracker, 'run', None) is not None:
            tracker.log(metrics)

    def train_model(self, train_images, train_labels, train_val_images, train_val_labels,
                    epochs, learning_rate, opt='gd', batch_size=32, lambd=0.0005):
        """Train with mini-batch updates and report metrics after every epoch.

        Args:
            train_images: training inputs, shape ``(n_inputs, n_train)``.
            train_labels: one-hot training targets, shape ``(n_outputs, n_train)``.
            train_val_images: validation inputs, shape ``(n_inputs, n_val)``.
            train_val_labels: one-hot validation targets, shape
                ``(n_val, n_outputs)`` (note: samples are rows here).
            epochs: number of passes over the training data.
            learning_rate: base learning rate.
            opt: 'gd', 'mgd' (momentum), 'ngd' (Nesterov), 'rmsprop', 'adam'
                or 'nadam'.
            batch_size: number of samples per update.
            lambd: L2 weight-decay coefficient (applied to weights only).

        Returns:
            ``(train_accuracy, val_accuracy, train_loss, val_loss)`` of the
            last epoch; accuracies are percentages. All four are NaN when
            ``epochs`` is not positive.

        Raises:
            ValueError: if ``opt`` is not a supported optimiser.
        """
        if opt not in self._OPTIMIZERS:
            raise ValueError('Unknown optimizer: {!r}'.format(opt))

        n_train = train_images.shape[1]
        val_targets = train_val_labels.T
        # Per-layer buffers; plain lists avoid arithmetic on ragged arrays.
        state_w = self._new_optimizer_state(self.weights)
        state_b = self._new_optimizer_state(self.biases)
        step = 0
        acc1 = acc2 = train_loss = val_loss = float('nan')

        for i in range(epochs):
            for bb in range(0, n_train, batch_size):
                train_b_imag = train_images[:, bb:bb + batch_size]
                train_l_imag = train_labels[:, bb:bb + batch_size]
                step += 1

                if opt == 'ngd':
                    # Nesterov: evaluate the gradient at the look-ahead point,
                    # then restore the parameters so the update starts from
                    # the current position.
                    current_w, current_b = self.weights, self.biases
                    self.weights = [w - self._MOMENTUM * v
                                    for w, v in zip(current_w, state_w['velocity'])]
                    self.biases = [b - self._MOMENTUM * v
                                   for b, v in zip(current_b, state_b['velocity'])]
                    self.forward_propagation(train_b_imag)
                    g_weights, g_biases = self.back_propagation(train_b_imag, train_l_imag)
                    self.weights, self.biases = current_w, current_b
                else:
                    self.forward_propagation(train_b_imag)
                    g_weights, g_biases = self.back_propagation(train_b_imag, train_l_imag)

                delta_w = self._parameter_deltas(opt, g_weights, state_w, learning_rate, step)
                delta_b = self._parameter_deltas(opt, g_biases, state_b, learning_rate, step)

                self.weights = [w - d - learning_rate * lambd * w
                                for w, d in zip(self.weights, delta_w)]
                self.biases = [b - d for b, d in zip(self.biases, delta_b)]

            # Epoch metrics on the full training and validation sets.
            output = self.forward_propagation(train_images)
            train_loss = self._loss(output, train_labels)
            out_class = np.argmax(output, axis=0)
            target_class = np.argmax(train_labels, axis=0)
            acc1 = 100 * np.sum(out_class == target_class) / output.shape[1]

            Validate = self.forward_propagation(train_val_images)
            out_class = np.argmax(Validate, axis=0)
            target_class_validate = np.argmax(train_val_labels, axis=1)
            acc2 = 100 * np.sum(out_class == target_class_validate) / Validate.shape[1]
            val_loss = self._loss(Validate, val_targets)

            print('Epoch {}: training_accuracy = {:.2f}, Validation accuracy = {:.2f}'.format(i, acc1, acc2))

            self._log_metrics({"val_accuracy": acc2, "accuracy": acc1, "steps": i,
                               "loss": train_loss, "val_loss": val_loss})

        return acc1, acc2, train_loss, val_loss

In [None]:
import tensorflow as tf
from keras.datasets import fashion_mnist
from keras.datasets import mnist

# --- Model and dataset configuration ------------------------------------
output_classes = 10
activation = 'relu'
loss_function = 'cross_entropy'
Mode  = Feedforwardneuralnetwork(28*28,[16,32],output_classes,activation,loss_function)

# --- Load Fashion-MNIST (swap in the commented line to use MNIST) --------
# Note: this rebinds the name imported from keras.datasets above.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
#(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Flatten every image into one row: (n_samples, n_pixels).
n_samples = train_images.shape[0]
train_images = train_images.reshape(n_samples,-1)

# One-hot encode all labels at once: row i of the identity matrix is the
# encoding of class i, so indexing it with the label vector yields an
# (n_samples, output_classes) float matrix.
labels = np.eye(output_classes)[train_labels]

# The first n_train samples are used for training, the rest for validation.
n_train = 54000
train_imag = train_images[:n_train,:]
train_val_images = train_images[n_train:,:]
train_label = labels[:n_train,:]
train_val_labels = labels[n_train:,:]

# Centre with the per-pixel mean of the training split and scale by the
# 8-bit pixel range. `std` is computed but not used by the scaling below.
mean = train_imag.mean(axis=0)
std  = train_imag.std(axis = 0)
train_imag = (train_imag - mean)/255.0
train_val_images = (train_val_images - mean)/255.0
#epochs = 10
#learning_rate = 0.001
#(tr_loss) = Mode.train_model(train_imag.T,train_label.T,train_val_images.T,train_val_labels,epochs,learning_rate,'adam')

In [26]:
%pip install wandb -q
import wandb
wandb.login()

True

In [27]:
# Search space for the W&B sweep. 'method' may be 'grid' or 'random'; each
# entry under 'parameters' lists the candidate values for one hyperparameter.
sweep_config = {
    'method': 'random',
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    'parameters': {
        name: {'values': candidates}
        for name, candidates in (
            ('epochs', [5]),
            ('learning_rate', [1e-2, 1e-3]),
            ('opt', ['nadam', 'rmsprop']),
            ('activation', ['relu', 'tanh']),
            ('n_hidden', [[16, 32, 64]]),
            ('batch_size', [32]),
            ('weight_decay', [0, 0.0005]),
            ('loss_function', ['cross_entropy']),
        )
    },
}


In [None]:
# Register the sweep defined by sweep_config under the given entity/project;
# the returned id is what wandb.agent is started with further down.
sweep_id = wandb.sweep(sweep_config, entity="chaxin", project="Assignment 1")

In [29]:
def train():
    """Run a single W&B trial: build a network from ``wandb.config`` and train it.

    Reads the module-level arrays ``train_imag``, ``train_label``,
    ``train_val_images`` and ``train_val_labels``. Per-epoch metrics are
    logged by ``Feedforwardneuralnetwork.train_model``; nothing is returned.
    """
    # Fallback hyper-parameters; values supplied by the sweep take precedence.
    # The architecture key is 'n_hidden' so that it matches the key used in
    # sweep_config (with the former 'hidden' key the swept value was ignored).
    config_defaults = {
        'epochs': 2,
        'learning_rate': 1e-2,
        'n_hidden': [100, 200],
        'opt': 'ngd',
        'activation': 'sigmoid',
        'n_inputs': 28*28,
        'n_outputs': 10,
        'batch_size': 100,
        'weight_decay': 0,
        'loss_function': 'cross_entropy',
    }

    # Initialize a new wandb run
    wandb.init(project='Assignment 1', entity='chaxin', config=config_defaults)

    # Config is a variable that holds and saves hyperparameters and inputs
    config = wandb.config

    # Model training here
    sweep_network = Feedforwardneuralnetwork(
        config.n_inputs,
        list(config.n_hidden),
        config.n_outputs,
        config.activation,
        config.loss_function,
    )
    # Training data is passed with samples as columns; the validation labels
    # stay in (n_samples, n_classes) layout, as train_model expects.
    sweep_network.train_model(
        train_imag.T,
        train_label.T,
        train_val_images.T,
        train_val_labels,
        config.epochs,
        config.learning_rate,
        config.opt,
        config.batch_size,
        config.weight_decay,
    )

#train_network(network, dataset, config.learning_rate, config.epochs, n_outputs)



# 3. Log metrics over time to visualize performance


In [30]:
# Start a sweep agent for sweep_id, with train as the function it runs.
wandb.agent(sweep_id, train)

In [23]:
from keras.datasets import mnist
# Rebinds train_images/train_labels/test_images/test_labels to the MNIST
# arrays returned by load_data().
# NOTE(review): the preprocessing cell reads train_images/train_labels, so
# running this cell changes which dataset a later re-run of that cell sees --
# confirm which dataset is intended.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
