In [194]:
from tensorflow import keras
from keras.datasets import fashion_mnist,mnist
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import math
import pprint
import wandb

In [2]:
wandb.login() 

wandb: Currently logged in as: as1_dl (use `wandb login --relogin` to force relogin)


True

In [199]:
def batch_split(batch_size,X_train,Y_train,X_test,Y_test): #Split the data into batches
    x_train, y_train_cat = shuffle(X_train,Y_train)
    x_test,y_test=shuffle(X_test,Y_test)
    x_train = x_train.reshape(x_train.shape[0],784)
    x_test = x_test.reshape(x_test.shape[0],784)
    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train_cat, test_size=0.1)
    x_train=x_train/np.max(x_train)
    x_train_raw=x_train
    x_test=x_test/np.max(x_test)
    x_valid=x_valid/np.max(x_valid)
    #y_train=y_train.reshape(len(y_train), 1)
    enc=OneHotEncoder(sparse=False)
    y_train_enc = enc.fit_transform(np.array(y_train.reshape(len(y_train), 1)))
    y_test_enc = enc.fit_transform(np.array(y_test.reshape(len(y_test), 1)))
    y_valid_enc = enc.fit_transform(np.array(y_valid.reshape(len(y_valid), 1)))
    y_train_bs=[]
    x_train_bs=[]
    if x_train.shape[0]%batch_size==0: #When all the batch sizes can be equal
        x_train_bs=np.vsplit(x_train,int(x_train.shape[0]/batch_size))
        y_train_bs=np.vsplit(y_train_enc,int(x_train.shape[0]/batch_size))
    else: #When all the batch sizes except one are be equal
        x_train_bs=np.vsplit(x_train[0:x_train.shape[0]-x_train.shape[0]%batch_size],math.floor(x_train[0:x_train.shape[0]-x_train.shape[0]%batch_size].shape[0]/batch_size))
        x_train_bs.append(x_train[x_train.shape[0]-x_train.shape[0]%batch_size:x_train.shape[0]])
        y_train_bs=np.vsplit(y_train_enc[0:x_train.shape[0]-x_train.shape[0]%batch_size],math.floor(x_train[0:x_train.shape[0]-x_train.shape[0]%batch_size].shape[0]/batch_size))
        y_train_bs.append(y_train_enc[x_train.shape[0]-x_train.shape[0]%batch_size:x_train.shape[0]])
    return x_train_raw,x_train_bs,y_train,y_train_bs,y_train_enc,x_test,y_test,y_test_enc,x_valid,y_valid,y_valid_enc

def activ_func(x,act_func):  #activation function 
    if act_func=='logistic':
        log_func =1 / (1 + np.exp(-x))
        return log_func
    elif act_func=='tanh':
        return np.tanh(x)
    elif act_func=='relu':
        return np.maximum(x, 0)
    
def d_activ_func(x,act_func): #derivative of the activation function
    if act_func=='logistic':
        return activ_func(x,act_func)*(1-activ_func(x,act_func))
    elif act_func=='tanh':
        return 1-np.square(activ_func(x,act_func))
    elif act_func=='relu':
        return (x > 0) * 1

def func_softmax(x): #output function as softmax
    sf=[]
    e_x = np.exp(x)
    for i in range(x.shape[1]):
        sf.append(e_x[:,i]/np.sum(e_x[:,i]))
    return sf

def compute_loss(y_predicted,y,w,loss_func,lam): 
    samp=len(y)
    loss=0
    loss_w=0 #loss from the regularisation
    for i in range(len(w)):
        loss_w=loss_w+(lam*(np.sum(np.square(w[i])))/(2*samp))
    if loss_func=='cross_entropy':
        for i in range(samp):
            loss=loss+np.divide(-np.dot(np.log(y_predicted[i]),np.transpose(y[i])),samp)
    elif loss_func=='sq_error':    
        for i in range(samp):
            loss=loss+np.divide(np.sum(np.square(y_predicted[i]-y[i])),samp)
    return loss+loss_w

def initialize_params(no_hidden,size_hidden,spec): #initialises weights and biases
    mf=1e-2 #using this multiplication factor gives a much better performance. standard random weights are just too damn big
    input_size=784
    no_output=10
    if spec=='xv':    #xavier initialisation
        sd = 6/(size_hidden[0]+input_size)
        w=[]
        b=[]       
        w.append(np.random.uniform(low = -sd ,high = sd, size =(size_hidden[0], input_size)))
        b.append(np.random.uniform(low = -sd ,high = sd, size = (size_hidden[0],1)))
        for i in range(no_hidden-1):
            sd = 6/(size_hidden[i+1]+size_hidden[i]    
            w.append(np.random.uniform(low = -sd ,high = sd, size =(size_hidden[i+1],size_hidden[i])))
            b.append(np.random.uniform(low = -sd ,high = sd, size = (size_hidden[i+1],1)))
        sd = 6/(size_hidden[no_hidden]+no_output)    
        w.append(np.random.uniform(low = -sd ,high = sd, size =(no_output,size_hidden[-1])))
        b.append(np.random.uniform(low = -sd ,high = sd, size = (no_output,1)))
        return w,b
    
    elif spec=='rand':    #random initialisation
        w=[]
        w.append(mf*np.random.rand(size_hidden[0],input_size))
        b=[]
        b.append(mf*np.random.rand(size_hidden[0],1))
        for i in range(no_hidden-1):
            w.append(mf*np.random.rand(size_hidden[i+1],size_hidden[i]))
            b.append(mf*np.random.rand(size_hidden[i+1],1)) 
        w.append(mf*np.random.rand(no_output,size_hidden[-1]))
        b.append(mf*np.random.rand(no_output,1))
        return w,b
    
def forward_prop(x,w,b,act_func,no_hidden): #forward propogation
    a=[] #pre activation
    h=[] #activation
    h.append(x.T)
    for i in range(no_hidden):
        a.append(np.dot(w[i],h[i])+b[i])
        h.append(activ_func(a[i],act_func))
    a_f=np.dot(w[no_hidden],h[no_hidden])+b[no_hidden]
    y_pred=func_softmax(a_f)
    return a,h,a_f,y_pred

def backward_prop(y,y_pred,w,b,h,a,a_f,no_hidden,lam,act_func,loss_func): #back propogation
    grad_a=[None]*(no_hidden+1)#gradient of the pre-activation function 
    grad_w=[None]*(no_hidden+1)#gradient of the weights
    grad_b=[None]*(no_hidden+1)#gradient of the biases
    grad_h=[None]*(no_hidden)# gradient of the activation function
    num=len(y)
    num2=y.shape[0]
    if loss_func=='cross_entropy':
        grad_a[no_hidden]=-(y-y_pred)
    elif loss_func=='sq_error':
        grad=np.zeros((num,10))
        ydot=np.empty(num)
        for i in range(num2):
            ydot[i]=np.dot(y[i],y_pred[i])
            #ydot[i]=y[i]*y_pred[i]
            grad[i]=(y_pred[i]-y[i])*ydot[i]*(y[i]-y_pred[i])
        #grad_a[no_hidden]=(y-y_pred)*y_pred*(1-y_pred)
        grad_a[no_hidden]=grad
    for k in range(no_hidden,-1,-1):
        grad_w[k]=np.divide(np.dot(h[k],grad_a[k]),num)+np.divide((lam*w[k].T),num)#gradient of the weights
        grad_b[k]= np.divide(np.sum(grad_a[k], axis=0, keepdims=True),num) #gradient of the biases
        if k >= 1:
            grad_h[k-1]=np.dot(grad_a[k],w[k])
            grad_a[k-1]=grad_h[k-1]*d_activ_func(a[k-1].T,act_func)
    return grad_b,grad_w

def sg_mb_update(w,b,grad_w,grad_b,learning_rate,no_hidden): #minibatch gradient descent
    for i in range(no_hidden+1):
        w[i]=w[i]-(learning_rate*grad_w[i].T)
        b[i]=b[i]-(learning_rate*grad_b[i].T)
    return w,b

def momentum_update(w,b,grad_w,grad_b,learning_rate,update_w,update_b,no_hidden): #momentum based graddient descent
    sum_w=[i * 0 for i in w]
    sum_b=[i * 0 for i in b]
    gamma=0.9
    for i in range(no_hidden+1):
        sum_w[i]=gamma*update_w[i]+learning_rate*grad_w[i].T
        sum_b[i]=gamma*update_b[i]+learning_rate*grad_b[i].T
        w[i]=w[i]-sum_w[i]
        b[i]=b[i]-sum_b[i]
    update_w=sum_w
    update_b=sum_b
    return w,b,update_w,update_b

def nag_lookahead(w,b,learning_rate,update_w,update_b,no_hidden): #estimate the gradients of the lookahead point in NAG
    for i in range(no_hidden+1):
        w[i]=w[i]-learning_rate*update_w[i]
        b[i]=b[i]-learning_rate*update_b[i]
    return w,b

def nag_update(w,b,grad_w,grad_b,learning_rate,update_w,update_b,no_hidden): #final gradient update in NAG
    gamma=0.9
    sum_w=[i * 0 for i in w]
    sum_b=[i * 0 for i in b]
    for i in range(no_hidden+1):
        sum_w[i]=gamma*update_w[i]+learning_rate*grad_w[i].T
        sum_b[i]=gamma*update_b[i]+learning_rate*grad_b[i].T
        w[i]=w[i]-sum_w[i]
        b[i]=b[i]-sum_b[i]
    update_w=sum_w
    update_b=sum_b
    return w,b,update_w,update_b

def rmsprop_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,no_hidden): #RMSprop
    beta=0.9
    epsilon=1e-8
    sum_w=[i * 0 for i in w]
    sum_b=[i * 0 for i in b]
    for i in range(no_hidden+1):
        sum_w[i]=beta*v_w[i]+(1-beta)*(np.square(grad_w[i].T))
        sum_b[i]=beta*v_b[i]+(1-beta)*(np.square(grad_b[i].T))
        w[i]=w[i]-learning_rate*np.divide(grad_w[i].T,np.sqrt(sum_w[i]+epsilon))
        b[i]=b[i]-learning_rate*np.divide(grad_b[i].T,np.sqrt(sum_b[i]+epsilon))
    v_w=sum_w
    v_b=sum_b
    return w,b,v_w,v_b

def adam_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,m_w,m_b,ct,no_hidden): #Adam optimisation
    beta_1=0.9
    beta_2=0.99
    epsilon=1e-8
    sum_w_m=[i * 0 for i in w]
    sum_w_v=[i * 0 for i in w]
    sum_b_m=[i * 0 for i in b]
    sum_b_v=[i * 0 for i in b]
    m_cap_w=[i * 0 for i in w]
    m_cap_b=[i * 0 for i in b]
    v_cap_w=[i * 0 for i in w]
    v_cap_b=[i * 0 for i in b]
    for i in range(no_hidden+1):
        sum_w_m[i]=beta_1*m_w[i]+(1-beta_1)*grad_w[i].T
        sum_w_v[i]=beta_2*v_w[i]+(1-beta_2)*(np.square(grad_w[i].T))

        sum_b_m[i]=beta_1*m_b[i]+(1-beta_1)*grad_b[i].T
        sum_b_v[i]=beta_2*v_b[i]+(1-beta_2)*(np.square(grad_b[i].T))

        m_cap_w[i]=np.divide(sum_w_m[i],(1-math.pow(beta_1,ct)))
        v_cap_w[i]=np.divide(sum_w_v[i],(1-math.pow(beta_2,ct)))

        m_cap_b=np.divide(sum_b_m[i],(1-math.pow(beta_1,ct)))
        v_cap_b=np.divide(sum_b_v[i],(1-math.pow(beta_2,ct)))

        w[i]=w[i]-(learning_rate*np.divide(m_cap_w[i],np.sqrt(v_cap_w[i]+epsilon)))
        b[i]=b[i]-(learning_rate*np.divide(m_cap_b[i],np.sqrt(v_cap_b[i]+epsilon)))
    m_w=sum_w_m
    m_b=sum_b_m
    v_w=sum_w_v
    v_b=sum_b_v
    return w,b,v_w,v_b,m_w,m_b
def nadam_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,m_w,m_b,ct,no_hidden): #nadam optimisation
    beta_1=0.9
    beta_2=0.99
    epsilon=1e-8
    sum_w_m=[i * 0 for i in w]
    sum_w_v=[i * 0 for i in w]
    sum_b_m=[i * 0 for i in b]
    sum_b_v=[i * 0 for i in b]
    m_cap_w=[i * 0 for i in w]
    m_cap_b=[i * 0 for i in b]
    v_cap_w=[i * 0 for i in w]
    v_cap_b=[i * 0 for i in b]
    for i in range(no_hidden+1):
        sum_w_m[i]=beta_1*m_w[i]+(1-beta_1)*grad_w[i].T
        sum_w_v[i]=beta_2*v_w[i]+(1-beta_2)*(np.square(grad_w[i].T))

        sum_b_m[i]=beta_1*m_b[i]+(1-beta_1)*grad_b[i].T
        sum_b_v[i]=beta_2*v_b[i]+(1-beta_2)*(np.square(grad_b[i].T))

        m_cap_w[i]=np.divide(beta_1*sum_w_m[i],(1-math.pow(beta_1,ct)))+np.divide((1-beta_1)*grad_w[i].T,(1-math.pow(beta_1,ct)))
        v_cap_w[i]=np.divide(sum_w_v[i],(1-math.pow(beta_2,ct)))

        m_cap_b[i]=np.divide(beta_1*sum_b_m[i],(1-math.pow(beta_1,ct)))+np.divide((1-beta_1)*grad_b[i].T,(1-math.pow(beta_1,ct)))
        v_cap_b[i]=np.divide(sum_b_v[i],(1-math.pow(beta_2,ct)))

        w[i]=w[i]-(learning_rate*np.divide(m_cap_w[i],np.sqrt(v_cap_w[i]+epsilon)))
        b[i]=b[i]-(learning_rate*np.divide(m_cap_b[i],np.sqrt(v_cap_b[i]+epsilon)))
    m_w=sum_w_m
    m_b=sum_b_m
    v_w=sum_w_v
    v_b=sum_b_v
    return w,b,v_w,v_b,m_w,m_b

def test_model(w,b,x_test,y_test,y_test_enc,act_func,no_hidden,loss_func,lam):  #test the parameters onn a given dataset
    an=[]
    hn=[]
    hn.append(x_test.T)
    for i in range(no_hidden):
        an.append(np.dot(w[i],hn[i])+b[i])
        hn.append(activ_func(an[i],act_func))
    a_fn=np.dot(w[no_hidden],hn[no_hidden])+b[no_hidden]
    y_pred1=func_softmax(a_fn)
    y_final=np.empty(x_test.shape[0])
    for i in range(x_test.shape[0]):
        y_final[i]=y_pred1[i].argmax()
    loss= compute_loss(y_pred1,y_test_enc,w,loss_func,lam)
    return round(accuracy_score(y_test, y_final),4),round(loss,4)

def train_model(no_hidden,size_hidden,bs,max_iterations,learning_rate,learn_algo,lam,spec,act_func,loss_func): #execute this to train your data
#no_hidden- Number of hidden layers,size_hidden - Array containing size of each layer
#bs- batch size,max_iterations-Number of epochs,learning_rate-learning rate,
#learn_algo- Optimisation algorithm ('sg'-stochastic gradient descent and minibatch gradient descent(specify sizes appropriately)
#'mb'-mini_batch gradient descent,'nag'-NAG, 'rmsprop'-RMSProp,'adam'-ADAM, 'nadam'-NADAm
#lam-L-2 regularisation parameter,spec-weights and biases initialisation(xv-Xavier and 'random'-Random),
#act_func- activation function ('logistic'-sigmoid function, 'tanh'-tanh function, 'relu'-Relu function)
    ct=1;
    #(X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data() #load the data
    (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
    input_size=784 #input of 784 pixels
    no_output=10 #10 classes
    x_train_raw,x_train,y_train_raw,y_train,y_train_enc,x_test,y_test,y_test_enc,x_valid,y_valid,y_valid_enc=batch_split(bs,X_train,Y_train,X_test,Y_test) #split the data to training, test and validation
    no_batch=len(x_train)
    w,b=initialize_params(no_hidden,size_hidden,spec)
    
    if learn_algo=='sg':
        while ct<=max_iterations:
            for eg in range(no_batch):
                a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func,loss_func)
                w,b=sg_mb_update(w,b,grad_w,grad_b,learning_rate,no_hidden)
            valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
            train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
            #wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
            #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
            #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
            ct+=1        
    elif learn_algo=='mb':
           
            update_w= [i * 0 for i in w]
            update_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func,loss_func)
                    w,b,update_w,update_b=momentum_update(w,b,grad_w,grad_b,learning_rate,update_w,update_b,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                #wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                ct+=1
    elif learn_algo=='nag':  
            update_w= [i * 0 for i in w]
            update_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    w,b=nag_lookahead(w,b,learning_rate,update_w,update_b,no_hidden)
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func,loss_func)
                    w,b,update_w,update_b=nag_update(w,b,grad_w,grad_b,learning_rate,update_w,update_b,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                #wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                ct+=1
    elif learn_algo=='rmsprop':    
            v_w= [i * 0 for i in w]
            v_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func,loss_func)
                    w,b,v_w,v_b=rmsprop_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                #wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                ct+=1
    elif learn_algo=='adam':    
            v_w= [i * 0 for i in w]
            m_w= [i * 0 for i in w]
            v_b= [i * 0 for i in b]
            m_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func,loss_func)
                    w,b,v_w,v_b,m_w,m_b=adam_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,m_w,m_b,ct,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                #wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                ct+=1            
    elif learn_algo=='nadam':    
            v_w= [i * 0 for i in w]
            m_w= [i * 0 for i in w]
            v_b= [i * 0 for i in b]
            m_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func,loss_func)
                    w,b,v_w,v_b,m_w,m_b=nadam_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,m_w,m_b,ct,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                #wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc})
                ct+=1
    test_acc,test_loss=test_model(w,b,x_test,y_test,y_test_enc,act_func,no_hidden,loss_func,lam)
    wandb.log({"test accuracy":test_acc})
    #test_model_cf(w,b,x_test,y_test,y_test_enc,act_func,no_hidden,loss_func,lam)
    #return w,b
    #print('test_loss',test_loss,'test_Accuracy',test_acc)
    

In [200]:
sweep_config={'method':'random'} #set the type of search

In [201]:
metric={'name':'valid_acc','goal':'maximize'}
sweep_config['metric']=metric #set the metric

In [202]:
parameters_dict={'no_hidden':{'values':[3,4]},'size_hidden':{'values':[32,64,128]},'max_iterations':{'values':[10]},
    'bs':{'values':[32,64]},'learn_algo':{'values':['nadam','rmsprop','adam']}, 'lam':{'values':[0.005,0.0005]}, 'learning_rate':{'values':[0.001,0.001]},
                    'spec':{'values':['rand']},'act_func':{'values':['tanh']},'loss_func':{'values':['cross_entropy']}} #all parameters neede for the sweep

In [203]:
sweep_config['parameters']=parameters_dict #add to the sweep config

In [198]:
sweep_id=wandb.sweep(sweep_config,project='mnist') #create sweep id and pass it to the agent

Create sweep with ID: b0nwo1sq
Sweep URL: https://wandb.ai/as1_dl/mnist/sweeps/b0nwo1sq


In [205]:
def train2(config=None): #function that is used by wandb agent to call the training model and perform the sweep
    with wandb.init(config=config):  # this gets over-written in the Sweep
        size_hidden=[]
        config = wandb.config
        no_hidden=config.no_hidden
        #no_hidden=4
        for i in range(no_hidden): #loop to initialise each layer with the same number of neurons. This is done only for the sweep. Normally different neurons can also be precribed
            size_hidden.append(config.size_hidden)
    
        max_iterations=config.max_iterations 
        bs=config.bs
        learning_rate=config.learning_rate
        learn_algo=config.learn_algo
        lam=config.lam
        spec=config.spec
        act_func=config.act_func
        loss_func=config.loss_func
        train_model(no_hidden,size_hidden,bs,max_iterations,learning_rate,learn_algo,lam,spec,act_func,loss_func) #obtains the cross validation accuracy score
        #wandb.log({"Accuracy":acc}) #stores the score for each parameter search

wandb.agent(sweep_id, function=train2,count=20)

wandb: Agent Starting Run: f7ilk1w1 with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 32
wandb: 	spec: rand


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▆▇▇██████
test accuracy,▁
val_accuracy,▁▆▇▇██████

0,1
accuracy,0.9745
test accuracy,0.9567
val_accuracy,0.9588


wandb: Agent Starting Run: ud203tmu with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 64
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▄▅▆▇▇▇███
test accuracy,▁
val_accuracy,▁▄▆▇▇█████

0,1
accuracy,0.9856
test accuracy,0.97
val_accuracy,0.9638


wandb: Agent Starting Run: cwwvvhpa with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 128
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▄▆▆▇█████
test accuracy,▁
val_accuracy,▁▅▆▆▇█████

0,1
accuracy,0.9869
test accuracy,0.9699
val_accuracy,0.9718


wandb: Agent Starting Run: 2or9tr7z with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 32
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▆▇███████
test accuracy,▁
val_accuracy,▁▆▇███████

0,1
accuracy,0.9659
test accuracy,0.9505
val_accuracy,0.947


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 3igpi7ue with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 64
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▅▇▇▇▇████
test accuracy,▁
val_accuracy,▁▆▆▇▇█████

0,1
accuracy,0.9735
test accuracy,0.9571
val_accuracy,0.9578


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: qypd2j2f with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 128
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▅▆▇▇▇████
test accuracy,▁
val_accuracy,▁▆▇▇▇▇████

0,1
accuracy,0.9788
test accuracy,0.9629
val_accuracy,0.9645


wandb: Agent Starting Run: xluu38ef with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: rmsprop
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 32
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9608


wandb: Agent Starting Run: 7x2jo5xn with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: rmsprop
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 64
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9711


wandb: Agent Starting Run: 5glqqryr with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: rmsprop
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 128
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.969


wandb: Agent Starting Run: usqp9uha with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: rmsprop
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 32
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9578


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: gbm3wkxy with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: rmsprop
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 64
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9547


wandb: Agent Starting Run: 9xovj5pr with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: rmsprop
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 128
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9595


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: slyy8p0d with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: adam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 32
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9583


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: b280haw9 with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: adam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 64
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9695


wandb: Agent Starting Run: pwu1qrtw with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: adam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 128
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9723


wandb: Agent Starting Run: 9cuoa0xz with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: adam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 32
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9485


wandb: Agent Starting Run: jfd3db4z with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: adam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 64
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9534


wandb: Agent Starting Run: p1b0v0cn with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.005
wandb: 	learn_algo: adam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 128
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test accuracy,▁

0,1
test accuracy,0.9703


wandb: Agent Starting Run: w7j7n9du with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.0005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 32
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▆▇▇▇█████
test accuracy,▁
val_accuracy,▁▆▇▇██████

0,1
accuracy,0.9766
test accuracy,0.9591
val_accuracy,0.9582


wandb: Agent Starting Run: bk0nsf0u with config:
wandb: 	act_func: tanh
wandb: 	bs: 32
wandb: 	lam: 0.0005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 64
wandb: 	spec: rand


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▅▆▇▇▇████
test accuracy,▁
val_accuracy,▁▅▇▇▇█████

0,1
accuracy,0.9867
test accuracy,0.9684
val_accuracy,0.9685


In [114]:
def test_model_cf(w,b,x_test,y_test,y_test_enc,act_func,no_hidden,loss_func,lam):  #obtain the confusion matrix
    wandb.init(project='confusion matrix')
    an=[]
    hn=[]
    hn.append(x_test.T)
    class_name=['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle boot']
    for i in range(no_hidden):
        an.append(np.dot(w[i],hn[i])+b[i])
        hn.append(activ_func(an[i],act_func))
    a_fn=np.dot(w[no_hidden],hn[no_hidden])+b[no_hidden]
    y_pred1=func_softmax(a_fn)
    y_final=np.empty(x_test.shape[0])
    for i in range(x_test.shape[0]):
        y_final[i]=y_pred1[i].argmax()
    #loss= compute_loss(y_pred1,y_test_enc,w,loss_func,lam)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,preds=y_final,y_true=y_test,class_names=class_name)})