In [1]:
from tensorflow import keras
from keras.datasets import fashion_mnist
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import math
import pprint
import wandb

In [2]:
wandb.login() 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: as1_dl (use `wandb login --relogin` to force relogin)


True

In [3]:
def batch_split(batch_size,X_train,Y_train,X_test,Y_test): #Split the data into batches
    x_train, y_train_cat = shuffle(X_train,Y_train)
    x_test,y_test=shuffle(X_test,Y_test)
    x_train = x_train.reshape(x_train.shape[0],784)
    x_test = x_test.reshape(x_test.shape[0],784)
    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train_cat, test_size=0.1)
    x_train=x_train/np.max(x_train)
    x_train_raw=x_train
    x_test=x_test/np.max(x_test)
    x_valid=x_valid/np.max(x_valid)
    #y_train=y_train.reshape(len(y_train), 1)
    enc=OneHotEncoder(sparse=False)
    y_train_enc = enc.fit_transform(np.array(y_train.reshape(len(y_train), 1)))
    y_test_enc = enc.fit_transform(np.array(y_test.reshape(len(y_test), 1)))
    y_valid_enc = enc.fit_transform(np.array(y_valid.reshape(len(y_valid), 1)))
    y_train_bs=[]
    x_train_bs=[]
    if x_train.shape[0]%batch_size==0: #When all the batch sizes can be equal
        x_train_bs=np.vsplit(x_train,int(x_train.shape[0]/batch_size))
        y_train_bs=np.vsplit(y_train_enc,int(x_train.shape[0]/batch_size))
    else: #When all the batch sizes except one are be equal
        x_train_bs=np.vsplit(x_train[0:x_train.shape[0]-x_train.shape[0]%batch_size],math.floor(x_train[0:x_train.shape[0]-x_train.shape[0]%batch_size].shape[0]/batch_size))
        x_train_bs.append(x_train[x_train.shape[0]-x_train.shape[0]%batch_size:x_train.shape[0]])
        y_train_bs=np.vsplit(y_train_enc[0:x_train.shape[0]-x_train.shape[0]%batch_size],math.floor(x_train[0:x_train.shape[0]-x_train.shape[0]%batch_size].shape[0]/batch_size))
        y_train_bs.append(y_train_enc[x_train.shape[0]-x_train.shape[0]%batch_size:x_train.shape[0]])
    return x_train_raw,x_train_bs,y_train,y_train_bs,y_train_enc,x_test,y_test,y_test_enc,x_valid,y_valid,y_valid_enc

def activ_func(x,act_func):  #activation function 
    if act_func=='logistic':
        log_func =1 / (1 + np.exp(-x))
        return log_func
    elif act_func=='tanh':
        return np.tanh(x)
    elif act_func=='relu':
        return np.maximum(x, 0)
    
def d_activ_func(x,act_func): #derivative of the activation function
    if act_func=='logistic':
        return activ_func(x,act_func)*(1-activ_func(x,act_func))
    elif act_func=='tanh':
        return 1-np.square(activ_func(x,act_func))
    elif act_func=='relu':
        return (x > 0) * 1

def func_softmax(x): #output function as softmax
    sf=[]
    e_x = np.exp(x)
    for i in range(x.shape[1]):
        sf.append(e_x[:,i]/np.sum(e_x[:,i]))
    return sf

def compute_loss(y_predicted,y,w,loss_func,lam): 
    samp=len(y)
    loss=0
    loss_w=0 #loss from the regularisation
    for i in range(len(w)):
        loss_w=loss_w+(lam*(np.sum(np.square(w[i])))/(2*samp))
    if loss_func=='cross_entropy':
        for i in range(samp):
            loss=loss+np.divide(-np.dot(np.log(y_predicted[i]),np.transpose(y[i])),samp)
    elif loss_func=='sq_error':    
        for i in range(samp):
            loss=loss+np.divide(np.sum(np.square(y_predicted[i]-y[i])),samp)
    return loss+loss_w

def initialize_params(no_hidden,size_hidden,spec): #initialises weights and biases
    mf=1e-2 #using this multiplication factor gives a much better performance. standard random weights are just too damn big
    input_size=784
    no_output=10
    if spec=='xv':    #xavier initialisation
        w=[]
        w.append(mf*np.sqrt(1./size_hidden[0])*np.random.rand(size_hidden[0],input_size))
        b=[]
        b.append(0*np.random.rand(size_hidden[0],1))
        for i in range(no_hidden-1):
            w.append(mf*np.sqrt(1./size_hidden[i+1])*np.random.rand(size_hidden[i+1],size_hidden[i]))
            b.append(0*np.random.rand(size_hidden[i+1],1)) 
        w.append(mf*np.sqrt(1./no_output)*np.random.rand(no_output,size_hidden[-1]))
        b.append(0*np.random.rand(no_output,1))
        return w,b
    
    elif spec=='rand':    #random initialisation
        w=[]
        w.append(mf*np.random.rand(size_hidden[0],input_size))
        b=[]
        b.append(mf*np.random.rand(size_hidden[0],1))
        for i in range(no_hidden-1):
            w.append(mf*np.random.rand(size_hidden[i+1],size_hidden[i]))
            b.append(mf*np.random.rand(size_hidden[i+1],1)) 
        w.append(mf*np.random.rand(no_output,size_hidden[-1]))
        b.append(mf*np.random.rand(no_output,1))
        return w,b
    
def forward_prop(x,w,b,act_func,no_hidden): #forward propogation
    a=[] #pre activation
    h=[] #activation
    h.append(x.T)
    for i in range(no_hidden):
        a.append(np.dot(w[i],h[i])+b[i])
        h.append(activ_func(a[i],act_func))
    a_f=np.dot(w[no_hidden],h[no_hidden])+b[no_hidden]
    y_pred=func_softmax(a_f)
    return a,h,a_f,y_pred

def backward_prop(y,y_pred,w,b,h,a,a_f,no_hidden,lam,act_func): #back propogation
    grad_a=[None]*(no_hidden+1)#gradient of the pre-activation function 
    grad_w=[None]*(no_hidden+1)#gradient of the weights
    grad_b=[None]*(no_hidden+1)#gradient of the biases
    grad_h=[None]*(no_hidden)# gradient of the activation function
    grad_a[no_hidden]=-(y-y_pred)
    num=len(y)
    for k in range(no_hidden,-1,-1):
        grad_w[k]=np.divide(np.dot(h[k],grad_a[k]),num)+np.divide((lam*w[k].T),num)#gradient of the weights
        grad_b[k]= np.divide(np.sum(grad_a[k], axis=0, keepdims=True),num) #gradient of the biases
        if k >= 1:
            grad_h[k-1]=np.dot(grad_a[k],w[k])
            grad_a[k-1]=grad_h[k-1]*d_activ_func(a[k-1].T,act_func)
    return grad_b,grad_w 

def sg_mb_update(w,b,grad_w,grad_b,learning_rate,no_hidden): #minibatch gradient descent
    for i in range(no_hidden+1):
        w[i]=w[i]-(learning_rate*grad_w[i].T)
        b[i]=b[i]-(learning_rate*grad_b[i].T)
    return w,b

def momentum_update(w,b,grad_w,grad_b,learning_rate,update_w,update_b,no_hidden): #momentum based graddient descent
    sum_w=[i * 0 for i in w]
    sum_b=[i * 0 for i in b]
    gamma=0.9
    for i in range(no_hidden+1):
        sum_w[i]=gamma*update_w[i]+learning_rate*grad_w[i].T
        sum_b[i]=gamma*update_b[i]+learning_rate*grad_b[i].T
        w[i]=w[i]-sum_w[i]
        b[i]=b[i]-sum_b[i]
    update_w=sum_w
    update_b=sum_b
    return w,b,update_w,update_b

def nag_lookahead(w,b,learning_rate,update_w,update_b,no_hidden): #estimate the gradients of the lookahead point in NAG
    for i in range(no_hidden+1):
        w[i]=w[i]-learning_rate*update_w[i]
        b[i]=b[i]-learning_rate*update_b[i]
    return w,b

def nag_update(w,b,grad_w,grad_b,learning_rate,update_w,update_b,no_hidden): #final gradient update in NAG
    gamma=0.9
    sum_w=[i * 0 for i in w]
    sum_b=[i * 0 for i in b]
    for i in range(no_hidden+1):
        sum_w[i]=gamma*update_w[i]+learning_rate*grad_w[i].T
        sum_b[i]=gamma*update_b[i]+learning_rate*grad_b[i].T
        w[i]=w[i]-sum_w[i]
        b[i]=b[i]-sum_b[i]
    update_w=sum_w
    update_b=sum_b
    return w,b,update_w,update_b

def rmsprop_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,no_hidden): #RMSprop
    beta=0.9
    epsilon=1e-8
    sum_w=[i * 0 for i in w]
    sum_b=[i * 0 for i in b]
    for i in range(no_hidden+1):
        sum_w[i]=beta*v_w[i]+(1-beta)*(np.square(grad_w[i].T))
        sum_b[i]=beta*v_b[i]+(1-beta)*(np.square(grad_b[i].T))
        w[i]=w[i]-learning_rate*np.divide(grad_w[i].T,np.sqrt(sum_w[i]+epsilon))
        b[i]=b[i]-learning_rate*np.divide(grad_b[i].T,np.sqrt(sum_b[i]+epsilon))
    v_w=sum_w
    v_b=sum_b
    return w,b,v_w,v_b

def adam_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,m_w,m_b,ct,no_hidden): #Adam optimisation
    beta_1=0.9
    beta_2=0.99
    epsilon=1e-8
    sum_w_m=[i * 0 for i in w]
    sum_w_v=[i * 0 for i in w]
    sum_b_m=[i * 0 for i in b]
    sum_b_v=[i * 0 for i in b]
    m_cap_w=[i * 0 for i in w]
    m_cap_b=[i * 0 for i in b]
    v_cap_w=[i * 0 for i in w]
    v_cap_b=[i * 0 for i in b]
    for i in range(no_hidden+1):
        sum_w_m[i]=beta_1*m_w[i]+(1-beta_1)*grad_w[i].T
        sum_w_v[i]=beta_2*v_w[i]+(1-beta_2)*(np.square(grad_w[i].T))

        sum_b_m[i]=beta_1*m_b[i]+(1-beta_1)*grad_b[i].T
        sum_b_v[i]=beta_2*v_b[i]+(1-beta_2)*(np.square(grad_b[i].T))

        m_cap_w[i]=np.divide(sum_w_m[i],(1-math.pow(beta_1,ct)))
        v_cap_w[i]=np.divide(sum_w_v[i],(1-math.pow(beta_2,ct)))

        m_cap_b=np.divide(sum_b_m[i],(1-math.pow(beta_1,ct)))
        v_cap_b=np.divide(sum_b_v[i],(1-math.pow(beta_2,ct)))

        w[i]=w[i]-(learning_rate*np.divide(m_cap_w[i],np.sqrt(v_cap_w[i]+epsilon)))
        b[i]=b[i]-(learning_rate*np.divide(m_cap_b[i],np.sqrt(v_cap_b[i]+epsilon)))
    m_w=sum_w_m
    m_b=sum_b_m
    v_w=sum_w_v
    v_b=sum_b_v
    return w,b,v_w,v_b,m_w,m_b
def nadam_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,m_w,m_b,ct,no_hidden): #nadam optimisation
    beta_1=0.9
    beta_2=0.99
    epsilon=1e-8
    sum_w_m=[i * 0 for i in w]
    sum_w_v=[i * 0 for i in w]
    sum_b_m=[i * 0 for i in b]
    sum_b_v=[i * 0 for i in b]
    m_cap_w=[i * 0 for i in w]
    m_cap_b=[i * 0 for i in b]
    v_cap_w=[i * 0 for i in w]
    v_cap_b=[i * 0 for i in b]
    for i in range(no_hidden+1):
        sum_w_m[i]=beta_1*m_w[i]+(1-beta_1)*grad_w[i].T
        sum_w_v[i]=beta_2*v_w[i]+(1-beta_2)*(np.square(grad_w[i].T))

        sum_b_m[i]=beta_1*m_b[i]+(1-beta_1)*grad_b[i].T
        sum_b_v[i]=beta_2*v_b[i]+(1-beta_2)*(np.square(grad_b[i].T))

        m_cap_w[i]=np.divide(beta_1*sum_w_m[i],(1-math.pow(beta_1,ct)))+np.divide((1-beta_1)*grad_w[i].T,(1-math.pow(beta_1,ct)))
        v_cap_w[i]=np.divide(sum_w_v[i],(1-math.pow(beta_2,ct)))

        m_cap_b[i]=np.divide(beta_1*sum_b_m[i],(1-math.pow(beta_1,ct)))+np.divide((1-beta_1)*grad_b[i].T,(1-math.pow(beta_1,ct)))
        v_cap_b[i]=np.divide(sum_b_v[i],(1-math.pow(beta_2,ct)))

        w[i]=w[i]-(learning_rate*np.divide(m_cap_w[i],np.sqrt(v_cap_w[i]+epsilon)))
        b[i]=b[i]-(learning_rate*np.divide(m_cap_b[i],np.sqrt(v_cap_b[i]+epsilon)))
    m_w=sum_w_m
    m_b=sum_b_m
    v_w=sum_w_v
    v_b=sum_b_v
    return w,b,v_w,v_b,m_w,m_b

def test_model(w,b,x_test,y_test,y_test_enc,act_func,no_hidden,loss_func,lam):  #test the parameters onn a given dataset
    an=[]
    hn=[]
    hn.append(x_test.T)
    for i in range(no_hidden):
        an.append(np.dot(w[i],hn[i])+b[i])
        hn.append(activ_func(an[i],act_func))
    a_fn=np.dot(w[no_hidden],hn[no_hidden])+b[no_hidden]
    y_pred1=func_softmax(a_fn)
    y_final=np.empty(x_test.shape[0])
    for i in range(x_test.shape[0]):
        y_final[i]=y_pred1[i].argmax()
    loss= compute_loss(y_pred1,y_test_enc,w,loss_func,lam)
    return round(accuracy_score(y_test, y_final),4),round(loss,4)

def train_model(no_hidden,size_hidden,bs,max_iterations,learning_rate,learn_algo,lam,spec,act_func,loss_func): #execute this to train your data
#no_hidden- Number of hidden layers,size_hidden - Array containing size of each layer
#bs- batch size,max_iterations-Number of epochs,learning_rate-learning rate,
#learn_algo- Optimisation algorithm ('sg'-stochastic gradient descent and minibatch gradient descent(specify sizes appropriately)
#'mb'-mini_batch gradient descent,'nag'-NAG, 'rmsprop'-RMSProp,'adam'-ADAM, 'nadam'-NADAm
#lam-L-2 regularisation parameter,spec-weights and biases initialisation(xv-Xavier and 'random'-Random),
#act_func- activation function ('logistic'-sigmoid function, 'tanh'-tanh function, 'relu'-Relu function)
    ct=1;
    (X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data() #load the data
    input_size=784 #input of 784 pixels
    no_output=10 #10 classes
    x_train_raw,x_train,y_train_raw,y_train,y_train_enc,x_test,y_test,y_test_enc,x_valid,y_valid,y_valid_enc=batch_split(bs,X_train,Y_train,X_test,Y_test) #split the data to training, test and validation
    no_batch=len(x_train)
    w,b=initialize_params(no_hidden,size_hidden,spec)
    
    if learn_algo=='sg':
        while ct<=max_iterations:
            for eg in range(no_batch):
                a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func)
                w,b=sg_mb_update(w,b,grad_w,grad_b,learning_rate,no_hidden)
            valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
            train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
            wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
            #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
            #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
            ct+=1        
    elif learn_algo=='mb':
           
            update_w= [i * 0 for i in w]
            update_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func)
                    w,b,update_w,update_b=momentum_update(w,b,grad_w,grad_b,learning_rate,update_w,update_b,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                ct+=1
    elif learn_algo=='nag':  
            update_w= [i * 0 for i in w]
            update_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    w,b=nag_lookahead(w,b,learning_rate,update_w,update_b,no_hidden)
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func)
                    w,b,update_w,update_b=nag_update(w,b,grad_w,grad_b,learning_rate,update_w,update_b,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                ct+=1
    elif learn_algo=='rmsprop':    
            v_w= [i * 0 for i in w]
            v_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func,)
                    w,b,v_w,v_b=rmsprop_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                ct+=1
    elif learn_algo=='adam':    
            v_w= [i * 0 for i in w]
            m_w= [i * 0 for i in w]
            v_b= [i * 0 for i in b]
            m_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func)
                    w,b,v_w,v_b,m_w,m_b=adam_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,m_w,m_b,ct,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                ct+=1            
    elif learn_algo=='nadam':    
            v_w= [i * 0 for i in w]
            m_w= [i * 0 for i in w]
            v_b= [i * 0 for i in b]
            m_b= [i * 0 for i in b]
            while ct<=max_iterations:
                for eg in range(no_batch):
                    a,h,a_f,y_pred=forward_prop(x_train[eg],w,b,act_func,no_hidden)
                    grad_b,grad_w=backward_prop(y_train[eg],y_pred,w,b,h,a,a_f,no_hidden,lam,act_func)
                    w,b,v_w,v_b,m_w,m_b=nadam_update(w,b,grad_w,grad_b,learning_rate,v_w,v_b,m_w,m_b,ct,no_hidden)
                valid_acc,valid_loss=test_model(w,b,x_valid,y_valid,y_valid_enc,act_func,no_hidden,loss_func,lam)
                train_acc,train_loss=test_model(w,b,x_train_raw,y_train_raw,y_train_enc,act_func,no_hidden,loss_func,lam)
                #print('valid_loss',valid_loss,'Valid_Accuracy',valid_acc)
                #print('train_loss',train_loss,'train_Accuracy',train_acc,'\n')
                wandb.log({"accuracy":train_acc , "val_accuracy": valid_acc,"val_loss": valid_loss,"loss": train_loss,"epochs":ct})
                ct+=1
   
    

In [4]:
sweep_config={'method':'grid'} #set the type of search

In [5]:
metric={'name':'acc','goal':'maximize'}
sweep_config['metric']=metric #set the metric

In [6]:
parameters_dict={'no_hidden':{'values':[3,4]},'size_hidden':{'values':[32]},'max_iterations':{'values':[10]},
    'bs':{'values':[64]},'learning_rate':{'values':[0.001]},'learn_algo':{'values':['sg','nadam']},'lam':{'values':[0.0005]},
                    'spec':{'values':['rand']},'act_func':{'values':['tanh']},'loss_func':{'values':['cross_entropy']}} #all parameters neede for the sweep

In [7]:
sweep_config['parameters']=parameters_dict #add to the sweep config

In [8]:
pprint.pprint(sweep_config) #print config 

{'method': 'grid',
 'metric': {'goal': 'maximize', 'name': 'acc'},
 'parameters': {'act_func': {'values': ['tanh']},
                'bs': {'values': [64]},
                'lam': {'values': [0.0005]},
                'learn_algo': {'values': ['sg', 'nadam']},
                'learning_rate': {'values': [0.001]},
                'loss_func': {'values': ['cross_entropy']},
                'max_iterations': {'values': [10]},
                'no_hidden': {'values': [3, 4]},
                'size_hidden': {'values': [32]},
                'spec': {'values': ['rand']}}}


In [9]:
sweep_id=wandb.sweep(sweep_config,project='test-project') #create sweep id and pass it to the agent

Create sweep with ID: m7wuu851
Sweep URL: https://wandb.ai/as1_dl/test-project/sweeps/m7wuu851


In [10]:
def train2(config=None): #function that is used by wandb agent to call the training model and perform the sweep
    with wandb.init(config=config):  # this gets over-written in the Sweep
        size_hidden=[]
        config = wandb.config
        no_hidden=config.no_hidden
        #no_hidden=4
        for i in range(no_hidden): #loop to initialise each layer with the same number of neurons. This is done only for the sweep. Normally different neurons can also be precribed
            size_hidden.append(config.size_hidden)
    
        max_iterations=config.max_iterations 
        bs=config.bs
        learning_rate=config.learning_rate
        learn_algo=config.learn_algo
        lam=config.lam
        spec=config.spec
        act_func=config.act_func
        loss_func=config.loss_func
        train_model(no_hidden,size_hidden,bs,max_iterations,learning_rate,learn_algo,lam,spec,act_func,loss_func) #obtains the cross validation accuracy score
        #wandb.log({"Accuracy":acc}) #stores the score for each parameter search

wandb.agent(sweep_id, function=train2)

wandb: Agent Starting Run: hkvxyxxi with config:
wandb: 	act_func: tanh
wandb: 	bs: 64
wandb: 	lam: 0.0005
wandb: 	learn_algo: sg
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 32
wandb: 	spec: rand
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▁▁▁▁▁▁▁█▁
epochs,▁▂▃▃▄▅▆▆▇█
loss,██▆▆▆▆▃▃▁▁
val_accuracy,▂▂▂▂▂▂▂▂█▁
val_loss,████▅▅▅▁▁▁

0,1
accuracy,0.1006
epochs,10.0
loss,2.3022
val_accuracy,0.0942
val_loss,2.3023


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: lvglaj27 with config:
wandb: 	act_func: tanh
wandb: 	bs: 64
wandb: 	lam: 0.0005
wandb: 	learn_algo: sg
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 32
wandb: 	spec: rand
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epochs,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.1002
epochs,10.0
loss,2.3026
val_accuracy,0.0983
val_loss,2.3026


wandb: Agent Starting Run: swar74nk with config:
wandb: 	act_func: tanh
wandb: 	bs: 64
wandb: 	lam: 0.0005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 3
wandb: 	size_hidden: 32
wandb: 	spec: rand
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▄▆▇▇█████
epochs,▁▂▃▃▄▅▆▆▇█
loss,█▅▃▃▂▂▁▁▁▁
val_accuracy,▁▅▆▇██████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
accuracy,0.8891
epochs,10.0
loss,0.3086
val_accuracy,0.8762
val_loss,0.3513


wandb: Agent Starting Run: fmvu4q1x with config:
wandb: 	act_func: tanh
wandb: 	bs: 64
wandb: 	lam: 0.0005
wandb: 	learn_algo: nadam
wandb: 	learning_rate: 0.001
wandb: 	loss_func: cross_entropy
wandb: 	max_iterations: 10
wandb: 	no_hidden: 4
wandb: 	size_hidden: 32
wandb: 	spec: rand
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▄▆▇▇█████
epochs,▁▂▃▃▄▅▆▆▇█
loss,█▅▃▂▂▂▁▁▁▁
val_accuracy,▁▄▆▇██████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
accuracy,0.8843
epochs,10.0
loss,0.3371
val_accuracy,0.8645
val_loss,0.3848


wandb: Sweep Agent: Waiting for job.
wandb: Sweep Agent: Exiting.
