In [5]:
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Seed NumPy's global RNG: weight initialisation (np.random.randn) and the
# train/validation and mini-batch shuffles (np.random.shuffle) all draw from it.
np.random.seed(42)

#1) Forward propagation process:

In [7]:
def initialize_parameters(layer_dims):
    '''
    Description:
    Create the initial weights and biases for every layer of the network.

    Input:
    layer_dims – sequence with the size of each layer (entry 0 is the size of the flattened input, the last entry is the size of the softmax output)

    Output:
    a dictionary {'W': [...], 'b': [...]} holding one weight matrix and one bias column per layer, ordered from the first hidden layer to the output layer.
    '''
    mu, sigma = 0, 0.5  # mean and standard deviation of the normal distribution the weights are drawn from
    weights, biases = [], []
    # Walk over consecutive (previous layer, current layer) size pairs.
    for prev_size, cur_size in zip(layer_dims[:-1], layer_dims[1:]):
        # W has shape (current layer size, previous layer size); b starts at zero.
        weights.append(np.random.randn(cur_size, prev_size) * sigma + mu)
        biases.append(np.zeros((cur_size, 1)))
    return {'W': weights, 'b': biases}

In [8]:
def linear_forward(A, W, b):
    '''
    Description:
    Linear (pre-activation) step of a single layer: Z = W·A + b.

    Input:
    A – the activations of the previous layer
    W – the weight matrix of the current layer (of shape [size of current layer, size of previous layer])
    b – the bias vector of the current layer (of shape [size of current layer, 1])

    Output:
    Z – the value fed into the activation function
    linear_cache – a dictionary with the keys 'W', 'A', 'b', kept for the backward pass
    '''
    weighted_input = np.dot(W, A)
    Z = weighted_input + b  # b broadcasts across the example columns
    return Z, dict(W=W, A=A, b=b)

In [9]:
def softmax(Z): 
    '''
    Description:
    Column-wise softmax (each column of Z is one example).

    Input:
    Z – the linear component of the activation function

    Output:
    A – the activations of the layer; every column sums to 1
    activation_cache – returns Z, which will be useful for the backpropagation
    '''
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (which would turn
    # the division into inf/inf = nan) when the logits are large.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    A = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
    activation_cache = {'Z': Z}
    return A, activation_cache

In [10]:
def relu(Z):
    '''
    Description:
    Rectified linear unit, applied element-wise.

    Input:
    Z – the linear component of the activation function

    Output:
    A – max(0, Z) for every element of Z
    activation_cache – a dictionary holding Z under the key 'Z', needed for the backward pass
    '''
    return np.maximum(0, Z), dict(Z=Z)

In [11]:
def linear_activation_forward(A_prev, W, B, activation):
    '''
    Description:
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Input:
    A_prev – activations of the previous layer
    W – the weights matrix of the current layer
    B – the bias vector of the current layer
    activation – the activation function to be used (a string, either "softmax" or "relu")

    Output:
    A – the activations of the current layer
    cache – a joint dictionary containing both linear_cache ('W', 'A', 'b') and activation_cache ('Z')

    Raises:
    ValueError – if activation is neither "softmax" nor "relu"
    '''
    # Reject unsupported activations before doing any work; previously an
    # unknown name fell through both branches and failed later with an
    # unhelpful UnboundLocalError on `A`.
    if activation not in ("softmax", "relu"):
        raise ValueError(f'Unsupported activation {activation!r}; expected "softmax" or "relu"')

    Z, linear_cache = linear_forward(A_prev, W, B)  # linear part
    if activation == "softmax":
        A, activation_cache = softmax(Z)
    else:
        A, activation_cache = relu(Z)

    # Merge both caches into one flat dictionary for the backward pass.
    cache = {**linear_cache, **activation_cache}
    return A, cache

In [12]:
def L_model_forward(X, parameters, use_batchnorm):
    '''
    Description:
    Forward pass through the whole network: [LINEAR->RELU]*(L-1) followed by LINEAR->SOFTMAX.

    Input:
    X – the data, numpy array of shape (input size, number of examples)
    parameters – dictionary with the lists 'W' and 'b' (one entry per layer)
    use_batchnorm – boolean flag; when true, the input of every ReLU layer (including X itself) is passed through apply_batchnorm first. The input of the final softmax layer is not normalised.

    Output:
    AL – the last post-activation value (softmax output)
    caches – a list with the cache returned by linear_activation_forward for each layer, first layer first
    '''
    weights, biases = parameters['W'], parameters['b']
    last = len(weights) - 1
    caches = []
    activations = X

    # Hidden layers: LINEAR -> RELU
    for idx, W in enumerate(weights[:-1]):
        layer_input = apply_batchnorm(activations) if use_batchnorm else activations
        activations, cache = linear_activation_forward(layer_input, W, biases[idx], "relu")
        caches.append(cache)

    # Output layer: LINEAR -> SOFTMAX
    AL, cache = linear_activation_forward(activations, weights[last], biases[last], "softmax")
    caches.append(cache)
    return AL, caches

In [13]:
def compute_cost(AL, Y):
    '''
    Description:
    Categorical cross-entropy cost, averaged over the examples.

    Input:
    AL – probability vector corresponding to the label predictions, shape (num_of_classes, number of examples)
    Y – the ground-truth labels as a vector of integer class indices, one per example

    Output:
    cost – the cross-entropy cost
    '''
    eps = 1e-20  # keeps the logarithm finite when a probability is exactly 0
    num_examples = AL.shape[1]

    # One-hot encode the labels: row = class index, column = example index.
    one_hot = np.zeros(AL.shape)
    one_hot[Y, np.arange(len(Y))] = 1

    # Only the log-probability of the true class of each example survives the mask.
    masked_log_probs = np.multiply(np.log(AL + eps), one_hot)
    cost = - (1 / num_examples) * np.sum(masked_log_probs)
    return np.average(cost)

In [14]:
def apply_batchnorm(A):
    '''
    Description:
    Normalise every row of A (one row per unit) to zero mean and unit variance across the examples of the batch.

    Input:
    A - the activation values of a given layer, one column per example

    Output:
    NA - the normalized activation values: (A - mean) / sqrt(var + epsilon)
    '''
    epsilon = 1e-20  # avoids dividing by zero for rows with zero variance
    # keepdims leaves the statistics as column vectors so they broadcast over the columns of A.
    row_mean = A.mean(axis=1, keepdims=True)
    row_var = A.var(axis=1, keepdims=True)
    return (A - row_mean) / np.sqrt(row_var + epsilon)

#2) Backward propagation process:

In [15]:
def Linear_backward(dZ, cache):
    '''
    Description:
    Backward pass through the linear part of a single layer.

    Input:
    dZ – the gradient of the cost with respect to the linear output of the current layer (layer l)
    cache – dictionary with the keys 'A' (activations of the previous layer), 'W' and 'b', stored during the forward pass of this layer

    Output:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), returned as a 1-D array with one entry per unit
    '''
    # The bias is looked up with the rest of the cache but is not needed for the gradients.
    A_prev, W, _ = (cache[key] for key in ('A', 'W', 'b'))
    scale = 1 / A_prev.shape[1]  # average over the examples (columns)

    dW = scale * np.dot(dZ, A_prev.T)
    db = scale * np.sum(dZ, axis=1)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

In [16]:
def linear_activation_backward(dA, cache, activation):
    '''
    Description:
    Implements the backward propagation for the LINEAR->ACTIVATION layer. The function first computes dZ and then applies the Linear_backward function.

    Input:
    dA – post activation gradient of the current layer
    cache – contains both the linear cache and the activations cache
    activation – the activation used by this layer in the forward pass (a string, either "softmax" or "relu")

    Output:
    dA_prev – Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW – Gradient of the cost with respect to W (current layer l), same shape as W
    db – Gradient of the cost with respect to b (current layer l)

    Raises:
    ValueError – if activation is neither "softmax" nor "relu"
    '''
    # Fail early with a clear message; previously an unknown activation left
    # dZ unassigned and surfaced as an UnboundLocalError further down.
    if activation not in ("softmax", "relu"):
        raise ValueError(f'Unsupported activation {activation!r}; expected "softmax" or "relu"')

    if activation == "softmax":
        dZ = softmax_backward(dA, cache)
    else:
        dZ = relu_backward(dA, cache)

    # The same flat cache also carries 'A', 'W', 'b' for the linear step.
    dA_prev, dW, db = Linear_backward(dZ, cache)

    return dA_prev, dW, db

In [17]:
def relu_backward(dA, activation_cache):
    '''
    Description:
    Backward pass through a ReLU unit.

    Input:
    dA – the post-activation gradient
    activation_cache – dictionary holding Z (stored during the forward propagation) under the key 'Z'

    Output:
    dZ – gradient of the cost with respect to Z: dA where Z > 0, and 0 elsewhere
    '''
    # The derivative of ReLU is 1 for strictly positive inputs and 0 otherwise.
    positive_mask = np.greater(activation_cache['Z'], 0).astype(int)
    return dA * positive_mask

In [18]:
def softmax_backward(dA, activation_cache):
    '''
    Description:
    Backward pass through the softmax unit.

    Input:
    dA – the post-activation gradient; L_model_backward passes AL - one_hot(Y) here, so it is already the gradient with respect to Z
    activation_cache – contains Z (stored during the forward propagation); not used

    Output:
    dZ – gradient of the cost with respect to Z (dA multiplied by 1, i.e. the same values)
    '''
    return dA * 1

In [19]:
def L_model_backward(AL, Y, caches):
    '''
    Description:
    Backward propagation through the entire network.

    Some comments:
    the softmax backward step is applied once (output layer only); every other layer uses the ReLU backward step.

    Input:
    AL - the probabilities vector, the output of the forward propagation (L_model_forward)
    Y - the true labels vector (integer class index per example)
    caches - list with one cache per layer, as produced by L_model_forward

    Output:
    Grads - a dictionary with the gradients, keyed by the 0-based position i of the layer in caches:
                Grads["dA" + str(i)] = gradient with respect to the input activations of layer i
                Grads["dW" + str(i)] = gradient with respect to the weights of layer i
                Grads["db" + str(i)] = gradient with respect to the bias of layer i
    '''
    Grads = {}
    output_idx = len(caches) - 1

    # One-hot encode the labels so they have the same shape as AL.
    one_hot = np.zeros(AL.shape)
    one_hot[Y, np.arange(len(Y))] = 1

    # Softmax + cross-entropy: the gradient at the output is AL - Y.
    dA = AL - one_hot

    # Walk the layers from the output back to the first one.
    for idx in reversed(range(len(caches))):
        activation = "relu" if idx != output_idx else "softmax"
        dA, dW, db = linear_activation_backward(dA, caches[idx], activation)
        Grads.update({f"dA{idx}": dA, f"dW{idx}": dW, f"db{idx}": db})
    return Grads

In [20]:
def Update_parameters(parameters, grads, learning_rate):
    '''
    Description:
    One step of plain gradient descent. The weight and bias arrays are modified in place.

    Input:
    parameters – dictionary with the lists 'W' and 'b' holding the network's parameters
    grads – dictionary with the gradients (generated by L_model_backward), keyed "dW<i>" / "db<i>"
    learning_rate – the learning rate used to update the parameters (the "alpha")

    Output:
    parameters – the same dictionary that was passed in, with updated values
    '''
    weight_list = parameters["W"]
    bias_list = parameters["b"]
    for idx in range(len(weight_list)):
        weight_list[idx] -= learning_rate * grads[f"dW{idx}"]
        # db arrives as a 1-D array; turn it into a column so it matches the bias shape.
        db_column = np.asarray(grads[f"db{idx}"])[:, np.newaxis]
        bias_list[idx] -= learning_rate * db_column

    return parameters

#3) Use the functions created in the previous sections to train the network and produce predictions

In [21]:
def L_layer_model(X, Y, layers_dims, learning_rate, num_iterations, batch_size, l2_regularization=False):
    '''
    Description:
    Trains an L-layer neural network with mini-batch gradient descent.
    All layers but the last use the ReLU activation function,
    and the final layer applies the softmax activation function.
    The size of the output layer should be equal to the number of labels in the data.

    The data is split internally into train/validation parts by get_splited_mnist.
    Training runs in outer "iterations" of up to 10 epochs each and stops as soon as
    the mean validation accuracy of an epoch does not exceed that of the previous
    epoch by at least 0.0001, or when the outer iteration counter reaches num_iterations.

    Whether batchnorm is used is NOT a parameter: it is read from the global
    variable use_batcnormalization, which must be defined before calling.

    Input:
    X – the input data, a numpy array of shape (height*width , number_of_examples)
    Comment: since the input is in grayscale we only have height and width, otherwise it would have been height*width*3
    Y – the "real" labels of the data, a vector with one integer class label per example
        (it is indexed per example and handed to compute_cost, which uses each entry as a row index)
    layers_dims – a list containing the dimensions of each layer, including the input
    learning_rate – step size handed to Update_parameters
    num_iterations – upper bound for the outer loop counter (the loop runs while the counter, starting at 1, is below it)
    batch_size – the number of examples in a single training batch.
    l2_regularization – when true, compute_cost is called with the parameters as a third argument;
        this requires the three-argument compute_cost defined later in the notebook to be in scope.

    Output:
    parameters – the parameters learnt by the system during the training (the same parameters that were updated in the Update_parameters function).
    costs – the values of the cost function (calculated by the compute_cost function).
    One value is saved for every 100th batch within an epoch.
    '''
    def batch_gen(x, y, batch_size):
        # Yield (inputs, labels) mini-batches over a fresh random permutation of the columns of x.
        suffeld_indices = list(range(x.shape[1]))
        np.random.shuffle(suffeld_indices)

        for i in range(0, x.shape[1], batch_size):
            generated_batch_indices = suffeld_indices[i:i + batch_size]  # the last batch may be shorter
            yield x[:, generated_batch_indices], y[generated_batch_indices]

    parameters = initialize_parameters(layers_dims) 
    iterations_amount = 0  # counts the recorded costs; incremented below but not returned
    costs = []

    (x_train, y_train), (x_valid, y_valid) = get_splited_mnist(X, Y) #split data

    last_valid_accuracy = 0
    valid_improvment = True
    train_iterations = 1

    min_valid_improvment = 0.0001  # smallest epoch-to-epoch accuracy gain that still counts as an improvement

    while valid_improvment and train_iterations<num_iterations:
        for i in range(1, 10 + 1):  # up to 10 epochs per outer iteration
            curr_iteration_mean_loss = []
            curr_iteration_valid_accs = []

            for batch_num, (batch, y) in enumerate(batch_gen(x_train, y_train, batch_size)):
                # use_batcnormalization is a global set by the experiment cells.
                AL, caches = L_model_forward(batch, parameters, use_batchnorm=use_batcnormalization)
                if l2_regularization: cost = compute_cost(AL, y, parameters) #add this for the bonus
                else: cost = compute_cost(AL, y)
                grads = L_model_backward(AL, y, caches)
                parameters = Update_parameters(parameters, grads, learning_rate)
                curr_iteration_mean_loss.append(cost)

                # Every 100th batch of the epoch: record the cost and measure validation accuracy.
                if (batch_num+1) % 100 == 0:
                    costs.append(cost)
                    iterations_amount += 1
                    valid_acc = Predict(x_valid, y_valid, parameters)
                    curr_iteration_valid_accs.append(valid_acc)


            mean_cost_for_epoch = np.average(curr_iteration_mean_loss)
            # NOTE(review): when an epoch has fewer than 100 batches the list below is empty;
            # np.average of it is presumably nan, which would end training after the first epoch — confirm.
            mean_accuracy_for_epoch = np.average(curr_iteration_valid_accs)

            #check stopping criterion: this epoch must beat the previous one by at least min_valid_improvment
            valid_improvment = mean_accuracy_for_epoch>=last_valid_accuracy + min_valid_improvment
            last_valid_accuracy = mean_accuracy_for_epoch

            print('-------------------------------------------------------------------')
            print(f"Iter: {train_iterations}, epoch {i} cost {np.round(mean_cost_for_epoch, 3)}, validation accuracy: {np.round(mean_accuracy_for_epoch*100, 3)}%")



            # Leave the epoch loop; the while condition then ends training as well.
            if not valid_improvment: break
                

        train_iterations += 1

    return parameters, costs

In [22]:
def forward_and_pred(X, parameters):
    '''
    Description:
    Auxiliary function that returns the predicted class index for every example (column) of X.
    The batchnorm setting is taken from the global use_batcnormalization.
    '''
    class_scores, _caches = L_model_forward(X, parameters, use_batchnorm=use_batcnormalization)
    # The prediction is the class (row) with the highest confidence score in each column.
    return np.argmax(class_scores, axis=0)

In [23]:
def Predict(X, Y, parameters):
    '''
    Description:
    Computes the accuracy of the trained neural network on the given data.

    Input:
    X – the input data, a numpy array of shape (height*width, number_of_examples)
    Y – the "real" labels of the data, a vector with one class label per example (compared element-wise with the predicted class indices)
    parameters – a python dictionary containing the DNN architecture's parameters

    Output:
    accuracy – the fraction (between 0 and 1) of the samples for which the correct label receives the highest confidence score
    '''
    predicted_labels = forward_and_pred(X, parameters)
    num_correct = np.sum(predicted_labels == Y)
    return num_correct / Y.shape[0]

#4) Use the code you wrote to classify the MNIST dataset and present a summary report


## a.	
You may use publicly available code to download and preprocess the data. Note that there is a predefined division between the train and test set. Use 20% of the training set as a validation set (samples need to be randomly chosen)

In [24]:
from keras.datasets import mnist
# Load MNIST with its predefined train/test division.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Flatten every image into one column (features x examples) and divide by 255.
x_train = x_train.reshape(len(x_train), -1).T / 255
x_test = x_test.reshape(len(x_test), -1).T / 255

def get_splited_mnist(x_train, y_train):
    '''
    Description:
    Randomly splits the training data into a new training set and a validation set holding 20% of the examples.

    Input:
    x_train – input data with one example per column
    y_train – label vector with one entry per example

    Output:
    (x_train, y_train), (x_validation, y_validation) – the two parts of the split
    '''
    # Shuffle the example (column) indices using NumPy's global RNG.
    shuffled = np.arange(x_train.shape[1])
    np.random.shuffle(shuffled)

    # The first 20% of the shuffled indices form the validation set, the rest stay for training.
    n_valid = int(len(shuffled) * 0.2)
    valid_idx = shuffled[:n_valid]
    train_idx = shuffled[n_valid:]

    validation_part = (x_train[:, valid_idx], y_train[valid_idx])
    train_part = (x_train[:, train_idx], y_train[train_idx])
    return train_part, validation_part


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


## b.	Run your network using the following configuration:<br/>

•	4 layers (aside from the input layer), with the following sizes: 20,7,5,10<br/>

•	Do not activate the batchnorm option at this point<br/>

•	The input at each iteration needs to be “flattened” to a matrix of [m,784], where m is the number of samples<br/>

•	Use a learning rate of 0.009<br/>

•	Train the network until there is no improvement on the validation set (or the improvement is very small) for 100 training steps (this is the stopping criterion). Please include in the report the number of iterations and epochs needed to train your network. Also, specify the batch size.

In [25]:

# Experiment 4b: train without batchnorm.
# use_batcnormalization is a global flag read inside L_layer_model and forward_and_pred.
use_batcnormalization = False
batch_size = 64
parameters, costs = L_layer_model(x_train, y_train,
                                  layers_dims=[784, 20, 7, 5, 10],
                                  learning_rate=0.009,
                                  num_iterations=100,
                                  batch_size=batch_size)

# NOTE(review): get_splited_mnist reshuffles with the global RNG, so this split is a new random
# partition rather than the one L_layer_model drew internally. The "train" and "validation"
# accuracies below are therefore likely measured on a mix of examples the model did and
# did not train on — confirm whether that is intended.
(x_train_for_accuracy, y_train_for_accuracy), (x_valid_for_accuracy, y_valid_for_accuracy) = get_splited_mnist(x_train, y_train)  # split with 20 percent validation

print('Results summary:')
print('Batch size =', batch_size)

# Predict returns a fraction in [0, 1]; it is scaled to a percentage for display.
train_acc = Predict(x_train_for_accuracy, y_train_for_accuracy, parameters)
print(f'Train accuracy: {np.round(train_acc*100, 3)}%')

valid_acc = Predict(x_valid_for_accuracy, y_valid_for_accuracy, parameters)
print(f'Validation accuracy: {np.round(valid_acc*100, 3)}%')

test_acc = Predict(x_test, y_test, parameters)
print(f'Test accuracy: {np.round(test_acc*100, 3)}%')

-------------------------------------------------------------------
Iter: 1, epoch 1 cost 2.264, validation accuracy: 20.0%
-------------------------------------------------------------------
Iter: 1, epoch 2 cost 1.986, validation accuracy: 28.344%
-------------------------------------------------------------------
Iter: 1, epoch 3 cost 1.861, validation accuracy: 32.606%
-------------------------------------------------------------------
Iter: 1, epoch 4 cost 1.765, validation accuracy: 35.031%
-------------------------------------------------------------------
Iter: 1, epoch 5 cost 1.668, validation accuracy: 39.182%
-------------------------------------------------------------------
Iter: 1, epoch 6 cost 1.6, validation accuracy: 40.83%
-------------------------------------------------------------------
Iter: 1, epoch 7 cost 1.545, validation accuracy: 43.287%
-------------------------------------------------------------------
Iter: 1, epoch 8 cost 1.49, validation accuracy: 46.264

## c.	Please include the following details in your report:<br/>
•	The final accuracy values for the train, validation and test sets.<br/>
•	The cost value for each 100 training steps. Please make sure that the index of the training step will also be included in the report. Print the values from the L_layer_model<br/><br/>
## d.	All the information requested above will be included in a .docx file uploaded with the code.

#5) 
Repeat section 4 when the batchnorm function is “on”. Analyze and compare this experiment to the previous one (performance, running time, number of training steps etc.). There is no need to update the parameters of the batchnorm in the way described in the lecture (that is, use $z_{norm}^{(i)}$ and not $\tilde{z}^{(i)}$).

In [26]:
# Experiment 5: same configuration, but with the global batchnorm flag switched on.
use_batcnormalization = True

# Note: `costs` from the previous experiment is overwritten here; batch_size is reused from the earlier cell.
parameters_batcnorm, costs = L_layer_model(x_train, y_train,
                                  layers_dims=[784, 20, 7, 5, 10],
                                  learning_rate=0.009,
                                  num_iterations=100,
                                  batch_size=batch_size)

print('Results summary:')
print('Batch size =', batch_size)

# The *_for_accuracy splits were created in the previous experiment cell, so that cell must run first.
# Predict goes through forward_and_pred, which still reads use_batcnormalization (True here).
train_acc = Predict(x_train_for_accuracy, y_train_for_accuracy, parameters_batcnorm)
print(f'Train accuracy: {np.round(train_acc*100, 3)}%')

valid_acc = Predict(x_valid_for_accuracy, y_valid_for_accuracy, parameters_batcnorm)
print(f'Validation accuracy: {np.round(valid_acc*100, 3)}%')

test_acc = Predict(x_test, y_test, parameters_batcnorm)
print(f'Test accuracy: {np.round(test_acc*100, 3)}%')

-------------------------------------------------------------------
Iter: 1, epoch 1 cost 1.973, validation accuracy: 31.249%
-------------------------------------------------------------------
Iter: 1, epoch 2 cost 1.552, validation accuracy: 45.976%
-------------------------------------------------------------------
Iter: 1, epoch 3 cost 1.284, validation accuracy: 57.006%
-------------------------------------------------------------------
Iter: 1, epoch 4 cost 1.099, validation accuracy: 64.694%
-------------------------------------------------------------------
Iter: 1, epoch 5 cost 0.968, validation accuracy: 71.957%
-------------------------------------------------------------------
Iter: 1, epoch 6 cost 0.85, validation accuracy: 78.11%
-------------------------------------------------------------------
Iter: 1, epoch 7 cost 0.744, validation accuracy: 81.748%
-------------------------------------------------------------------
Iter: 1, epoch 8 cost 0.649, validation accuracy: 84

#6)	
Bonus 10%: modify the code so that it supports the L2 norm functionality. In addition to the code, please provide a short explanation about the changes done in the code. Compare the values of the weights of your architecture with and without this change.

#Solution:
In order to support L2 regularization, we add the regularization term to the loss function and update the parameters accordingly. This requires changing two functions:
compute_cost and Update_parameters.

In [51]:
def compute_cost(AL, Y, parameters=None, regularization_param=1):
    '''
    Description:
    Categorical cross-entropy cost with an optional L2 penalty on the weights.

    Input:
    AL – probability vector corresponding to the label predictions, shape (num_of_classes, number of examples)
    Y – the ground-truth labels as a vector of integer class indices, one per example
    parameters – optional dictionary with the list 'W' of weight matrices; when given (and non-empty), the sum of all squared weights is added to the cost, scaled by regularization_param / (2 * number of examples)
    regularization_param – strength of the L2 penalty

    Output:
    cost – the cross-entropy cost plus the L2 term (the term is 0 when parameters is not given)
    '''
    squared_weight_sum = 0
    if parameters:
        squared_weight_sum = np.sum([np.sum(np.square(W)) for W in parameters['W']])

    eps = 1e-20  # keeps the logarithm finite when a probability is exactly 0
    num_examples = AL.shape[1]

    # One-hot encode the labels: row = class index, column = example index.
    one_hot = np.zeros(AL.shape)
    one_hot[Y, np.arange(len(Y))] = 1

    masked_log_probs = np.multiply(np.log(AL + eps), one_hot)
    cross_entropy = - (1 / num_examples) * np.sum(masked_log_probs)
    l2_scale = regularization_param / (2 * num_examples)
    return l2_scale * squared_weight_sum + np.average(cross_entropy)

In [52]:
def Update_parameters(parameters, grads, learning_rate, regularization_param=1, num_examples=None): 
    '''
    Description:
    Updates parameters using gradient descent with L2 regularization (weight decay).
    The weight and bias arrays are modified in place; biases are not regularized.

    Note: because regularization_param defaults to 1, every caller that does not pass it
    (L_layer_model calls this function with three arguments) gets the L2 term once this
    definition has replaced the earlier Update_parameters.

    Input:
    parameters – a python dictionary containing the DNN architecture's parameters (lists 'W' and 'b')
    grads – a python dictionary containing the gradients (generated by L_model_backward)
    learning_rate – the learning rate used to update the parameters (the "alpha")
    regularization_param – strength of the L2 penalty; 0 disables it
    num_examples – number of examples the gradients were averaged over, used as the denominator of the
        L2 term. When omitted, the notebook-level variable batch_size is used, as before.

    Output:
    parameters – the updated values of the parameters object provided as input
    '''
    if num_examples is None:
        # Backward-compatible fallback: the original implementation always read the global batch_size.
        num_examples = batch_size

    wegihts, bias = parameters["W"], parameters["b"]

    for i in range(len(wegihts)):
        # Gradient of (regularization_param / (2m)) * sum(W^2) with respect to W.
        reg_add = (regularization_param / num_examples) * wegihts[i]
        wegihts[i] -= learning_rate * (grads[f"dW{i}"] + reg_add)
        # db may arrive as a 1-D array or as a column; align it with the bias shape either way.
        reshaped_db = np.reshape(grads[f"db{i}"], np.shape(bias[i]))
        bias[i] -= learning_rate * reshaped_db

    return parameters

In [53]:
# Experiment 6 (bonus): L2 regularization, batchnorm off.
use_batcnormalization = False

# l2_regularization=True makes L_layer_model call compute_cost with the parameters as a third
# argument, so the L2-aware compute_cost and Update_parameters cells must have been run first.
parameters_l2, costs_l2 = L_layer_model(x_train, y_train,
                                  layers_dims=[784, 20, 7, 5, 10],
                                  learning_rate=0.009,
                                  num_iterations=100,
                                  batch_size=batch_size,
                                  l2_regularization=True)

print('Results summary:')
print('Batch size =', batch_size)

# The *_for_accuracy splits come from the first experiment cell.
train_acc = Predict(x_train_for_accuracy, y_train_for_accuracy, parameters_l2)
print(f'Train accuracy: {np.round(train_acc*100, 3)}%')

valid_acc = Predict(x_valid_for_accuracy, y_valid_for_accuracy, parameters_l2)
print(f'Validation accuracy: {np.round(valid_acc*100, 3)}%')

test_acc = Predict(x_test, y_test, parameters_l2)
print(f'Test accuracy: {np.round(test_acc*100, 3)}%')

-------------------------------------------------------------------
Iter: 1, epoch 1 cost 30.092, validation accuracy: 11.792%
-------------------------------------------------------------------
Iter: 1, epoch 2 cost 24.735, validation accuracy: 15.188%
-------------------------------------------------------------------
Iter: 1, epoch 3 cost 20.373, validation accuracy: 19.43%
-------------------------------------------------------------------
Iter: 1, epoch 4 cost 16.779, validation accuracy: 25.288%
-------------------------------------------------------------------
Iter: 1, epoch 5 cost 13.821, validation accuracy: 33.686%
-------------------------------------------------------------------
Iter: 1, epoch 6 cost 11.235, validation accuracy: 46.258%
-------------------------------------------------------------------
Iter: 1, epoch 7 cost 9.019, validation accuracy: 60.831%
-------------------------------------------------------------------
Iter: 1, epoch 8 cost 7.35, validation accura

In [59]:
# First-layer weight matrix of the model stored in `parameters` (the run trained with
# l2_regularization left at its default), shown for comparison with the L2 run.
parameters['W'][0]

array([[ 0.24835708, -0.06913215,  0.32384427, ..., -0.66767218,
         0.19009893,  0.30529287],
       [ 0.27989522,  0.54039036,  0.41696108, ..., -0.14066378,
         0.03349536,  0.25796961],
       [-0.78127293, -0.26452634,  0.39713234, ..., -0.19600632,
         0.52996819,  0.30850297],
       ...,
       [ 0.02769846,  0.21752419, -0.41420161, ..., -1.11296771,
        -0.39377572, -0.23497894],
       [-0.11799005, -0.24497201, -1.27501315, ..., -0.81072631,
        -0.3751294 ,  0.2941184 ],
       [ 0.36096063, -0.32585207, -0.06707666, ..., -0.47945705,
        -0.36671258, -0.00550316]])

In [58]:
# First-layer weight matrix of the model trained with l2_regularization=True.
parameters_l2['W'][0]

array([[ 0.00103693, -0.00202308,  0.00057438, ..., -0.01789345,
         0.01167333,  0.01206085],
       [-0.01791017,  0.00131053,  0.01696928, ...,  0.02101566,
         0.00231083, -0.00573269],
       [-0.02229284,  0.00412522,  0.00324807, ...,  0.02210704,
        -0.01299997,  0.00461194],
       ...,
       [ 0.00267106, -0.00673723, -0.00825559, ...,  0.00824123,
         0.0119845 ,  0.01926042],
       [ 0.00041538, -0.00593797, -0.01522488, ...,  0.01309775,
        -0.00405748,  0.02241561],
       [-0.00826079, -0.00404636,  0.02323034, ...,  0.00429275,
         0.00746295, -0.02402202]])