In [136]:
import time
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


### Activation Functions

In [137]:
def sigmoid(Z):
    A = 1/(1+np.exp(-Z)) 
    return A


def softmax(z):
    # Subtract the maximum value in z for numerical stability
    shift_z = z - np.max(z, axis=0, keepdims=True)
    exp_z = np.exp(shift_z)
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def relu(Z):
    A = np.maximum(0,Z)
    return A

def tanh(x):
    return np.tanh(x)

def derivative_relu(Z):
    return np.array(Z > 0, dtype = 'float')

def derivative_tanh(x):
    return (1 - np.power(x, 2))

In [138]:
# initial_weights = pd.read_csv('Task_1/b/w-100-40-4.csv', header=None)
initial_weights = pd.read_csv('Task_1/a/w.csv', header=None)

In [139]:
# initial_biases 
initial_biases = pd.read_csv('Task_1/a/b.csv', header=None)

In [140]:
w1 = initial_weights.iloc[0:14, 1:].to_numpy().astype(np.float32).T
w2 = initial_weights.iloc[14:114, 1:41].to_numpy().astype(np.float32).T
w3 = initial_weights.iloc[114:, 1:5].to_numpy().astype(np.float32).T

In [141]:
w1.shape, w2.shape, w3.shape

((100, 14), (40, 100), (4, 40))

In [142]:
b1 = initial_biases.iloc[0, 1:].to_numpy().astype(np.float32).T
b2 = initial_biases.iloc[1, 1:41].to_numpy().astype(np.float32).T
b3 = initial_biases.iloc[2, 1:5].to_numpy().astype(np.float32).T

In [143]:
# reshape b1s
b1 = b1.reshape(-1,1)
b2 = b2.reshape(-1,1)
b3 = b3.reshape(-1,1)

### Initialize parameters

In [144]:
def initialize_parameters(layer_dims):
    
    parameters = {}
    L = len(layer_dims)            

    parameters['W' + str(1)] = w1
    parameters['b' + str(1)] = b1

    parameters['W' + str(2)] = w2
    parameters['b' + str(2)] = b2

    parameters['W' + str(3)] = w3
    parameters['b' + str(3)] = b3   
        
    return parameters

In [145]:

layer_dims = [14, 100, 40, 4]
params = initialize_parameters(layer_dims)

for l in range(1, len(layer_dims)):
    print("Shape of W" + str(l) + ":", params['W' + str(l)].shape)
    print("Shape of B" + str(l) + ":", params['b' + str(l)].shape, "\n")

Shape of W1: (100, 14)
Shape of B1: (100, 1) 

Shape of W2: (40, 100)
Shape of B2: (40, 1) 

Shape of W3: (4, 40)
Shape of B3: (4, 1) 



In [146]:
len(params)

6

### Forward Propagation

In [147]:
import numpy as np

def forward_propagation(X, parameters, activation):
    forward_cache = {}
    L = len(parameters) // 2

    # Ensure inputs are in np.float32
    forward_cache['A0'] = X.astype(np.float32)

    for l in range(1, L):
        # Matrix multiplication and addition in np.float32
        forward_cache['Z' + str(l)] = (parameters['W' + str(l)].astype(np.float32).dot(forward_cache['A' + str(l-1)]).astype(np.float32)
                                       + parameters['b' + str(l)].astype(np.float32))

        # Apply activation functions with np.float32 precision
        if activation == 'tanh':
            forward_cache['A' + str(l)] = np.tanh(forward_cache['Z' + str(l)]).astype(np.float32)
        else:
            forward_cache['A' + str(l)] = np.maximum(0, forward_cache['Z' + str(l)]).astype(np.float32)  # ReLU

    # Last layer calculation (output layer)
    forward_cache['Z' + str(L)] = (parameters['W' + str(L)].astype(np.float32).dot(forward_cache['A' + str(L-1)]).astype(np.float32)
                                   + parameters['b' + str(L)].astype(np.float32))

    # Use sigmoid or softmax for the final layer, depending on the output size
    if forward_cache['Z' + str(L)].shape[0] == 1:
        forward_cache['A' + str(L)] = 1 / (1 + np.exp(-forward_cache['Z' + str(L)])).astype(np.float32)  # Sigmoid
    else:
        exp_ZL = np.exp(forward_cache['Z' + str(L)] - np.max(forward_cache['Z' + str(L)], axis=0, keepdims=True)).astype(np.float32)  # Softmax
        forward_cache['A' + str(L)] = (exp_ZL / np.sum(exp_ZL, axis=0, keepdims=True)).astype(np.float32)

    return forward_cache['A' + str(L)], forward_cache


### Cost function

In [148]:
def compute_cost(AL, Y):
    m = Y.shape[1]
    
    if Y.shape[0] == 1:
        cost = (1./m) * (-np.dot(Y,np.log(AL).T) - np.dot(1-Y, np.log(1-AL).T))
    else:
        cost = -(1./m) * np.sum(Y * np.log(AL))
        
    cost = np.squeeze(cost)      # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    
    return cost

In [149]:
import numpy as np

def backward_propagation(AL, Y, parameters, forward_cache, activation):
    grads = {}
    L = len(parameters) // 2
    m = AL.shape[1]
    
    # Ensure AL and Y are in np.float32
    AL = AL.astype(np.float32)
    Y = Y.astype(np.float32)
    
    # Output layer gradient
    grads["dZ" + str(L)] = AL - Y
    grads["dW" + str(L)] = (1. / m * np.dot(grads["dZ" + str(L)], forward_cache['A' + str(L-1)].T)).astype(np.float32)
    grads["db" + str(L)] = (1. / m * np.sum(grads["dZ" + str(L)], axis=1, keepdims=True)).astype(np.float32)
    
    # Hidden layers gradients
    for l in reversed(range(1, L)):
        # Cast parameters and forward_cache to np.float32
        W_next = parameters['W' + str(l+1)].astype(np.float32)
        A_l = forward_cache['A' + str(l)].astype(np.float32)
        
        if activation == 'tanh':
            grads["dZ" + str(l)] = (np.dot(W_next.T, grads["dZ" + str(l+1)]) * derivative_tanh(A_l)).astype(np.float32)
        else:
            grads["dZ" + str(l)] = (np.dot(W_next.T, grads["dZ" + str(l+1)]) * derivative_relu(A_l)).astype(np.float32)
        
        grads["dW" + str(l)] = (1. / m * np.dot(grads["dZ" + str(l)], forward_cache['A' + str(l-1)].T)).astype(np.float32)
        grads["db" + str(l)] = (1. / m * np.sum(grads["dZ" + str(l)], axis=1, keepdims=True)).astype(np.float32)

    return grads


In [150]:
def update_parameters(parameters, grads, learning_rate):

    L = len(parameters) // 2 
    
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
        
    return parameters

In [151]:
def predict(X, y, parameters, activation):

    m = X.shape[1]
    y_pred, caches = forward_propagation(X, parameters, activation)
    
    if y.shape[0] == 1:
        y_pred = np.array(y_pred > 0.5, dtype = 'float')
    else:
        y = np.argmax(y, 0)
        y_pred = np.argmax(y_pred, 0)
    
    return np.round(np.sum((y_pred == y)/m), 2)

In [152]:
def model(X, Y, layers_dims, learning_rate = 0.03, activation = 'relu', num_iterations = 3000):

    np.random.seed(1)
    costs = []              
    
    parameters = initialize_parameters(layers_dims)
    # print("parameters", parameters)

    for i in range(0, num_iterations):

        AL, forward_cache = forward_propagation(X, parameters, activation)

        cost = compute_cost(AL, Y)
        costs.append(cost)

        grads = backward_propagation(AL, Y, parameters, forward_cache, activation)

        parameters = update_parameters(parameters, grads, learning_rate)
        

       
    return parameters,grads,costs

In [153]:
X=[-1, 1, 1, 1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1]
y = [0,0,0,1]

# make this suitable for input as NN
X_train = np.array(X).reshape(-1, 1)
Y_train = np.array(y).reshape(-1, 1)

print(X_train.shape, Y_train.shape)

layers_dims = [14, 100, 40, 4]
lr = 0.01
iters = 1

(14, 1) (4, 1)


In [154]:
parameters ,grads , costs= model(X_train, Y_train, layers_dims, learning_rate = lr, activation = 'relu', num_iterations = iters)

  cost = -(1./m) * np.sum(Y * np.log(AL))
  cost = -(1./m) * np.sum(Y * np.log(AL))


In [155]:
parameters

{'W1': array([[ 0.47143516,  0.29120535, -0.3195614 , ...,  1.571396  ,
         -0.47867355,  1.040699  ],
        [-1.1379457 ,  0.5135037 , -0.6730231 , ..., -1.352877  ,
         -0.47195536, -0.44259822],
        [ 1.432707  ,  0.5035918 ,  0.15699838, ...,  0.00726464,
         -1.7827756 ,  2.4786806 ],
        ...,
        [ 0.5774469 ,  0.8399987 , -0.41316873, ..., -2.0712829 ,
          2.056625  , -0.9128299 ],
        [-1.0452063 ,  0.2645943 , -1.5771248 , ..., -2.1685112 ,
         -1.7946546 , -0.07050197],
        [-0.59940195, -0.96401834, -0.23177716, ..., -0.19912186,
         -1.4801961 , -0.6633142 ]], dtype=float32),
 'b1': array([[-0.63515496],
        [-0.73405015],
        [-1.5015857 ],
        [-0.37922004],
        [-0.35299936],
        [ 0.1273844 ],
        [ 0.59761   ],
        [ 0.75405234],
        [-0.7267511 ],
        [-0.6326859 ],
        [-0.55704755],
        [-0.6869202 ],
        [ 0.20931008],
        [ 0.47860023],
        [ 0.24285674],
 

In [156]:
grads
# make all grads np.float32 type
# Convert all arrays to np.float32
grads_dict = grads

In [157]:
grads_dict['dW1'] = grads_dict['dW1'].T
grads_dict['dW2'] = grads_dict['dW2'].T
grads_dict['dW3'] = grads_dict['dW3'].T

In [158]:
grads_dict['dW1'].shape , grads_dict['dW2'].shape, grads_dict['dW3'].shape

((14, 100), (100, 40), (40, 4))

In [159]:
# Prepare lists to hold rows and keys
rows = []

# Iterate over the dictionary
for key, value in grads_dict.items():
    
    if(key[1] == 'W'):
        print(key)
        for row in value:
            print(row)
            rows.append(row)  # Append the 1D array (row)
        # Track which key this row belongs to

rows.reverse()
# Create DataFrame
df_dws = pd.DataFrame(rows)

df_dws.shape

dW3
[  0.         0.        40.288425 -40.288425]
[0. 0. 0. 0.]
[  0.         0.        51.398563 -51.398563]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[ 0.         0.         2.6145859 -2.6145859]
[ 0.         0.         0.5414607 -0.5414607]
[  0.         0.        14.588455 -14.588455]
[ 0.          0.          0.96590817 -0.96590817]
[  0.         0.        12.956019 -12.956019]
[  0.         0.        32.953403 -32.953403]
[  0.        0.       57.71453 -57.71453]
[ 0.         0.         3.8816016 -3.8816016]
[0. 0. 0. 0.]
[ 0.         0.         5.7341833 -5.7341833]
[0. 0. 0. 0.]
[ 0.         0.         4.5943832 -4.5943832]
[  0.         0.        41.362995 -41.362995]
[  0.        0.       17.51129 -17.51129]
[0. 0. 0. 0.]
[  0.         0.        36.190296 -36.190296]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[  0.         0.        19.493435 -19.493435]
[0. 0. 0. 0.]
[  0.         0.        21.197317 -21.197317]
[  0.         0.        29.160652 -29.160652]
[  0.         0.        40.050423 -40.050423]


(154, 100)

In [160]:
df_dws.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,5.303,0.0,1.029493,0.0,0.0,-8.007869,-0.62155,-8.262645,0.0,...,-4.95448,-8.113829,5.106945,10.410108,0.0,1.823631,4.100733,4.915877,2.378255,-8.752063
1,0.0,5.303,0.0,1.029493,0.0,0.0,-8.007869,-0.62155,-8.262645,0.0,...,-4.95448,-8.113829,5.106945,10.410108,0.0,1.823631,4.100733,4.915877,2.378255,-8.752063
2,0.0,-5.303,0.0,-1.029493,0.0,0.0,8.007869,0.62155,8.262645,0.0,...,4.95448,8.113829,-5.106945,-10.410108,0.0,-1.823631,-4.100733,-4.915877,-2.378255,8.752063
3,0.0,-5.303,0.0,-1.029493,0.0,0.0,8.007869,0.62155,8.262645,0.0,...,4.95448,8.113829,-5.106945,-10.410108,0.0,-1.823631,-4.100733,-4.915877,-2.378255,8.752063
4,0.0,5.303,0.0,1.029493,0.0,0.0,-8.007869,-0.62155,-8.262645,0.0,...,-4.95448,-8.113829,5.106945,10.410108,0.0,1.823631,4.100733,4.915877,2.378255,-8.752063


In [161]:
dbs = {
    'db1': grads_dict['db1'].flatten(),
    'db2': grads_dict['db2'].flatten(),
    'db3': grads_dict['db3'].flatten()
}



In [162]:
# Convert dbs to DataFrame and save to CSV
df_dbs = pd.DataFrame.from_dict(dbs, orient='index')
df_dbs.to_csv('./answer/db.csv', header=False, index=False)


df_dws.to_csv('./answer/dw.csv', header=False, index=False)

In [163]:
df_dbs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
db1,0.0,5.303,0.0,1.029493,0.0,0.0,-8.007869,-0.62155,-8.262645,0.0,...,-4.95448,-8.113829,5.106945,10.410108,0.0,1.823631,4.100733,4.915877,2.378255,-8.752063
db2,-0.453929,0.0,0.535562,0.0,0.0,-0.078542,2.995674,-0.703422,0.372048,2.064679,...,,,,,,,,,,
db3,0.0,0.0,1.0,-1.0,,,,,,,...,,,,,,,,,,
