## Problem 2

_Background functions_

In [113]:
#implement the activation function
def act_func(x, type):
    """
    Apply an activation function element-wise.

    Parameters:
    - x (numpy.ndarray): Input data (pre-activation values).
    - type (str): Name of the activation function, 'sigmoid' or 'ReLU'.
      The parameter name shadows the builtin ``type``; it is kept so that
      existing keyword callers keep working.

    Returns:
    - numpy.ndarray: Activation applied element-wise to x.

    Raises:
    - ValueError: If `type` is not one of the supported names.
    """
    if type == "sigmoid":
        # sigmoid(x) = 1 / (1 + exp(-x))
        return 1 / (1 + np.exp(-x))
    if type == "ReLU":
        # ReLU(x) = max(0, x)
        return np.maximum(0, x)
    # An unknown name used to fall through and return None, which only
    # surfaced later as an unrelated error; fail here with a clear message.
    raise ValueError(
        "Unknown activation function %r; expected 'sigmoid' or 'ReLU'" % (type,)
    )

    
#get H's [sigma(WX + b)] and the Z's [WX + b]
def feed_forward(X, nl, act:list, parameters: dict):
    """
    Run the input through every layer and record the intermediate values.

    For each layer l = 1..nl the pre-activation Z^l = W^l H^{l-1} + B^l and
    the activation H^l = act_func(Z^l, act[l-1]) are computed, starting from
    H^0 = X.

    Parameters:
    - X (numpy.ndarray): Input data, stored as "H0".
    - nl (int): Number of layers to evaluate.
    - act (list): Activation name for each layer; entry l-1 is used for layer l.
    - parameters (dict): Weights and biases under the keys "W<l>" and "B<l>".

    Returns:
    - dict: "H0" plus "Z<l>" and "H<l>" for every layer l = 1..nl.
    """
    cache = {"H0": X}
    previous = X
    for layer in range(1, nl + 1):
        idx = str(layer)
        # Affine step of this layer: Z = W . H_prev + B
        pre_activation = np.dot(parameters["W" + idx], previous) + parameters["B" + idx]
        cache["Z" + idx] = pre_activation
        # Non-linearity; this layer's output feeds the next iteration
        previous = act_func(pre_activation, act[layer - 1])
        cache["H" + idx] = previous
    return cache

### Part 1a

In [None]:
def MSE(y, y_pred, lambd: float, parameters: dict):
    """
    Calculate the squared-error loss with an L2 penalty (Ridge Regression).

    The value returned is
        (1/2) * ||y - y_pred||^2 + lambd * sum of squares of all parameters.

    Parameters:
    - y (numpy.ndarray): The true target values.
    - y_pred (numpy.ndarray): The predicted values.
    - lambd (float): The regularization parameter for the L2 penalty.
    - parameters (dict): The network parameters; every value in the dict
      (weights and biases alike) contributes to the penalty.

    Returns:
    - float: The loss with L2 penalty.

    NOTE(review): backward_propagation adds `lambd * W` to the gradients,
    which is the derivative of (lambd / 2) * ||W||^2, whereas the penalty
    here is lambd * ||W||^2 -- confirm which scaling is intended.
    """
    # The parameter arrays have different shapes, so each one is reduced to
    # its own sum of squares before the per-array totals are added up.
    l2_penalty = np.sum([np.sum(param**2) for param in parameters.values()])
    # Half the squared norm of the residual, plus the weighted penalty.
    loss = (1/2) * np.linalg.norm(y - y_pred)**2 + lambd * l2_penalty
    return loss

def Cross_entropy(y, y_pred, lambd: float, parameters: list):
    """
    TODO: compute the cross-entropy loss.

    Not implemented yet: the body consists of this docstring only, so every
    call currently returns None.

    The signature mirrors MSE(y, y_pred, lambd, parameters); presumably the
    arguments carry the same meaning (targets, predictions, L2 weight and
    network parameters) -- confirm when implementing.

    NOTE(review): `parameters` is annotated as a list here, while MSE reads
    it with `.values()` (a dict) -- align the annotation when implementing.
    """

### Part 1b

__To do__... math derivation

In [115]:
def MSE_grad_wrt_y_pred(y,y_pred):
    """
    Gradient of the squared-error term with respect to the prediction.

    For a loss of the form (1/2) * ||y - y_pred||^2 the derivative with
    respect to y_pred is simply the residual y_pred - y.

    Parameters:
    - y (numpy.ndarray): The true target values.
    - y_pred (numpy.ndarray): The predicted values (output of the last layer).

    Returns:
    - numpy.ndarray: The residual y_pred - y.
    """
    return y_pred - y


def Cross_entrpy_grad_wrt_y_pred(y,y_pred):
    """
    TODO: calculate the gradient of the cross-entropy loss with respect to
    the predicted values.

    Not implemented yet: the body consists of this docstring only, so every
    call currently returns None.

    NOTE(review): the original description mentioned the L2 penalty; the
    penalty does not depend on y_pred, so presumably it contributes nothing
    to this particular gradient -- confirm when implementing.
    NOTE(review): the function name is spelled "entrpy"; left as is because
    renaming would change the public name.
    """
    

### Part 2a

In [None]:
def act_derivative(type, Zl):
    """
    Compute the derivative of the activation function with respect to its input.

    Parameters:
    - type (str): Name of the activation function, 'sigmoid' or 'ReLU'.
      The parameter name shadows the builtin ``type``; it is kept so that
      existing keyword callers keep working.
    - Zl (numpy.ndarray): Pre-activation values at which to evaluate the
      derivative.

    Returns:
    - numpy.ndarray: Element-wise derivative evaluated at Zl.  For 'ReLU'
      this is an integer array of 0/1 values.

    Raises:
    - ValueError: If `type` is not one of the supported names.
    """
    if type == "sigmoid":
        # sigma'(z) = sigma(z) * (1 - sigma(z))
        sigmoid = 1 / (1 + np.exp(-Zl))
        return sigmoid * (1 - sigmoid)

    if type == "ReLU":
        # 1 where the input is strictly positive, 0 otherwise (including at 0)
        return np.where(Zl > 0, 1, 0)

    # An unknown name used to fall through and return None, which only
    # surfaced later as an unrelated error; fail here with a clear message.
    raise ValueError(
        "Unknown activation function %r; expected 'sigmoid' or 'ReLU'" % (type,)
    )

### Part 3a

In [111]:
#mine updated with doc string and comments
def backward_propagation(nl, nh, Y, lambd, p, forward, act:list):
    """
    Back-propagate the loss through the network and collect the gradients.

    The layout is the one produced by feed_forward: layer l computes
    Z^l = W^l H^{l-1} + B^l and H^l = sigma_l(Z^l), where sigma_l is named by
    act[l-1].  Arrays are assumed to be 2-D with one column per sample, so
    Z^l and H^l have shape (neurons in layer l, number of samples).

    Recursion used ('*' is element-wise, '@' is a matrix product):
        dLoss/dZ^L = dLoss/dH^L * sigma_L'(Z^L)
        dLoss/dZ^l = (W^{l+1}.T @ dLoss/dZ^{l+1}) * sigma_l'(Z^l)   for l < L
        dLoss/dW^l = dLoss/dZ^l @ H^{l-1}.T + lambd * W^l
        dLoss/dB^l = dLoss/dZ^l @ ones      + lambd * B^l   (sum over samples)

    Args:
    nl (int): Number of layers in the neural network.
    nh (list): Number of neurons per layer.  No longer read (sizes are taken
        from the arrays themselves); kept so existing calls keep working.
    Y (array): Observed values we try to predict.
    lambd (float): Regularization parameter.
    p (dict): Parameters under the keys "W<l>" (weights) and "B<l>" (biases).
    forward (dict): Forward-pass values "Z<l>" and "H<l>" (with "H0" the input).
    act (list): Activation name per layer; entry l-1 belongs to layer l.

    Returns:
    g (dict): For every layer l = 1..nl:
        "dLoss_dZ<l>"  gradient of the loss w.r.t. Z^l,
        "dLoss_dW<l>"  gradient of the loss w.r.t. W^l (incl. regularization),
        "dLoss_dB<l>"  gradient of the loss w.r.t. B^l (incl. regularization),
        "dZ<l>_dW<l>"  H^{l-1}, the factor multiplying dLoss/dZ^l for W^l,
        "dZ<l>_dB<l>"  column of ones that sums dLoss/dZ^l over the samples.
      The last layer's gradients are additionally stored under the legacy
      keys "dLoss_W<nl>" and "dLoss_B<nl>" that earlier versions wrote.
      The former "dZ_<l+1>_dZ<l>" entries are no longer produced: they were
      built from a shape-inconsistent product and are not needed here.
    """
    f = forward
    L = nl
    g = {}

    # Walk from the output layer back to the first layer.
    for l in range(L, 0, -1):
        layer = str(l)

        # feed_forward activates layer l with act[l-1]; use the same entry.
        slope = act_derivative(act[l - 1], f["Z" + layer])

        if l == L:
            # Output layer: start from the loss gradient w.r.t. the prediction.
            upstream = MSE_grad_wrt_y_pred(Y, f["H" + layer])
        else:
            # Hidden layer: pull the gradient back through W^{l+1}.
            upstream = np.dot(p["W" + str(l + 1)].T, g["dLoss_dZ" + str(l + 1)])

        # The activation acts element-wise, so its derivative multiplies
        # element-wise rather than through a matrix product.
        g["dLoss_dZ" + layer] = upstream * slope

        # Local derivatives of Z^l w.r.t. its own weights and biases.
        g["dZ" + layer + "_dW" + layer] = f["H" + str(l - 1)]
        # One entry per sample (column), so the product below sums over samples.
        g["dZ" + layer + "_dB" + layer] = np.ones((g["dLoss_dZ" + layer].shape[1], 1))

        g["dLoss_dW" + layer] = np.dot(g["dLoss_dZ" + layer],
                                       g["dZ" + layer + "_dW" + layer].T) \
                                + lambd * p["W" + layer]  # regularization term
        g["dLoss_dB" + layer] = np.dot(g["dLoss_dZ" + layer],
                                       g["dZ" + layer + "_dB" + layer]) \
                                + lambd * p["B" + layer]  # regularization term

    # Legacy aliases: the last layer used to be stored without the extra "d".
    g["dLoss_W" + str(L)] = g["dLoss_dW" + str(L)]
    g["dLoss_B" + str(L)] = g["dLoss_dB" + str(L)]

    return g


### Part 3b

In [None]:
def parameter(nl, nh):
    """
    Build the initial weights and biases for every layer of the network.

    Parameters:
    - nl (int): Number of layers in the neural network.
    - nh (list): Neurons per layer; nh[n-1] and nh[n] are the input and
      output sizes of layer n.

    Returns:
    - dict: "W<n>" with shape (nh[n], nh[n-1]) drawn from np.random.rand
      (uniform on [0, 1)) and "B<n>" as a zero column of shape (nh[n], 1),
      for n = 1..nl.
    """
    initial = {}
    for layer in range(1, nl + 1):
        fan_out, fan_in = nh[layer], nh[layer - 1]
        # Random weights, one row per neuron of this layer
        initial[f"W{layer}"] = np.random.rand(fan_out, fan_in)
        # Biases start at zero
        initial[f"B{layer}"] = np.zeros((fan_out, 1))
    return initial

In [None]:
# Example network used to try out the functions above: 5 layers after the input.
nl = 5
# A single input sample stored as a (5, 1) column vector.
X = np.array([0.1, -0.2, 0.3, -0.4, 0.5]).reshape(-1, 1)
# Neurons per layer: the input size first, then one entry per layer
# (the last layer has a single neuron).
nh = [X.shape[0], 5, 4, 3, 5, 1]
# Activation name for each of the nl layers, in order.
act = ["ReLU", "sigmoid", "ReLU", "sigmoid", "ReLU"] 


In [6]:
# Scratch check of np.where thresholding: negative entries map to 0 and all
# others (including 0) map to 1.  Note that act_derivative tests `Zl > 0`
# instead, which maps an input of exactly 0 to 0.
sigma = np.array([1, 2, 3, 0, 1 , -1, -2 , -34])
np.where(sigma < 0, 0, 1)

array([1, 1, 1, 1, 1, 0, 0, 0])

In [15]:
# Scratch check of the L2 penalty on a flat vector:
# 0.05 * (1 + 1 + 9 + 16) = 1.35.  This rebinds the name `parameters`.
parameters = np.array([1, 1, 3, 4])

#np.sum(parameters**2)

#np.square(parameters)
#0.05*np.sum(parameters**2)
0.05*np.sum(np.square(parameters))

1.35

In [22]:
# Ragged list (two 2x4 matrices and a length-2 vector) used to try out
# summing arrays of different shapes.
parameters = [
    np.array([[1, 1, 3, 4],  # First matrix
              [1, 2, 3, 4]]),
    np.array([[2, 1, 3, 4],  # Second matrix
              [1, 5, 3, 4]]),
    np.array([1, 2])         # Vector
]

# Reduce each array on its own, then add the totals: 19 + 23 + 3 = 45.
np.sum([np.sum(param) for param in parameters])
#for param in parameters:
#    print(np.sum(param))

45

In [18]:
# Summing the ragged list directly fails (see the ValueError in the output
# below): the differently shaped arrays cannot be combined into one array,
# which is why each array is summed individually instead.
np.sum(parameters)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: could not broadcast input array from shape (2,4) into shape (2,)

In [84]:
# Print the shape of every parameter array.  `parameters` must be a dict at
# this point -- presumably the one returned by parameter(nl, nh), given the
# shapes shown in the output; confirm against the notebook state.
for parm in parameters.values():
    print(parm.shape)

(5, 5)
(5, 1)
(4, 5)
(4, 1)
(3, 4)
(3, 1)
(5, 3)
(5, 1)
(1, 5)
(1, 1)


In [85]:
# Iterating the dict directly yields its keys ("W1", "B1", ...), not the arrays.
for parm in parameters:
    print(parm)

W1
B1
W2
B2
W3
B3
W4
B4
W5
B5


In [89]:
# Number of neurons in the last layer.
nh[-1]

1

In [95]:
# Shape check: a column of ones with one row per neuron of the second-to-last layer.
np.ones(nh[-2]).reshape(-1,1).shape

(5, 1)

In [99]:
# reversed(range(1, 5)) walks 4, 3, 2, 1 -- the order in which the backward
# pass loop visits the layers below the last one when L = 5.
for l in reversed(range(1, 5)):
    print(l)

4
3
2
1


In [97]:
# Scratch list; not used by any other cell visible here.
a = [1, 2, 3, 4, 5]

In [105]:
# Same column of ones, built directly from a (rows, 1) shape tuple.
np.ones((nh[-2], 1))

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [108]:
# Shape check for the column of ones built from a shape tuple.
np.ones((nh[-2], 1)).shape

(5, 1)

In [109]:
#mine

def backward_propagation(nl, nh, Y, lambd, parameters, forward, type:list):
    """
    Back-propagate the loss through the network and collect the gradients.

    NOTE(review): another backward_propagation (with "dLoss_dZ"/"dLoss_dW"
    style keys) is defined earlier in this file; whichever definition is
    executed last is the one in effect.

    The layout is the one produced by feed_forward: layer l computes
    Z^l = W^l H^{l-1} + B^l and H^l = sigma_l(Z^l), where sigma_l is named by
    type[l-1].  Arrays are assumed to be 2-D with one column per sample, so
    Z^l and H^l have shape (neurons in layer l, number of samples).

    Recursion used ('*' is element-wise, '@' is a matrix product):
        dLoss/dZ^L = dLoss/dH^L * sigma_L'(Z^L)
        dLoss/dZ^l = (W^{l+1}.T @ dLoss/dZ^{l+1}) * sigma_l'(Z^l)   for l < L
        dLoss/dW^l = dLoss/dZ^l @ H^{l-1}.T + lambd * W^l
        dLoss/dB^l = dLoss/dZ^l @ ones      + lambd * B^l   (sum over samples)

    Args:
    nl (int): Number of layers in the neural network.
    nh (list): Number of neurons per layer.  No longer read (sizes are taken
        from the arrays themselves); kept so existing calls keep working.
    Y (array): Observed values we try to predict.
    lambd (float): Regularization parameter.
    parameters (dict): Parameters under the keys "W<l>" and "B<l>".
    forward (dict): Forward-pass values "Z<l>" and "H<l>" (with "H0" the input).
    type (list): Activation name per layer; entry l-1 belongs to layer l.
        The name shadows the builtin ``type``; kept for keyword callers.

    Returns:
    g (dict): For every layer l = 1..nl:
        "dLoss_Z<l>"  gradient of the loss w.r.t. Z^l,
        "dLoss_W<l>"  gradient of the loss w.r.t. W^l (incl. regularization),
        "dLoss_B<l>"  gradient of the loss w.r.t. B^l (incl. regularization),
        "dZ<l>_W<l>"  H^{l-1}, the factor multiplying dLoss/dZ^l for W^l,
        "dZ<l>_B<l>"  column of ones that sums dLoss/dZ^l over the samples.
      The former "dZ_<l+1>_Z<l>" entries are no longer produced: they were
      built from a shape-inconsistent product and are not needed here.
    """
    p = parameters
    f = forward
    L = nl
    g = {}

    # Walk from the output layer back to the first layer.
    for l in range(L, 0, -1):
        layer = str(l)

        # feed_forward activates layer l with entry l-1; use the same entry.
        slope = act_derivative(type[l - 1], f["Z" + layer])

        if l == L:
            # Output layer: start from the loss gradient w.r.t. the prediction.
            upstream = MSE_grad_wrt_y_pred(Y, f["H" + layer])
        else:
            # Hidden layer: pull the gradient back through W^{l+1}.
            upstream = np.dot(p["W" + str(l + 1)].T, g["dLoss_Z" + str(l + 1)])

        # The activation acts element-wise, so its derivative multiplies
        # element-wise rather than through a matrix product.
        g["dLoss_Z" + layer] = upstream * slope

        # Local derivatives of Z^l w.r.t. its own weights and biases.
        g["dZ" + layer + "_W" + layer] = f["H" + str(l - 1)]
        # One entry per sample (column), so the product below sums over samples.
        g["dZ" + layer + "_B" + layer] = np.ones((g["dLoss_Z" + layer].shape[1], 1))

        g["dLoss_W" + layer] = np.dot(g["dLoss_Z" + layer],
                                      g["dZ" + layer + "_W" + layer].T) \
                               + lambd * p["W" + layer]  # regularization term
        g["dLoss_B" + layer] = np.dot(g["dLoss_Z" + layer],
                                      g["dZ" + layer + "_B" + layer]) \
                               + lambd * p["B" + layer]  # regularization term
    return g