# Creating a simple NN Model

In [1]:
##Activation Function
import numpy as np
np.random.seed(1)
X=np.random.randn(2,3)
print(X)

[[ 1.62434536 -0.61175641 -0.52817175]
 [-1.07296862  0.86540763 -2.3015387 ]]


In [None]:
#The sigmoid function takes any real number as input and transforms it into a value between 0 and 1.
#The output of the sigmoid function, often denoted as σ(z) or simply s, represents the probability
#that a given input belongs to the positive class in a binary classification problem. When z is 
#positive and large, σ(z) approaches 1. When z is negative and large, σ(z) approaches 0. 

In [2]:
#1.Sigmoid Function
def sigmoid(z):
        s=1/(1+np.exp(-z))
        return s

In [3]:
s=sigmoid(X)  #sigmoid function will be applied element-wise to each element of the array.
print(s)     #contains the sigmoid values computed from the elements of matrix X.

[[0.83539354 0.35165864 0.3709434 ]
 [0.25483894 0.70378922 0.09099561]]


In [None]:
#The hyperbolic tangent function, often denoted as tanh⁡(z), is another activation function that
#maps input values to the range between -1 and 1.
#useful when the data distribution has both positive and negative values and centerd around zero.

In [4]:
#2. tanh Function
def tan(z):    
    s = (np.exp(z)-np.exp(-z))/(np.exp(z)+np.exp(-z))
    return s

In [5]:
t=tan(X)
print(t)

[[ 0.92525207 -0.5453623  -0.48398233]
 [-0.79057703  0.69903334 -0.98015695]]


In [6]:
#3. relu Function
def relu(z):
    s = np.maximum(0,z)
    return s

In [None]:
#if the input value z is positive, the output value s will be equal to z; otherwise, it will be 
#set to 0. The np.maximum function from the NumPy library is used to element-wise compare each value
#in the input array z with 0 and take the maximum.

#It's known for its simplicity and effectiveness in addressing the vanishing gradient problem. When 
#the input is positive, ReLU maintains the gradient as 1, which prevents the gradients from vanishing 
#during backpropagation.

In [8]:
r=relu(X)
print(r)

[[1.62434536 0.         0.        ]
 [0.         0.86540763 0.        ]]


In [None]:
#Leaky relu:-

#if the input value z is positive, the output value s will be equal to z; if z is negative, the 
#output value s will be 0.01z. The np.maximum function from the NumPy library is used to perform
#the element-wise comparison and take the maximum value.

In [None]:
# It aims to address the issue of "dying ReLUs," where neurons can become inactive during training by 
#setting the output to 0 for all inputs. The leaky parameter (0.01 in this case) introduces a small 
#negative slope for the negative values of z, allowing some gradient to flow even for negative inputs.
#This helps prevent neurons from becoming completely inactive and can help alleviate vanishing 
#gradient issues to some extent.

In [16]:
#4. leaky relu Function
def lrelu(z):
    s = np.maximum(0.01*z,z)   #0.01 is the "leakiness" parameter
    return s

In [29]:
ler=lrelu(X)
print(ler)

[[1.62434536 0.01       0.01      ]
 [0.01       0.86540763 0.01      ]]


In [44]:
#Softmax Function
def softmax(vector):          
    e = np.exp(vector)    #compute the exponential function element-wise.
    s= e / e.sum()     #computes the softmax values by dividing each exponential value by the sum of all exponential values.
    return s

In [None]:
#s is an array where each element represents the probability of the corresponding element in the input
#vector.
#commonly used in classification tasks, especially in multi-class classification scenarios. It takes 
#a vector of raw scores (logits) and transforms them into a probability distribution over multiple 
#classes. The softmax function ensures that the output values are between 0 and 1 and that they sum 
#up to 1, making them suitable as probabilities. The class with the highest probability becomes the 
#predicted class.

In [49]:
sof=softmax(X)
print(sof)

[[0.56232214 0.06009779 0.06533695]
 [0.03789279 0.26325869 0.01109163]]


In [None]:
#The derivative helps propagate gradients backward through the network to adjust the weights and 
#biases during training.

In [18]:
##derivatives of Activation Function
#1.Derivatives of Sigmoid Function
def dsig(s):
    das = (s)*(1-s)
    return das

In [20]:
ds=dsig(s)
print(ds)

[[0.13751118 0.22799484 0.23334439]
 [0.18989606 0.20846995 0.08271541]]


In [None]:
#derivative of the tanh function is used in backpropagation during the training of neural networks.
#It helps propagate gradients backward through the network to adjust the weights and biases during 
#training.
#In some contexts, the tanh function and its derivative can be advantageous over the sigmoid function
#because the tanh function is zero-centered and can produce both positive and negative outputs. This 
#can lead to more balanced gradients during training and potentially faster convergence.

In [66]:
#2.Derivatives of tanh Function
def dthan(s):
    dat = (1-s**2)
    return dat

In [68]:
dt=dthan(t)
print(t)

[[ 0.92525207 -0.5453623  -0.48398233]
 [-0.79057703  0.69903334 -0.98015695]]


In [None]:
#The derivative of ReLU is 1 for positive inputs and 0 for non-positive inputs. This line uses a 
#comparison operation (s > 0) to create a boolean array where each element indicates whether the 
#corresponding element in the input s is greater than 0. The np.int64 function is used to convert 
#the boolean array into an integer array where True becomes 1 and False becomes 0.

#when the input is non-positive, the derivative is 0, causing the gradient to vanish. This behavior
#can lead to "dead" neurons that don't update during training, which is a challenge associated with 
#ReLU.

In [24]:
#3.Derivatives of Relu Function
def drelu(s):
    dar=(np.int64(s>0))
    return dar

In [27]:
dr = drelu(r)
print(dr)

[[1 0 0]
 [0 1 0]]


In [None]:
#dlrelu that takes two arguments: s, which is the output of the Leaky ReLU function, and alpha, 
#which is a parameter that determines the slope of the function for negative inputs. By default, 
#alpha is set to 0.01.

# Derivatives of leaky Relu is 1 for positive inputs and alpha for non-positive inputs.

In [40]:
#4.Derivatives of leaky Relu Function
#The np.where function is used to create an array where each element is 1 if the corresponding 
#element in the input s is greater than 0, and alpha if the element is non-positive.

def dlrelu(s,alpha=0.01):
    dal=np.where(s >0, 1, alpha)
    return dal

In [42]:
dler=dlrelu(ler,0.01)
print(dler)

array([[1., 1., 1.],
       [1., 1., 1.]])

Basic NN Model (layers-2,4 Nodes in each Layers,1 out put layers)


In [23]:
##1.Input X and Y
np.random.seed(1)
X=np.random.randn(2,3)
Y=(np.random.randn(1,3)>0) #generates a 1x3 matrix of random numbers, converts these values into a
#boolean matrix by checking if each value is greater than 0
#    Y is a NumPy array with shape (1, 3), where:
        #The first dimension (rows) corresponds to the number of classes (binary in this case).
        #The second dimension (columns) corresponds to the number of samples.
#In a binary classification task, such as this one, you typically have only one class label for 
#each sample

In [24]:
print(X)  #nx=2 and m=3
print()
print(X.shape)
print()
print(Y)

[[ 1.62434536 -0.61175641 -0.52817175]
 [-1.07296862  0.86540763 -2.3015387 ]]

(2, 3)

[[ True False  True]]


In [25]:
##2.Define shape
#three arguments: X, which represents the input data, Y, which represents the labels, and layers, 
#which is the number of units in the hidden layer.

def layer_sizes(X, Y,layers):
    n_x=X.shape[0]
    n_h=layers
    n_y=Y.shape[0]
    return (n_x,n_h,n_y)

#returns a tuple containing the calculated values of n_x, n_h, and n_y, which correspond to the 
#number of features, number of units in the hidden layer, and number of classes (labels

In [26]:
n_x,n_h,n_y = layer_sizes(X, Y, 4)

In [27]:
print(n_x,n_h,n_y)

2 4 1


In [56]:
##3. Initialize the parameters
def initialize_parameters(n_x, n_h, n_y):
    np.random.seed(2) # we set up a seed so that your output matches ours although the initialization is random.
    
    W1 = np.random.randn(n_h,n_x)*0.01  ##n_h=4,n_x=2
    print(W1.shape)
    b1 = np.zeros((n_h,1))          ##n_h=4
    W2 = np.random.randn(n_y,n_h)*0.01   ##n_y=1,n_h=4
    b2 = np.zeros((n_y,1))            ##n_y=1

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
   
    return parameters


In [57]:
parameters=initialize_parameters(n_x, n_h, n_y)
print(parameters) 

#initialized weight matrices and bias vectors for both layers of the neural network.
#These parameters will be used as the initial starting point for training the neural network using 
#optimization algorithms like gradient descent.

(4, 2)
{'W1': array([[-0.00416758, -0.00056267],
       [-0.02136196,  0.01640271],
       [-0.01793436, -0.00841747],
       [ 0.00502881, -0.01245288]]), 'b1': array([[0.],
       [0.],
       [0.],
       [0.]]), 'W2': array([[-0.01057952, -0.00909008,  0.00551454,  0.02292208]]), 'b2': array([[0.]])}


In [61]:
##4. Forward Propagation
#two arguments: X, which represents the input data, and parameters, which is a dictionary containing 
#the weight matrices and bias vectors for both layers of the neural network.

#extract the weight matrices and bias vectors for both layers from the parameters dictionary.
def forward_propagation(X, parameters):
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]

    Z1 = np.dot(W1,X)+b1 #Z1 is the linear transformation of the input X using the weights W1 and bias b1
    A1 = np.tanh(Z1)
    Z2 = np.dot(W2,A1)+b2 #linear transformation of the hidden layer activation A1 using the weights W2 and bias b2
    A2 = sigmoid(Z2)

#store intermediate values computed during forward propagation.
    cache = {"Z1": Z1,
             "A1": A1,
             "Z2": Z2,
             "A2": A2}
    
    return A2, cache
#returns the final output A2 (the predicted values) and intermediate value
#These intermediate values are essential for backpropagation during the training process.

In [62]:
A2, cache=forward_propagation(X, parameters)
print(A2, cache)

[[0.5002307  0.49985831 0.50023963]] {'Z1': array([[-0.00616586,  0.00206261,  0.0034962 ],
       [-0.05229879,  0.02726335, -0.02646868],
       [-0.0200999 ,  0.00368691,  0.02884556],
       [ 0.02153008, -0.01385323,  0.02600471]]), 'A1': array([[-0.00616578,  0.0020626 ,  0.00349619],
       [-0.05225116,  0.02725659, -0.0264625 ],
       [-0.0200972 ,  0.00368689,  0.02883756],
       [ 0.02152676, -0.01385234,  0.02599885]]), 'Z2': array([[ 0.00092281, -0.00056678,  0.00095853]]), 'A2': array([[0.5002307 , 0.49985831, 0.50023963]])}


In [63]:
##5.Compute Cost
#calculates the cost (loss) of the neural network's predictions compared to the actual labels.
def compute_cost(A2, Y, parameters):
    m=Y.shape[1]     #training examples
    logprobs = np.multiply(np.log(A2),Y)+np.multiply(np.log(1-A2),(1-Y))
    cost = -np.sum(logprobs)/m
    cost = float(np.squeeze(cost)) #converts the cost value to a scalar by using the np.squeeze function to remove any unnecessary dimensions. 
    return cost

#This cost value represents how well the network's predictions match the true labels, and it is 
#used as a measure of how well the network is performing during training.

#negative sign in the NLL formula makes it convenient for optimization algorithms. Most optimization 
#algorithms are designed to minimize functions, so the negative sign transforms the problem into a 
#minimization task.

In [None]:
#cost=−m1​∑i=1m​(yi​log(y^​i​)+(1−yi​)log(1−y^​i​))

In [64]:
compute_cost(A2, Y, parameters)

0.6927392477233995

In [73]:
##6. Backward Propagation
#extract the weight matrices and bias vectors
#extract the activation values A1 and A2 from the cache dictionary

def BWP(parameters,cache,X,Y):
    W1=parameters["W1"]
    b1=parameters["b1"]
    W2=parameters["W2"]
    b2=parameters["b2"]
    A1=cache["A1"]
    A2=cache["A2"]

    m=Y.size   #number of samples in the dataset by accessing the total number of elements in the Y array.

#derivative of the cost with respect to Z2, which is the difference between the predicted output A2 
#and the actual labels Y
    dZ2=A2-Y
    dW2=np.dot(dZ2,A1.T)/m  #gradient of the weights
    db2=np.sum(dZ2,axis=1,keepdims=True)/m    #gradient of the bias
    dA1=np.dot(W2.T,dZ2)
    dZ1=dA1*(1-np.power(A1,2))
    dW1=np.dot(dZ1,X.T)/m
    db1=np.sum(dZ1,axis=1,keepdims=True)/m

    grades={"dW2":dW2,"db2":db2,"dW1":dW1,"db1":db1}
    #contains the calculated gradients for the weights and biases of both layers.
    return grades

#gradients are crucial for updating the parameters during the optimization process, such as gradient 
#descent.

In [75]:
grades=BWP(parameters,cache,X,Y)
print(grades)

{'dW2': array([[ 0.00078841,  0.01765429, -0.00084166, -0.01022527]]), 'db2': array([[-0.16655712]]), 'dW1': array([[ 0.00301023, -0.00747267],
       [ 0.00257968, -0.00641288],
       [-0.00156892,  0.003893  ],
       [-0.00652037,  0.01618243]]), 'db1': array([[ 0.00176201],
       [ 0.00150995],
       [-0.00091736],
       [-0.00381422]])}


In [76]:
##7.Update Grades
def update(parameters,grades,lr=0.01):
    W1=parameters["W1"]
    b1=parameters["b1"]
    W2=parameters["W2"]
    b2=parameters["b2"]

    dW1=grades["dW1"]
    db1=grades["db1"]
    dW2=grades["dW2"]
    db2=grades["db2"]


    W1=W1-lr*dW1  #core step of the optimization process, here the parameters are adjusted to minimize the cost function.
    b1=b1-lr*db1
    W2=W2-lr*dW2
    b2=b2-lr*db2

    parameters={"W1":W1,"b1":b1,"W2":W2,"b2":b2}

    return parameters

In [78]:
parameters=update(parameters,grades,lr=0.01)
print(parameters)

{'W1': array([[-0.00419768, -0.00048794],
       [-0.02138776,  0.01646684],
       [-0.01791867, -0.0084564 ],
       [ 0.00509402, -0.01261471]]), 'b1': array([[-1.76201370e-05],
       [-1.50994736e-05],
       [ 9.17363463e-06],
       [ 3.81421789e-05]]), 'W2': array([[-0.01058741, -0.00926662,  0.00552296,  0.02302433]]), 'b2': array([[0.00166557]])}


In [82]:
##8.creat own NN
def NN(X,Y,layers,itr=10000,print_cost=False):
    np.random.seed(3)
    n_x=layer_sizes(X, Y,layers)[0]
    n_y=layer_sizes(X, Y,layers)[2]
    n_h=layer_sizes(X, Y,layers)[1]

    parameters=initialize_parameters(n_x, n_h, n_y)

    for i in range (0,itr):
        A2, cache = forward_propagation(X, parameters)
        cost=compute_cost(A2, Y, parameters)
        grades=BWP(parameters,cache,X,Y)
        parameters=update(parameters,grades,lr=0.01)

        if print_cost and i%100==0:
            print("cost % i:%f" %(i,cost))

    return parameters

In [83]:
NN(X,Y,4,itr=1000,print_cost=True)

cost  0:0.692739
cost  100:0.669215
cost  200:0.649677
cost  300:0.619729
cost  400:0.555289
cost  500:0.447120
cost  600:0.328614
cost  700:0.234149
cost  800:0.169726
cost  900:0.127562


{'W1': array([[-0.21830665,  0.43559748],
        [-0.38633057,  0.73732683],
        [ 0.11624791, -0.28270074],
        [ 0.39639168, -0.77368548]]),
 'b1': array([[ 0.03393657],
        [ 0.09586648],
        [-0.01517238],
        [-0.10360464]]),
 'W2': array([[-0.52988031, -1.04262947,  0.31610115,  1.11019047]]),
 'b2': array([[0.34409584]])}

In [None]:
#The vanishing gradient problem is a challenge that arises during the training of deep neural networks,
#particularly when using gradient-based optimization algorithms like backpropagation. It occurs when 
#the gradients of the loss function with respect to the network's parameters (weights and biases) 
#become extremely small as they are propagated backward through the layers of the network. As a result,
#the weights of the earlier layers receive very small updates during training, leading to slow 
#convergence or even stagnation in learning.
#Sigmoid and tanh activation functions are susceptible to the vanishing gradient problem because:

#In the sigmoid function, the gradient approaches zero as the input becomes very large or very small.
 #In the tanh function, the gradient also becomes small for large positive and negative inputs.

#This can result in the early layers of the network learning very slowly or not learning at all. 
#As a consequence, the network's ability to capture complex patterns in the data decreases, limiting 
#its overall performance.