**Install the following libraries (if not already installed) so that the code runs smoothly**

In [1]:
# %pip install scikit-learn pandas numpy tqdm




[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


### Import required libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston , load_iris, load_digits
from sklearn.preprocessing import Normalizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from tqdm import trange 

## Question 1 

1. Matrix Multiplication Layer
2. Bias Addition Layer
3. Mean Squared Loss Layer
4. Softmax Activation
5. Sigmoid Activation
6. Cross Entropy Loss Layer
7. Linear Activation
8. tanh Activation
9. ReLU Activation

**1. Matrix Multiplication Layer**

In [3]:
class MultiplicationLayer:
    """
    Matrix-multiplication layer: Z = XW.

    Documented inputs : X in R^(1xd), W in R^(dxK)
    forward()  : stores Z = np.dot(X, W) on the instance.
    backward() : stores the local derivatives used by the caller's chain rule
                 dZ_dW       = X.T   (dZ/dW)
                 dZ_daZ_prev = W     (dZ/dX)
    """

    def __init__(self, X, W):
        self.X, self.W = X, W

    def __str__(self):
        return " An instance of Muliplication Layer."

    def forward(self):
        # Result is kept on the instance; nothing is returned.
        product = np.dot(self.X, self.W)
        self.Z = product

    def backward(self):
        # dZ/dX is W itself (same object, not a copy).
        self.dZ_daZ_prev = self.W
        # dZ/dW is the transposed input.
        self.dZ_dW = self.X.T

---

**2. Bias Addition Layer**

In [4]:
class BiasAdditionLayer:
    """
    Bias-addition layer: Z <- Z + B.

    Documented inputs : Z in R^(1xK) (output of the multiplication layer), B in R^(1xK)
    forward()  : overwrites self.Z with self.Z + self.B.
    backward() : stores dZ_dB, a KxK identity matrix where K = B.shape[1].
    """

    def __init__(self, Z: np.ndarray, bias: np.ndarray):
        self.Z = Z
        self.B = bias

    def __str__(self):
        return "An instance of Bias Addition Layer."

    def forward(self):
        # self.Z is replaced, so calling forward() again adds the bias again.
        shifted = self.Z + self.B
        self.Z = shifted

    def backward(self):
        n_outputs = self.B.shape[1]
        self.dZ_dB = np.identity(n_outputs)

---

**3. Mean Squared Loss Layer**

In [5]:
class MeanSquaredLossLayer:
    """
    Mean squared error loss layer.

    Documented inputs : Y in R^(1xK), Y_hat in R^(1xK), where K is the dimension of the output layer.
    forward()  : L = mean((Y_hat - Y)**2), averaged over every element.
    backward() : dL_daZ = (2/n) * (Y_hat - Y).T, where n is the number of elements averaged in
                 forward(). The prediction is stored as `aZ` (output of the previous activation
                 layer), hence the name dL_daZ rather than dL/dY_hat.
    """

    def __init__(self, Y: np.ndarray, Y_hat: np.ndarray):
        self.Y = Y
        self.aZ = Y_hat

    def __str__(self):
        return "An instance of Mean Squared Loss Layer"

    def forward(self):
        self.L = np.mean((self.aZ - self.Y) ** 2)

    def backward(self):
        residual = self.aZ - self.Y
        # Divide by the element count that np.mean used in forward(), not by len(Y):
        # len() is the row count (1 for a 1xK target), which would make the gradient
        # K times too large whenever K > 1.
        self.dL_daZ = (2 / residual.size) * residual.T

---

**4. Soft Max Activation**

In [6]:
class SoftMaxActivation:
    """
    Softmax activation layer.

    Documented input : a numpy array Z in R^(1xK)
    forward()  : aZ = softmax(Z), computed row-wise (axis=1).
    backward() : daZ_dZ = diag(aZ) - aZ.T @ aZ, a KxK matrix for a 1xK activation.
    """

    def __init__(self, Z):
        self.Z = Z

    def __str__(self):
        return "An instance of Softmax Activation Layer"

    def forward(self):
        self.aZ = self.softmax(self.Z)

    def backward(self):
        probs = self.aZ
        outer = probs.T @ probs                      # (K, K) for a 1xK row
        self.daZ_dZ = np.diag(probs.reshape(-1)) - outer

    @staticmethod
    def softmax(Z: np.ndarray):
        # Subtract the row maximum before exponentiating so large inputs do not overflow.
        shifted = Z - np.max(Z, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

---

**5. Sigmoid Activation**

In [7]:
class SigmoidActivation:
    """
    Sigmoid activation layer.

    Input : a numpy array Z (flattened internally when building the derivative).
    forward()  : aZ = sigmoid(Z), element-wise.
    backward() : daZ_dZ = diagonal matrix whose i-th entry is aZ_i * (1 - aZ_i).
    """

    def __init__(self, Z):
        self.Z = Z

    def __str__(self):
        return "An instance of Sigmoid Activation Layer"

    def forward(self):
        self.aZ = self.sigmoid(self.Z)

    def backward(self):
        activations = self.aZ
        slopes = (activations * (1 - activations)).reshape(-1)
        self.daZ_dZ = np.diag(slopes)

    @staticmethod
    def sigmoid(Z: np.ndarray):
        denominator = 1 + np.exp(-Z)
        return 1. / denominator

---

**6. Cross Entropy Loss Layer**

In [8]:
class CrossEntropyLossLayer:
    """
    Cross-entropy loss layer.

    Documented inputs : Y in R^(1xK), Y_pred in R^(1xK), where K is the dimension of the output layer.
    forward()  : L = -sum(Y * log(Y_pred + epsilon))
    backward() : dL_daZ = -(Y / (Y_pred + epsilon)).T  (element-wise division, then transposed)
    epsilon is a small constant added to Y_pred before the log / division.
    """

    def __init__(self, Y, Y_pred):
        self.Y = Y
        self.aZ = Y_pred
        self.epsilon = 1e-40

    def __str__(self):
        return "An instance of Cross Entropy Loss Layer"

    def forward(self):
        log_probs = np.log(self.aZ + self.epsilon)
        self.L = -np.sum(self.Y * log_probs)

    def backward(self):
        ratio = self.Y / (self.aZ + self.epsilon)   # element-wise
        self.dL_daZ = -1 * ratio.T

**7. Linear Activation**

In [9]:
class LinearActivation:
    """
    Linear (identity) activation layer.

    Documented input : Z in R^(1xn)
    forward()  : aZ = Z (the same object, no copy).
    backward() : daZ_dZ = identity matrix of size Z.shape[1].
    """

    def __init__(self, Z):
        self.Z = Z

    def __str__(self):
        return "An instance of Linear Activation."

    def forward(self):
        self.aZ = self.Z

    def backward(self):
        width = self.Z.shape[1]
        self.daZ_dZ = np.identity(width)

**8. tanh Activation**

In [10]:
class tanhActivation:
    """
    Hyperbolic-tangent activation layer.

    Documented input : a numpy array Z in R^(1xK)
    forward()  : aZ = tanh(Z), element-wise.
    backward() : daZ_dZ = diag(1 - aZ**2), a KxK matrix.
    """

    def __init__(self, Z):
        self.Z = Z

    def __str__(self):
        return "An instance of tanhActivation class."

    def forward(self):
        self.aZ = np.tanh(self.Z)

    def backward(self):
        flat_activation = self.aZ.reshape(-1)
        slopes = 1 - flat_activation ** 2
        self.daZ_dZ = np.diag(slopes)

**9. ReLU Activation**

In [11]:
class ReLUActivation:
    """
    ReLU activation layer.

    Documented input : a numpy array Z in R^(1xK)
    forward()  : aZ = max(Z, 0), element-wise.
    backward() : daZ_dZ = diagonal matrix with entry 1 where aZ_i > 0 and 0 elsewhere
                 (the derivative at exactly 0 is taken to be 0).
    """

    def __init__(self, Z):
        self.Z = Z
        # Kept so existing code reading `.Leak` keeps working. The forward pass is a plain
        # (non-leaky) ReLU, so this value is not used in the derivative.
        self.Leak = 0.01

    def __str__(self):
        return "An instance of ReLU activation"

    def forward(self):
        self.aZ = np.maximum(self.Z, 0)

    def backward(self):
        # aZ is never negative after max(Z, 0), so the test must be a strict `> 0`;
        # `>= 0` is always true and would turn the Jacobian into the identity.
        active = self.aZ.reshape(-1) > 0
        self.daZ_dZ = np.diag(active.astype(float))
    

## Question 2 & 3 

- Question 2 : Boston House Price Prediction
- Question 3 : MNIST Handwritten Digit Classification (note: the Question 3 cells below are run on the Iris dataset)

**Load Data and Train Test Split**

In [12]:
def load_data(dataset_name='boston',
              normalize_X=False,
              normalize_y=False,
              one_hot_encode_y=False,
              test_size=0.2,
              random_state=None):
    """
    Load one of the supported scikit-learn datasets and split it into train/test parts.

    Parameters
    ----------
    dataset_name : one of 'boston', 'iris', 'mnist'.
                   'mnist' loads the scikit-learn digits dataset and binarises the pixels
                   (1 where the value is >= 8, else 0).
    normalize_X : if truthy, pass X through sklearn's Normalizer.
    normalize_y : if truthy, pass y through sklearn's Normalizer.
    one_hot_encode_y : if truthy, one-hot encode y with OneHotEncoder (dense array returned).
    test_size : forwarded to train_test_split.
    random_state : forwarded to train_test_split; the default None keeps the split random.

    Returns
    -------
    X_train, y_train, X_test, y_test  (note the order: both train parts first).

    Raises
    ------
    ValueError : if dataset_name is not one of the supported names.
    """
    supported = ('boston', 'iris', 'mnist')
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if dataset_name not in supported:
        raise ValueError(
            "Unknown dataset_name %r; expected one of %s" % (dataset_name, ", ".join(supported))
        )

    if dataset_name == 'boston':
        data = load_boston()
    elif dataset_name == 'iris':
        data = load_iris()
    else:
        data = load_digits()

    X = data['data']
    if dataset_name == 'mnist':
        # Binarise into a new array rather than writing back into the loaded dataset object.
        X = 1 * (X >= 8)
    y = data['target'].reshape(-1, 1)

    if normalize_X:
        X = Normalizer().fit_transform(X)

    if normalize_y:
        # NOTE(review): Normalizer rescales each row (sample) to unit norm. With y shaped
        # (n, 1), every non-zero target would become +/-1 -- confirm this is intended.
        y = Normalizer().fit_transform(y)

    if one_hot_encode_y:
        y = OneHotEncoder().fit_transform(y).toarray()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, y_train, X_test, y_test

**Stochastic Gradient Descent ( SGD )**

In [13]:

def forward_pass(X_sample, Y_sample, W, B, activation='linear', loss='mean_squared'):
    """
    Run one forward pass: Z = XW, Z = Z + B, aZ = activation(Z), L = loss(Y, aZ).

    Parameters
    ----------
    X_sample, Y_sample : input and target arrays handed to the layers.
    W, B : weight matrix and bias row.
    activation : one of 'linear', 'softmax', 'sigmoid', 'tanh', 'relu'.
    loss : one of 'mean_squared', 'cross_entropy'.

    Returns
    -------
    (multiply_layer, bias_add_layer, activation_layer, loss_layer), each with forward() already run.

    Raises
    ------
    ValueError : if `activation` or `loss` is not a supported name.
    """
    # Validate the names before doing any work, so a typo gives a clear error
    # instead of an UnboundLocalError after the layers have been built.
    if activation not in ('linear', 'softmax', 'sigmoid', 'tanh', 'relu'):
        raise ValueError("Unknown activation %r" % (activation,))
    if loss not in ('mean_squared', 'cross_entropy'):
        raise ValueError("Unknown loss %r" % (loss,))

    multiply_layer = MultiplicationLayer(X_sample, W)
    multiply_layer.forward()

    bias_add_layer = BiasAdditionLayer(multiply_layer.Z, B)
    bias_add_layer.forward()

    activation_classes = {
        'linear': LinearActivation,
        'softmax': SoftMaxActivation,
        'sigmoid': SigmoidActivation,
        'tanh': tanhActivation,
        'relu': ReLUActivation,
    }
    activation_layer = activation_classes[activation](bias_add_layer.Z)
    activation_layer.forward()

    loss_classes = {
        'mean_squared': MeanSquaredLossLayer,
        'cross_entropy': CrossEntropyLossLayer,
    }
    loss_layer = loss_classes[loss](Y_sample, activation_layer.aZ)
    loss_layer.forward()

    return multiply_layer, bias_add_layer, activation_layer, loss_layer

def backward_pass(multiply_layer, bias_add_layer, activation_layer, loss_layer):
    """
    Call backward() on every layer, starting at the loss and ending at the multiplication layer.

    Returns the same layer objects as a tuple in that reverse order:
    (loss_layer, activation_layer, bias_add_layer, multiply_layer).
    """
    reverse_order = (loss_layer, activation_layer, bias_add_layer, multiply_layer)
    for layer in reverse_order:
        layer.backward()
    return reverse_order


def _report_classification_accuracy(split_name, X, y_one_hot, W, B, activation, loss):
    """Print how many rows of X have an arg-max prediction equal to the arg-max of y_one_hot."""
    y_true = np.argmax(y_one_hot, axis=1)
    _, _, _, loss_layer = forward_pass(X, y_one_hot, W, B, activation, loss)
    y_pred = np.argmax(loss_layer.aZ, axis=1)

    acc = 1 * (y_pred == y_true)
    print("Classification Accuracy ({0} Data ): {1}/{2} = {3} %".format(
        split_name, sum(acc), len(acc), sum(acc) * 100 / len(acc)))


def StochasticGradientDescent( X_train,
                               y_train,
                               X_test,
                               y_test,
                               inp_shape = 1,   # dimension of input
                               out_shape = 1,   # dimension of output
                               n_iterations = 10000,
                               learning_rate = 0.01,
                               activation = 'linear',
                               loss = 'mean_squared',
                               seed = 42,
                               task='regression'  #  one of  [ 'regression', 'classification' ]
                            ):
    """
    Train a single layer (weights W, bias B) with one-sample stochastic gradient descent,
    then print an evaluation on the train and test data.

    Parameters
    ----------
    X_train, y_train, X_test, y_test : 2-D arrays; rows are samples.
    inp_shape, out_shape : number of input features / output units; W has shape
                           (inp_shape, out_shape) and B has shape (1, out_shape).
    n_iterations : number of single-sample updates.
    learning_rate : step size for the W and B updates.
    activation, loss : names forwarded to forward_pass.
    seed : passed to np.random.seed (this seeds NumPy's global random state).
    task : 'regression' prints the mean squared loss (only when loss == 'mean_squared');
           'classification' prints arg-max accuracy (only when loss == 'cross_entropy').

    Returns
    -------
    None. The results are printed; the trained W and B are not returned.
    """
    np.random.seed(seed)

    # Initialise W first and B second so the random draws keep the same order.
    W = np.random.random((inp_shape, out_shape))
    B = np.random.random((1, out_shape))

    iterations = trange(n_iterations, desc="Training...", ncols=100)

    for iteration in iterations:
        # Pick one random training sample and shape it as a single row.
        random_index = np.random.randint(len(X_train))
        X_sample = X_train[random_index, :].reshape(1, inp_shape)
        Y_sample = y_train[random_index, :].reshape(1, out_shape)

        # Forward pass
        # 1) Z <-- XW
        # 2) Z <-- Z + Bias
        # 3) aZ <-- activation( Z )
        # 4) find Loss L
        multiply_layer, bias_add_layer, activation_layer, loss_layer = forward_pass(
            X_sample, Y_sample, W, B, activation, loss)

        # Note : aZ always denotes the output of an activation function applied on Z.

        # Backward pass
        # 1) dL/daZ
        # 2) dL/dZ = daZ/dZ * dL/daZ
        # 3) dL/dW = dZ/dW * dL/dZ
        # 4) dL/dB = dZ/dB * dL/dZ
        loss_layer, activation_layer, bias_add_layer, multiply_layer = backward_pass(
            multiply_layer, bias_add_layer, activation_layer, loss_layer)

        dL_daZ = loss_layer.dL_daZ
        # daZ_dZ is applied without a transpose; this assumes the activation Jacobian is
        # symmetric (true for diagonal Jacobians and for the softmax Jacobian).
        dL_dZ = np.dot(activation_layer.daZ_dZ, dL_daZ)
        dL_dW = np.dot(multiply_layer.dZ_dW, dL_dZ.T)
        dL_dB = np.dot(bias_add_layer.dZ_dB, dL_dZ).T

        # Update W & B
        W -= learning_rate * dL_dW
        B -= learning_rate * dL_dB

        if iteration % 1000 == 0:
            iterations.set_description("Sample Error : %0.5f" % loss_layer.L, refresh=True)

    # Evaluate on the full train and test data. The branches test the `loss` argument rather
    # than the loss_layer left over from the loop, which does not exist when n_iterations is 0.
    if task == 'regression' and loss == 'mean_squared':
        _, _, _, train_loss_layer = forward_pass(X_train, y_train, W, B, activation, loss)
        print("Mean Squared Loss Error (Train Data)  : %0.5f" % train_loss_layer.L)

        _, _, _, test_loss_layer = forward_pass(X_test, y_test, W, B, activation, loss)
        print("Mean Squared Loss error (Test Data) : %0.5f" % test_loss_layer.L)

    if task == 'classification' and loss == 'cross_entropy':
        _report_classification_accuracy("Training", X_train, y_train, W, B, activation, loss)
        _report_classification_accuracy("Testing", X_test, y_test, W, B, activation, loss)



---

### Question 2

In [14]:
# Boston housing data for the regression task: X passed through the normaliser, 20% held out for testing.
X_train, y_train, X_test, y_test = load_data(
    dataset_name='boston',
    normalize_X=True,
    test_size=0.2,
)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [15]:
# Fit with the default settings (linear activation, mean squared loss) and print train/test error.
StochasticGradientDescent(
    X_train, y_train, X_test, y_test,
    inp_shape=X_train.shape[1],
    out_shape=y_train.shape[1],
    task='regression',
)

Sample Error : 59.10494: 100%|█████████████████████████████| 10000/10000 [00:00<00:00, 21034.62it/s]

Mean Squared Loss Error (Train Data)  : 62.92856
Mean Squared Loss error (Test Data) : 49.46550





### Question 3

In [16]:
# Iris data for the classification task: X passed through the normaliser, targets one-hot encoded.
X_train, y_train, X_test, y_test = load_data(
    dataset_name='iris',
    normalize_X=True,
    one_hot_encode_y=True,
)

In [17]:
# Softmax activation with cross-entropy loss; prints train/test classification accuracy.
StochasticGradientDescent(
    X_train, y_train, X_test, y_test,
    inp_shape=X_train.shape[1],
    out_shape=y_train.shape[1],
    n_iterations=5000,
    learning_rate=0.001,
    activation='softmax',
    loss='cross_entropy',
    task='classification',
)

Sample Error : 0.89642: 100%|████████████████████████████████| 5000/5000 [00:00<00:00, 13051.19it/s]

Classification Accuracy (Training Data ): 79/120 = 65.83333333333333 %
Classification Accuracy (Testing Data ): 21/30 = 70.0 %



