<a href="https://colab.research.google.com/github/Andre6o6/mlcourse-2019/blob/master/Task3/nn_modules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

In [0]:
class Module(object):
    """
    Base class of every layer.

    Data flow of the two passes:

        forward :       input  -> [module] -> output
        backward:   gradInput  <- [module] <- gradOutput
                                     ^- the module is also given `input`

    The base class stores `output`, `gradInput` and the `training` flag;
    the actual computations are supplied by subclasses through
    `updateOutput`, `updateGradInput` and the parameter-related hooks.
    """

    def __init__(self):
        # Start in training mode with nothing computed yet.
        self.training = True
        self.gradInput = None
        self.output = None

    # ------------------------------------------------------------------
    # Entry points
    # ------------------------------------------------------------------
    def forward(self, input):
        """
        Run the forward pass: delegates to `updateOutput` and returns
        whatever it returns.
        """
        result = self.updateOutput(input)
        return result

    def backward(self, input, gradOutput):
        """
        Run one backpropagation step for the given `input`.

        Calls `updateGradInput` first (gradient w.r.t. the input, needed
        to continue backprop) and `accGradParameters` second (gradient
        w.r.t. the parameters, needed by the optimizer), then returns
        `self.gradInput`.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    # ------------------------------------------------------------------
    # Hooks for subclasses
    # ------------------------------------------------------------------
    def updateOutput(self, input):
        """
        Hook: compute the module output from `input` and the current
        parameters. Implementations are expected to store the result in
        `self.output` and return it.

        The base implementation does nothing and returns None. The
        simplest meaningful override would be:

            self.output = input
            return self.output
        """
        return None

    def updateGradInput(self, input, gradOutput):
        """
        Hook: compute the gradient w.r.t. the module's own input.
        Implementations are expected to store it in `self.gradInput`
        (same shape as `input`) and return it.

        The base implementation does nothing and returns None. The
        simplest meaningful override would be:

            self.gradInput = gradOutput
            return self.gradInput
        """
        return None

    def accGradParameters(self, input, gradOutput):
        """
        Hook: compute the gradient w.r.t. the module's own parameters.
        Modules without parameters (e.g. ReLU) can leave it as is.
        """
        return None

    def zeroGradParameters(self):
        """
        Hook: reset the parameter gradients to zero. No-op here.
        """
        return None

    def getParameters(self):
        """
        Return the list of parameters; a new empty list for a module
        that has none.
        """
        return []

    def getGradParameters(self):
        """
        Return the list of gradients w.r.t. the parameters; a new empty
        list for a module that has none.
        """
        return []

    # ------------------------------------------------------------------
    # Mode switches
    # ------------------------------------------------------------------
    def train(self):
        """
        Switch the module to training mode (`self.training = True`).
        The flag matters for layers such as Dropout and BatchNorm.
        """
        self.training = True

    def evaluate(self):
        """
        Switch the module to evaluation mode (`self.training = False`).
        The flag matters for layers such as Dropout and BatchNorm.
        """
        self.training = False

    def __repr__(self):
        """
        Human-readable name. Override it in subclasses to get a useful
        description when printing a network.
        """
        return "Module"

In [0]:
class Sequential(Module):
    """
    Container that feeds `input` through its modules one after another.

    The modules are kept in `self.modules` in the order they were added;
    the output of the last one becomes the container's `output`.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []

    def add(self, module):
        """
        Append `module` to the end of the chain.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        Forward pass:

            y_0    = modules[0].forward(input)
            y_1    = modules[1].forward(y_0)
            ...
            output = modules[n-1].forward(y_{n-2})

        With no modules the input itself is stored as the output.
        """
        activation = input
        for layer in self.modules:
            activation = layer.forward(activation)
        self.output = activation
        return self.output

    def backward(self, input, gradOutput):
        """
        Backward pass, walking the chain from the last module to the first:

            g_{n-1}   = modules[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2}   = modules[n-2].backward(y_{n-3}, g_{n-1})
            ...
            gradInput = modules[0].backward(input, g_1)

        Each module receives as its input the `output` attribute of the
        module before it, so a forward pass must have filled those in.
        """
        grad = gradOutput
        for idx in range(len(self.modules) - 1, -1, -1):
            # The first module was fed the container input; every other
            # module was fed the stored output of its predecessor.
            layer_input = self.modules[idx - 1].output if idx > 0 else input
            grad = self.modules[idx].backward(layer_input, grad)
        self.gradInput = grad
        return self.gradInput

    def zeroGradParameters(self):
        """
        Ask every contained module to zero its parameter gradients.
        """
        for layer in self.modules:
            layer.zeroGradParameters()

    def getParameters(self):
        """
        Return one entry per module: that module's own parameter list.
        """
        return [layer.getParameters() for layer in self.modules]

    def getGradParameters(self):
        """
        Return one entry per module: that module's own gradient list.
        """
        return [layer.getGradParameters() for layer in self.modules]

    def __repr__(self):
        # One line per module, each terminated by a newline.
        return "".join(str(layer) + '\n' for layer in self.modules)

    def __getitem__(self, x):
        """
        Index (or slice) into the list of modules.
        """
        return self.modules[x]

    def train(self):
        """
        Put the container and every contained module in training mode.
        """
        self.training = True
        for layer in self.modules:
            layer.train()

    def evaluate(self):
        """
        Put the container and every contained module in evaluation mode.
        """
        self.training = False
        for layer in self.modules:
            layer.evaluate()

In [0]:
class Linear(Module):
    """
    Fully-connected layer: output = input . W + b.

    W has shape (n_in, n_out) and b has shape (n_out,).
    The layer is meant for 2D input of shape (n_samples, n_in).
    """

    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # Both W and b are drawn uniformly from [-1/sqrt(n_in), 1/sqrt(n_in)].
        # W is drawn before b, which matters for seeded reproducibility.
        bound = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-bound, bound, size=(n_in, n_out))
        self.b = np.random.uniform(-bound, bound, size=n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """
        Compute and store input . W + b (the bias is broadcast over rows).
        """
        projected = np.matmul(input, self.W)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. the input: gradOutput . W^T.
        """
        weights_t = self.W.T
        self.gradInput = np.matmul(gradOutput, weights_t)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Gradients w.r.t. the parameters for the current batch.

        Note that the stored gradients are replaced by the new values,
        not added to.
        """
        self.gradb = np.sum(gradOutput, axis=0)
        self.gradW = np.matmul(input.T, gradOutput)

    def zeroGradParameters(self):
        """
        Zero both gradient arrays in place.
        """
        for grad in (self.gradW, self.gradb):
            grad.fill(0)

    def getParameters(self):
        """
        Return [W, b].
        """
        return [self.W, self.b]

    def getGradParameters(self):
        """
        Return [gradW, gradb].
        """
        return [self.gradW, self.gradb]

    def __repr__(self):
        n_in, n_out = self.W.shape
        return 'Linear %d -> %d' % (n_in, n_out)

In [0]:
class SoftMax(Module):
    """
    Row-wise softmax over 2D input (the code reduces over axis 1).
    """

    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Compute softmax of every row of `input` and store it in `self.output`.
        """
        # Subtracting the row maximum leaves the result unchanged but keeps
        # the exponentials from overflowing.
        shiftx = np.subtract(input, input.max(axis=1, keepdims=True))
        exps = np.exp(shiftx)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. the input, using the softmax stored by the last
        `updateOutput` call (so a forward pass must come first).

        For one row with softmax s and upstream gradient g, the Jacobian is
        J[j, k] = s_j * (delta_jk - s_k), hence

            (g . J)_k = s_k * (g_k - sum_j g_j * s_j)

        This closed form is evaluated for the whole batch at once instead of
        materialising an (n, n) Jacobian per sample in a Python loop.
        """
        # Per-row scalar sum_j g_j * s_j, kept as a column for broadcasting.
        weighted = np.sum(gradOutput * self.output, axis=1, keepdims=True)
        self.gradInput = self.output * (gradOutput - weighted)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

In [0]:
class BatchNormalization(Module):
    """
    Batch normalization without the learnable scale and shift.

    In training mode the input is normalised with the statistics of the
    current batch (computed over axis 0) and the running statistics are
    updated. In evaluation mode the running statistics are used instead.

    `alpha` is the weight given to the PREVIOUS running value in the update:

        moving = moving * alpha + batch * (1 - alpha)
    """
    # Added to the variance before the square root to avoid division by zero.
    EPS = 1e-3

    def __init__(self, alpha=0.1):
        super(BatchNormalization, self).__init__()
        self.alpha = alpha
        # Running statistics start as scalars and take the per-feature shape
        # after the first training-mode forward pass.
        self.moving_mean = 0
        self.moving_variance = 0

    def updateOutput(self, input):
        """
        Normalise `input` and store the result in `self.output`.
        """
        if self.training:
            batch_mean = np.mean(input, axis=0)
            batch_variance = np.var(input, axis=0)

            self.moving_mean = self.moving_mean * self.alpha + batch_mean * (1 - self.alpha)
            self.moving_variance = self.moving_variance * self.alpha + batch_variance * (1 - self.alpha)

            self.output = (input - batch_mean) / np.sqrt(batch_variance + self.EPS)
        else:
            self.output = (input - self.moving_mean) / np.sqrt(self.moving_variance + self.EPS)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. the input, matching the mode used by the forward pass.

        Evaluation mode: the running statistics are constants, so the layer
        is a per-feature affine map and the gradient is just a rescaling.

        Training mode: mean and variance depend on every sample of the batch,
        which adds the two correction terms below.
        """
        if not self.training:
            self.gradInput = gradOutput / np.sqrt(self.moving_variance + self.EPS)
            return self.gradInput

        mu = np.mean(input, axis=0)
        var = np.var(input, axis=0)
        centered = input - mu
        inv_std = 1.0 / np.sqrt(var + self.EPS)

        # Batch means of the upstream gradient and of its product with the
        # centered input (the original 1/B * sum(...) terms).
        grad_mean = np.mean(gradOutput, axis=0)
        grad_centered_mean = np.mean(gradOutput * centered, axis=0)

        self.gradInput = inv_std * (
            gradOutput
            - grad_mean
            - centered / (var + self.EPS) * grad_centered_mean
        )
        return self.gradInput

    def __repr__(self):
        return "BatchNormalization"

In [0]:
class ChannelwiseScaling(Module):
    """
    Implements the linear transform y = gamma * x + beta,
    where gamma and beta are learnable vectors of length `n_out`
    that are broadcast against the input.
    """
    # The docstring deliberately avoids backslashes: in a non-raw string
    # "\g" is an invalid escape and "\b" is a backspace character.

    def __init__(self, n_out):
        super(ChannelwiseScaling, self).__init__()

        # Both vectors are drawn uniformly from [-1/sqrt(n_out), 1/sqrt(n_out)],
        # gamma first.
        stdv = 1./np.sqrt(n_out)
        self.gamma = np.random.uniform(-stdv, stdv, size=n_out)
        self.beta = np.random.uniform(-stdv, stdv, size=n_out)

        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        """
        Compute and store input * gamma + beta.
        """
        self.output = input * self.gamma + self.beta
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. the input: gradOutput * gamma.
        """
        self.gradInput = gradOutput * self.gamma
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Gradients w.r.t. beta and gamma, summed over axis 0.
        The stored gradients are replaced by the new values.
        """
        self.gradBeta = np.sum(gradOutput, axis=0)
        self.gradGamma = np.sum(gradOutput*input, axis=0)

    def zeroGradParameters(self):
        """
        Zero both gradient arrays in place.
        """
        self.gradGamma.fill(0)
        self.gradBeta.fill(0)

    def getParameters(self):
        """
        Return [gamma, beta].
        """
        return [self.gamma, self.beta]

    def getGradParameters(self):
        """
        Return [gradGamma, gradBeta].
        """
        return [self.gradGamma, self.gradBeta]

    def __repr__(self):
        return "ChannelwiseScaling"

In [0]:
class Dropout(Module):
    """
    Inverted dropout.

    In training mode every element of the input is zeroed independently
    with probability `p`, and the surviving elements are divided by
    (1 - p) so the expected value of the output equals the input.
    In evaluation mode (or when p <= 0) the layer is the identity.
    """

    def __init__(self, p=0.5):
        super(Dropout, self).__init__()

        self.p = p
        # Scaled keep-mask of the last training-mode forward pass
        # (0 for dropped elements, 1 / (1 - p) for kept ones); None when
        # the last forward pass was an identity.
        self.mask = None

    def updateOutput(self, input):
        """
        Apply dropout to `input` and store the result in `self.output`.

        When the layer acts as the identity, `input` itself (not a copy)
        is stored and returned.
        """
        if self.training and self.p > 0:
            keep_prob = 1.0 - self.p
            shape = np.shape(input)
            if keep_prob > 0:
                kept = np.random.binomial(1, keep_prob, size=shape)
                self.mask = kept / keep_prob
            else:
                # p >= 1: everything is dropped; avoid dividing by zero.
                self.mask = np.zeros(shape)
            self.output = input * self.mask
        else:
            self.mask = None
            self.output = input
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. the input: the upstream gradient passed through the
        mask drawn by the last forward pass, or unchanged when that pass
        was an identity.
        """
        if self.mask is None:
            self.gradInput = gradOutput
        else:
            self.gradInput = gradOutput * self.mask
        return self.gradInput

    def __repr__(self):
        return "Dropout"

In [0]:
class ReLU(Module):
    """
    Rectified linear unit: elementwise max(input, 0).
    """

    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """
        Replace negative entries by zero and store the result.
        """
        rectified = np.maximum(input, 0)
        self.output = rectified
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Multiply the upstream gradient by the indicator of strictly
        positive inputs.
        """
        positive = input > 0
        self.gradInput = gradOutput * positive
        return self.gradInput

    def __repr__(self):
        return "ReLU"

In [0]:
class Criterion(object):
    """
    Base class of every loss function.

    Subclasses put their code into `updateOutput` (loss value) and
    `updateGradInput` (gradient of the loss w.r.t. the input);
    `forward` and `backward` only delegate to those two hooks.
    """

    def __init__(self):
        self.gradInput = None
        self.output = None

    def forward(self, input, target):
        """
        Compute the loss for `input` and `target`.

        Not meant to be overridden: it simply returns the result of
        `updateOutput`.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
        Compute the gradient of the loss w.r.t. `input`.

        Not meant to be overridden: it simply returns the result of
        `updateGradInput`.
        """
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """
        Hook to override. The base version returns the stored `output`
        unchanged.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Hook to override. The base version returns the stored `gradInput`
        unchanged.
        """
        return self.gradInput

    def __repr__(self):
        """
        Human-readable name. Override it in subclasses to get a useful
        description.
        """
        return "Criterion"

In [0]:
class MSECriterion(Criterion):
    """
    Squared-error loss: the sum of all squared differences divided by
    the size of the first axis of the input.
    """

    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """
        Store and return sum((input - target)^2) / input.shape[0].
        """
        n_rows = input.shape[0]
        residual = input - target
        self.output = np.sum(np.power(residual, 2)) / n_rows
        return self.output

    def updateGradInput(self, input, target):
        """
        Store and return 2 * (input - target) / input.shape[0].
        """
        n_rows = input.shape[0]
        residual = input - target
        self.gradInput = residual * 2 / n_rows
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

In [0]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax over 2D input (the code reduces over axis 1).
    """

    def __init__(self):
        super(LogSoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Store and return log(softmax(input)) computed row by row.
        """
        # Shift every row by its maximum so the exponentials cannot overflow.
        row_max = input.max(axis=1, keepdims=True)
        shifted = np.subtract(input, row_max)
        log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.output = shifted - log_norm
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Store and return gradOutput - softmax(input) * rowsum(gradOutput).

        The softmax is recomputed from `input` rather than taken from the
        stored output.
        """
        row_max = input.max(axis=1, keepdims=True)
        shifted = np.subtract(input, row_max)
        exp_shifted = np.exp(shifted)
        probs = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
        grad_total = np.sum(gradOutput, axis=1, keepdims=True)
        self.gradInput = gradOutput - probs * grad_total
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"

In [0]:
class NLLCriterion(Criterion):
    """
    Negative log-likelihood loss for integer class labels.

    For every row i the loss is -input[i, target[i]]; the rows are averaged
    over the batch. `input` is presumably a matrix of log-probabilities
    (e.g. the output of a log-softmax layer) -- the code itself only
    indexes into it.
    """

    def __init__(self):
        super(NLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """
        Store and return the batch mean of -input[i, target[i]].
        """
        B = input.shape[0]
        loss = - input[range(B), target]
        self.output = np.sum(loss) / B
        return self.output

    def updateGradInput(self, input, target):
        """
        Store and return the gradient: -1/B at position (i, target[i]) of
        every row, zero elsewhere.
        """
        B = input.shape[0]
        self.gradInput = np.zeros_like(input)
        # Float literal keeps this a true division on every Python version.
        self.gradInput[range(B), target] = -1.0 / B
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"

In [0]:
class CrossEntropyLoss(Criterion):
    """
    Log-softmax followed by negative log-likelihood, fused into one
    criterion. Targets are class labels, not one-hot vectors.
    """

    def __init__(self):
        super(CrossEntropyLoss, self).__init__()

    def updateOutput(self, input, target):
        """
        Store and return the batch mean of
        log(sum_j exp(input[i, j])) - input[i, target[i]].
        """
        n_rows = input.shape[0]
        # Shift every row by its maximum so the exponentials cannot overflow.
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))
        picked = shifted[range(n_rows), target]
        log_norm = np.log(np.sum(np.exp(shifted), axis=1))
        per_sample = log_norm - picked
        self.output = np.sum(per_sample) / n_rows
        return self.output

    def updateGradInput(self, input, target):
        """
        Store and return (softmax(input) - onehot(target)) / batch size.
        """
        n_rows = input.shape[0]
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))
        exp_shifted = np.exp(shifted)
        probs = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
        # Subtracting 1 at the target class is the one-hot subtraction.
        probs[range(n_rows), target] -= 1

        self.gradInput = probs / n_rows
        return self.gradInput

    def __repr__(self):
        return "CrossEntropyLoss"